In [7]:
# Print the installed scikit-learn version (the recorded output of this run shows 1.4.1.post1).
import sklearn
print(sklearn.__version__)


1.4.1.post1


In [7]:
!pip install --upgrade requests pandas scikit-learn numpy joblib dill

Collecting pandas
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl

In [10]:
import requests
from io import StringIO
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.pipeline import Pipeline
import numpy as np
import dill
from joblib import dump


# Download Prepro01 (the already preprocessed frame, not the raw one).
github_link = "https://raw.githubusercontent.com/Martinerramuspe/02-DATA_SCIENCIE/main/01-TELECOM_CHURN_PROJECT/02-PREPROCESAMIENTO/Prepro01.csv"
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(github_link, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data, encoding='utf-8')

# Feature set chosen during the model-selection stage (that selection is not
# part of Prepro01 itself).
X = df[['International plan', 'Voice mail plan', 'Number vmail messages',
        'Total day minutes', 'Total day charge', 'Total eve minutes',
        'Total eve charge', 'Total night minutes', 'Total night charge',
        'Total intl minutes', 'Total intl calls']]
y = df.Churn

# Hold out 20% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well so that re-running the notebook yields the same model.
forest_model = RandomForestClassifier(random_state=42)

# Fit the model on the training split.
forest_model.fit(X_train, y_train)

# Función que transforma el tipo de datos de la columna "Churn"
def transf_entero(df):
    """Return a copy of ``df`` with the ``'Churn'`` column cast to ``int``.

    The input frame is never modified, so the caller's data stays intact
    when this function runs as a pipeline step. If ``df`` has no
    ``'Churn'`` column (for example unlabeled data at prediction time) the
    copy is returned unchanged instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a ``'Churn'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``'Churn'`` (when present) has integer dtype.
    """
    result = df.copy()
    if 'Churn' in result.columns:
        result['Churn'] = result['Churn'].astype(int)
    return result

# Función que transforma 'International plan' a int.
def mapear_international_plan(df):
    """Return a copy of ``df`` with ``'International plan'`` encoded as 1/0.

    ``'Yes'`` becomes 1 and ``'No'`` becomes 0; any other non-numeric value
    becomes NaN, as ``Series.map`` does for unmapped keys. The input frame
    is not modified. A column that is already numeric is left as is, so
    applying the function twice no longer turns every value into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``'International plan'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded column.

    Raises
    ------
    KeyError
        If ``'International plan'`` is not a column of ``df``.
    """
    result = df.copy()
    plan = result['International plan']
    # Only map raw Yes/No labels; already-encoded numeric data passes through.
    if not pd.api.types.is_numeric_dtype(plan):
        result['International plan'] = plan.map({'Yes': 1, 'No': 0})
    return result

# Función que transforma 'Voice mail plan' a int.
def mapear_voice_mail_plan(df):
    """Return a copy of ``df`` with ``'Voice mail plan'`` encoded as 1/0.

    ``'Yes'`` becomes 1 and ``'No'`` becomes 0; any other non-numeric value
    becomes NaN, as ``Series.map`` does for unmapped keys. The input frame
    is not modified. A column that is already numeric is left as is, so
    applying the function twice no longer turns every value into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Voice mail plan'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded column.

    Raises
    ------
    KeyError
        If ``'Voice mail plan'`` is not a column of ``df``.
    """
    result = df.copy()
    plan = result['Voice mail plan']
    # Only map raw Yes/No labels; already-encoded numeric data passes through.
    if not pd.api.types.is_numeric_dtype(plan):
        result['Voice mail plan'] = plan.map({'Yes': 1, 'No': 0})
    return result

# Función para eliminar la columna 'State'.
def eliminar_state(df):
    """Return a new frame equal to ``df`` without its ``'State'`` column.

    The input frame is not modified. ``KeyError`` is raised when the
    column is absent.
    """
    without_state = df.drop(columns=['State'])
    return without_state

# Caps outliers at the IQR fences (values are limited; no rows are removed).
def eliminar_outliers(df, factor=1.5, bounds=None):
    """Cap extreme values of the numeric columns at their IQR fences.

    Despite the name, no rows are removed: values below
    ``q1 - factor * iqr`` are raised to that fence and values above
    ``q3 + factor * iqr`` are lowered to it. The input frame is not
    modified and every processed column comes back as ``float64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all the columns listed in ``columns_to_check``.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.
    bounds : mapping of column name -> (lower, upper), optional
        Pre-computed fences. A column found here uses these limits instead
        of quartiles recomputed from ``df``, so fences derived from one
        dataset can be reused on another. Columns not listed fall back to
        the quartiles of ``df``.

    Returns
    -------
    pandas.DataFrame
        A capped copy of ``df``.

    Raises
    ------
    KeyError
        If any expected column is missing from ``df``.
    """
    columns_to_check = ['Account length', 'Area code',
                         'Number vmail messages', 'Total day minutes', 'Total day calls',
                         'Total day charge', 'Total eve minutes', 'Total eve calls',
                         'Total eve charge', 'Total night minutes', 'Total night calls',
                         'Total night charge', 'Total intl minutes', 'Total intl calls',
                         'Total intl charge', 'Customer service calls']

    limits = {} if bounds is None else bounds
    df_copy = df.copy()

    for column in columns_to_check:
        if column in limits:
            lower_bound, upper_bound = limits[column]
        else:
            q1 = df_copy[column].quantile(0.25)
            q3 = df_copy[column].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - factor * iqr
            upper_bound = q3 + factor * iqr

        # Replace extreme values with the fence they crossed. The explicit
        # float cast keeps the float64 result the np.where-based version
        # always produced, even when nothing had to be capped.
        df_copy[column] = df_copy[column].astype('float64').clip(lower=lower_bound, upper=upper_bound)

    return df_copy

# Función para normalizar variables
def norma_variables(df, scaler=None):
    """Min-max scale the numeric columns of ``df`` and return a new frame.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all the columns listed in ``columns_to_normalize``.
    scaler : fitted scaler with a ``transform`` method, optional
        When given, it is applied with ``transform`` only, so the ranges it
        was fitted with are reused. When omitted (the default, and the
        original behavior) a new ``MinMaxScaler`` is fitted on ``df``
        itself, which means the result depends on the rows passed in: a
        single-row frame, for instance, has zero range in every column and
        scales to all zeros.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns scaled.

    Raises
    ------
    KeyError
        If any expected column is missing from ``df``.
    """
    columns_to_normalize = ['Account length', 'Area code',
                             'Number vmail messages', 'Total day minutes', 'Total day calls',
                             'Total day charge', 'Total eve minutes', 'Total eve calls',
                             'Total eve charge', 'Total night minutes', 'Total night calls',
                             'Total night charge', 'Total intl minutes', 'Total intl calls',
                             'Total intl charge', 'Customer service calls']

    df_copy = df.copy()
    if scaler is None:
        # No fitted scaler supplied: fit on the data at hand.
        scaled = MinMaxScaler().fit_transform(df_copy[columns_to_normalize])
    else:
        # Reuse the statistics of the supplied, already-fitted scaler.
        scaled = scaler.transform(df_copy[columns_to_normalize])
    df_copy[columns_to_normalize] = scaled
    return df_copy

# Función para filtrar columnas
def filtrar_columnas(df):
    """Return only the feature columns, in the fixed order listed below.

    Any other column of ``df`` is discarded. ``KeyError`` is raised when a
    listed column is missing.
    """
    selected = [
        'International plan',
        'Voice mail plan',
        'Number vmail messages',
        'Total day minutes',
        'Total day charge',
        'Total eve minutes',
        'Total eve charge',
        'Total night minutes',
        'Total night charge',
        'Total intl minutes',
        'Total intl calls',
    ]
    return df.loc[:, selected]

# Wrap each preprocessing function in a FunctionTransformer so it can be used
# as a pipeline step.
(Trans_entero, Mapeo_01, Mapeo_02, Drop_state,
 Outliers_drop, Normalizacion, Best_caracter) = (
    FunctionTransformer(func)
    for func in (
        transf_entero,
        mapear_international_plan,
        mapear_voice_mail_plan,
        eliminar_state,
        eliminar_outliers,
        norma_variables,
        filtrar_columnas,
    )
)

# Preprocessing pipeline: step names paired with their transformers, in order.
_step_names = ("Trans_entero", "Mapeo_01", "Mapeo_02", "Drop_state",
               "Outliers_drop", "Normalizacion", "Best_features")
_step_transformers = (Trans_entero, Mapeo_01, Mapeo_02, Drop_state,
                      Outliers_drop, Normalizacion, Best_caracter)
Prepro01 = Pipeline(steps=list(zip(_step_names, _step_transformers)))

# Full pipeline: the preprocessing above followed by the forest fitted earlier
# in the script.
MODELO_PREPRO = Pipeline(steps=[
    ("PREPROCESAMIENTO", Prepro01),
    ("RANDOM_FOREST", forest_model),
])

# Persist the preprocessing pipeline to a .joblib file.
# NOTE(review): the wrapped functions are defined in this notebook; presumably
# they must also be defined wherever Prepro01.joblib is loaded — confirm.
dump(Prepro01, 'Prepro01.joblib')

# Persist the fitted RandomForestClassifier to a .joblib file.
dump(forest_model, 'RandomForestClassifier.joblib')


['RandomForestClassifier.joblib']

In [3]:
# Write the output of `pip freeze` (installed packages with their versions) to requirements.txt.
!pip freeze > requirements.txt


In [5]:
import requests
from io import StringIO
import pandas as pd
# Download the raw CSV from Google Drive by file id.
file_id = "16ypxCIBr9wSGVEaXqWdZUfz9w4xzccwo"
download_link = f"https://drive.google.com/uc?id={file_id}"
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(download_link, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
csv_data = StringIO(response.text)
df_virgen = pd.read_csv(csv_data, encoding='utf-8')

In [6]:
# Show the columns, non-null counts and dtypes of the raw frame.
df_virgen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [None]:
# Predict on a copy: the pipeline's function steps assign columns on the frame
# they receive, so passing df_virgen itself would alter the raw data and make a
# second run of this cell map the already-encoded plan columns to NaN.
MODELO_PREPRO.predict(df_virgen.copy())