# CONSTRUCCIÓN DE PIPELINES
Procederemos a la construcción de un pipeline para el preprocesamiento y al empaquetado del modelo definido y entrenado.
Finalmente, luego de construirlos, los guardaremos en archivos tipo joblib para su manipulación.


## 01- BIBLIOTECAS

In [None]:
pip install pandas numpy scikit-learn requests



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from io import StringIO
import requests
from sklearn.pipeline import Pipeline
from joblib import dump

In [None]:
pip install --upgrade pandas numpy scikit-learn requests

Collecting pandas
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, numpy, pandas, sci

## 02- CREAMOS PAQUETE PREPROCESAMIENTO
Este paquete de preprocesamiento contempla todos los conocimientos adquiridos en los notebooks anteriores.

In [None]:
# Column filter: keep only the model's input features.
def filtrar_columnas(df):
    """Return ``df`` restricted to the eleven feature columns, in a fixed order.

    The target column is deliberately absent from the list: this function is
    the first step of the preprocessing applied to model inputs.

    Indexing with a column list makes pandas raise ``KeyError`` when any of
    the listed columns is missing from ``df``.
    """
    seleccion = (
        'International plan',
        'Voice mail plan',
        'Number vmail messages',
        'Total day minutes',
        'Total day charge',
        'Total eve minutes',
        'Total eve charge',
        'Total night minutes',
        'Total night charge',
        'Total intl minutes',
        'Total intl calls',
    )
    return df[list(seleccion)]

# Encode 'International plan' as an integer flag.
def mapear_international_plan(df):
    """Return a copy of ``df`` with 'International plan' mapped 'Yes' -> 1, 'No' -> 0.

    Values other than 'Yes' / 'No' have no entry in the mapping and therefore
    come out as NaN. The input frame is left untouched.
    """
    yes_no_a_entero = {'Yes': 1, 'No': 0}
    columna_codificada = df['International plan'].map(yes_no_a_entero)
    # assign() works on a copy and replaces the existing column in place.
    return df.assign(**{'International plan': columna_codificada})

# Encode 'Voice mail plan' as an integer flag.
def mapear_voice_mail_plan(df):
    """Return a copy of ``df`` with 'Voice mail plan' mapped 'Yes' -> 1, 'No' -> 0.

    Values other than 'Yes' / 'No' have no entry in the mapping and therefore
    come out as NaN. The input frame is left untouched.
    """
    yes_no_a_entero = {'Yes': 1, 'No': 0}
    columna_codificada = df['Voice mail plan'].map(yes_no_a_entero)
    # assign() works on a copy and replaces the existing column in place.
    return df.assign(**{'Voice mail plan': columna_codificada})

# Outlier removal using the 1.5 * IQR rule.
def eliminar_outliers(df):
    """Return a copy of ``df`` without the rows flagged as outliers.

    The numeric columns listed below are processed one after another. For each
    one, the quartiles are computed on the rows that survived the previous
    columns, and rows whose value lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed. Rows holding NaN in the
    inspected column are kept, because both comparisons evaluate to False.

    The input frame is not modified. ``KeyError`` is raised by pandas if one
    of the listed columns is missing.
    """
    columns_to_check = ['Number vmail messages', 'Total day minutes',
                        'Total day charge', 'Total eve minutes',
                        'Total eve charge', 'Total night minutes',
                        'Total night charge', 'Total intl minutes',
                        'Total intl calls']

    filtered = df.copy()

    for column in columns_to_check:
        values = filtered[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)

        # Filter positionally with a boolean mask. Dropping by index label
        # would also discard non-outlier rows that share a label with an
        # outlier when the index contains duplicates.
        filtered = filtered[~is_outlier.to_numpy()]

    return filtered





# Wrap each preprocessing function as a scikit-learn transformer.
Best_caracter = FunctionTransformer(func=filtrar_columnas)
Mapeo_01 = FunctionTransformer(func=mapear_international_plan)
Mapeo_02 = FunctionTransformer(func=mapear_voice_mail_plan)
Outliers_drop = FunctionTransformer(func=eliminar_outliers)


# Preprocessing pipeline: column selection, the two Yes/No encodings and,
# last, outlier removal. The steps run in the order listed.
Prepro01 = Pipeline([
    ("Best_features", Best_caracter),
    ("Mapeo_01", Mapeo_01),
    ("Mapeo_02", Mapeo_02),
    ("Outliers_drop", Outliers_drop),
])
# Trailing expression so the notebook displays the pipeline.
Prepro01

## 03- ENTRENAMOS Y EMPAQUETAMOS MODELO

In [None]:
# NOTE: filtrar_columnas, mapear_international_plan, mapear_voice_mail_plan and
# eliminar_outliers are reused from the preprocessing section above instead of
# being redefined here. Prepro01 holds references to those function objects,
# and pickle stores plain functions by qualified name after checking that the
# name still points to the very same object. Rebinding the names to new
# function objects would therefore make dumping Prepro01 fail.


# Column filter used for training only: the model features plus the target.
def filtrar_columnas_con_target(df):
    """Return ``df`` restricted to the feature columns plus ``'Churn'``.

    Unlike ``filtrar_columnas``, the target column IS kept here, because the
    training data needs it. pandas raises ``KeyError`` if a column is missing.
    """
    columnas_a_mantener = [
        'International plan', 'Voice mail plan', 'Number vmail messages',
        'Total day minutes', 'Total day charge', 'Total eve minutes',
        'Total eve charge', 'Total night minutes', 'Total night charge',
        'Total intl minutes', 'Total intl calls', 'Churn',
    ]
    return df[columnas_a_mantener]


# Target encoding.
def mapear_target(df):
    """Return a copy of ``df`` with ``'Churn'`` cast to ``int``.

    For boolean input this yields 0 for False and 1 for True.
    The input frame is not modified.
    """
    df_copy = df.copy()
    df_copy['Churn'] = df_copy['Churn'].astype(int)
    return df_copy


# Raw dataframe, downloaded from Google Drive.
file_id = "16ypxCIBr9wSGVEaXqWdZUfz9w4xzccwo"
download_link = f"https://drive.google.com/uc?id={file_id}"
response = requests.get(download_link, timeout=30)
# Fail early on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data, encoding='utf-8')

# Apply the same preprocessing steps as Prepro01, keeping and encoding the target.
df1 = filtrar_columnas_con_target(df)
df1 = mapear_international_plan(df1)
df1 = mapear_voice_mail_plan(df1)
df1 = eliminar_outliers(df1)
df1 = mapear_target(df1)


# Features as decided during the model-selection stage (not part of Prepro01).
X = df1[['International plan', 'Voice mail plan', 'Number vmail messages',
        'Total day minutes', 'Total day charge', 'Total eve minutes',
        'Total eve charge', 'Total night minutes', 'Total night charge',
        'Total intl minutes', 'Total intl calls']]
y = df1.Churn

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train the random forest model. The seed matches the split above so that
# re-running the notebook reproduces the same model.
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)

## 04-TESTEO DE PAQUETE DE PREPROCESAMIENTO Y MODELADO

In [None]:
# Download the raw dataframe again to test the packaged steps.
file_id = "16ypxCIBr9wSGVEaXqWdZUfz9w4xzccwo"
download_link = f"https://drive.google.com/uc?id={file_id}"
response = requests.get(download_link, timeout=30)
# Fail early on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data, encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [None]:
# Feed the raw dataframe through the preprocessing pipeline.
# The pipeline is fitted first. Its steps are FunctionTransformers wrapping
# plain functions, so fitting learns nothing from the data, but it marks the
# pipeline as fitted, which newer scikit-learn releases expect before
# transform is called.
df1 = Prepro01.fit(df).transform(df)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2501 entries, 0 to 2665
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   International plan     2501 non-null   int64  
 1   Voice mail plan        2501 non-null   int64  
 2   Number vmail messages  2501 non-null   int64  
 3   Total day minutes      2501 non-null   float64
 4   Total day charge       2501 non-null   float64
 5   Total eve minutes      2501 non-null   float64
 6   Total eve charge       2501 non-null   float64
 7   Total night minutes    2501 non-null   float64
 8   Total night charge     2501 non-null   float64
 9   Total intl minutes     2501 non-null   float64
 10  Total intl calls       2501 non-null   int64  
dtypes: float64(7), int64(4)
memory usage: 234.5 KB


In [None]:
# Preview the first rows of the preprocessed dataframe.
df1.head()

Unnamed: 0,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day charge,Total eve minutes,Total eve charge,Total night minutes,Total night charge,Total intl minutes,Total intl calls
0,0,1,25,265.1,45.07,197.4,16.78,244.7,11.01,10.0,3
1,0,1,26,161.6,27.47,195.5,16.62,254.4,11.45,13.7,3
2,0,0,0,243.4,41.38,121.2,10.3,162.6,7.32,12.2,5
3,1,0,0,299.4,50.9,61.9,5.26,196.9,8.86,6.6,7
4,1,0,0,166.7,28.34,148.3,12.61,186.9,8.41,10.1,3


In [None]:
# Feed the preprocessed dataframe to the trained model to get its predictions.
dfr=forest_model.predict(df1)
# Trailing expression so the notebook displays the predicted labels.
dfr

array([0, 0, 0, ..., 0, 0, 0])

## DESCARGA DE PAQUETES

In [None]:
# Save the preprocessing pipeline to a .joblib file.
# NOTE(review): the pipeline steps wrap plain Python functions, which pickle
# stores by qualified name rather than by value. Presumably whoever loads this
# file must have the same functions defined under the same module path; confirm.
dump(Prepro01, 'Prepro01.joblib')

['Prepro01.joblib']

In [None]:
# Save the trained RandomForestClassifier model to a .joblib file.
dump(forest_model, 'RandomForestClassifier.joblib')

['RandomForestClassifier.joblib']

In [None]:
# Write the packages installed in the current environment, with their versions, to requirements.txt.
!pip freeze > requirements.txt

# CONCLUSION
Las funciones para el preprocesamiento se van organizando a medida que se avanza con el proyecto. En otras palabras, no hay un orden definitivo desde el comienzo.