# CREACIÓN DEL PIPELINE
Gracias a las etapas anteriores, pudimos determinar la serie de características necesarias para llegar al modelo con la mejor precisión:

- Paquete preprocesamiento: **Prepro01**


- Modelo: **Random Forest**


- Caracteristicas: **'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day charge', 'Total eve minutes', 'Total eve charge', 'Total night minutes', 'Total night charge', 'Total intl minutes', 'Total intl calls'**


- Hiperparametros: **'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False**

En esta etapa se procedió a la creación del empaquetado del modelo (pipeline).


## 00-BIBLIOTECAS

In [27]:
!pip install dill
!pip install --upgrade scikit-learn



In [28]:
import requests
from io import StringIO
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score as cv
from sklearn.model_selection import StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.preprocessing import FunctionTransformer
import sklearn
from joblib import dump
from joblib import load
import pickle
import joblib
import tempfile
import dill
from io import BytesIO



In [29]:
# Print the scikit-learn version that is currently installed.
import sklearn
print(sklearn.__version__)


1.4.1.post1


In [30]:
# Upgrade joblib.
# NOTE(review): joblib was already imported in an earlier cell; presumably the
# kernel must be restarted before an upgraded version is picked up — confirm.
!pip install --upgrade joblib



## 01-CONSTRUCCION MODELO PARA EMPAQUETAR.
Se toma el set de Prepro01, y se le añade el filtrado de las mejores características, obtenidas en 03-SELECCION MODELO.

In [31]:
# Load Prepro01 (the already preprocessed dataset, not the raw one).
github_link = "https://raw.githubusercontent.com/Martinerramuspe/02-DATA_SCIENCIE/main/01-TELECOM_CHURN_PROJECT/02-PREPROCESAMIENTO/Prepro01.csv"
# Bound the wait and stop on HTTP errors so that an error page is never
# silently parsed as if it were the CSV.
response = requests.get(github_link, timeout=30)
response.raise_for_status()
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data, encoding='utf-8')

In [32]:
# Pick the features chosen during the model-selection stage (that filtering is
# not part of Prepro01) and take 'Churn' as the target.
X = df[[
    'International plan',
    'Voice mail plan',
    'Number vmail messages',
    'Total day minutes',
    'Total day charge',
    'Total eve minutes',
    'Total eve charge',
    'Total night minutes',
    'Total night charge',
    'Total intl minutes',
    'Total intl calls',
]]
y = df['Churn']

In [33]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Instantiate the random forest; no arguments are passed, so it uses the
# library defaults.
forest_model = RandomForestClassifier()

In [35]:
# Inspect the hyperparameters the model was created with.
forest_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Fit the model on the training split.
forest_model.fit(X_train, y_train)

## 02-CREACION DE FUNCIONES.

In [37]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Cast the 'Churn' target column to int when it is present.
def transf_entero(df):
    """Return a copy of ``df`` with the 'Churn' column cast to ``int``.

    The input frame is never modified. When 'Churn' is absent (for example,
    unlabeled rows passed in only for prediction) the copy is returned as is
    instead of raising ``KeyError``.

    Args:
        df: pandas DataFrame, with or without a 'Churn' column.

    Returns:
        A new DataFrame; 'Churn' (if present) has an integer dtype.
    """
    df = df.copy()
    if 'Churn' in df.columns:
        df['Churn'] = df['Churn'].astype(int)
    return df

# Encode 'International plan' as 1 ('Yes') / 0 ('No').
def mapear_international_plan(df):
    """Return a new frame with 'International plan' mapped to 1/0.

    'Yes' becomes 1 and 'No' becomes 0; any other value is left without a
    mapping and therefore ends up as NaN. The input frame is not modified.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    encoded = df['International plan'].map(yes_no_codes)
    return df.assign(**{'International plan': encoded})

# Encode 'Voice mail plan' as 1 ('Yes') / 0 ('No').
def mapear_Voice_mail_plan(df):
    """Return a new frame with 'Voice mail plan' mapped to 1/0.

    'Yes' becomes 1 and 'No' becomes 0; any other value is left without a
    mapping and therefore ends up as NaN. The input frame is not modified.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    encoded = df['Voice mail plan'].map(yes_no_codes)
    return df.assign(**{'Voice mail plan': encoded})

# Remove the 'State' column.
def eliminar_state(df):
    """Return a new frame without the 'State' column (input is untouched)."""
    return df.drop(columns=['State'])

def eliminar_outliers(df):
    """Cap extreme values of the numeric columns at the 1.5 * IQR fences.

    Despite the name, no rows are removed: for every column listed below,
    values under ``Q1 - 1.5 * IQR`` are replaced by that lower fence and values
    over ``Q3 + 1.5 * IQR`` by that upper fence. The quartiles are computed
    from the frame passed in.

    The work is done on a copy, so the caller's frame is left untouched.

    Args:
        df: pandas DataFrame containing every column in ``columns_to_check``.

    Returns:
        A new DataFrame with the listed columns capped.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    columns_to_check = ['Account length', 'Area code',
                         'Number vmail messages', 'Total day minutes', 'Total day calls',
                         'Total day charge', 'Total eve minutes', 'Total eve calls',
                         'Total eve charge', 'Total night minutes', 'Total night calls',
                         'Total night charge', 'Total intl minutes', 'Total intl calls',
                         'Total intl charge', 'Customer service calls']

    # Work on a copy so the caller's data is not modified as a side effect.
    df = df.copy()

    for column in columns_to_check:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Replace out-of-range values with the nearest fence.
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

    return df

# Min-max normalization of the numeric columns.
def norma_variables(df):
    """Return a copy of ``df`` with the numeric columns min-max scaled.

    A fresh ``MinMaxScaler`` is fitted on the frame passed in every time the
    function is called, so the scaling depends on the rows of that call.

    The work is done on a copy, so the caller's frame is left untouched.

    Args:
        df: pandas DataFrame containing every column in
            ``columns_to_normalize``.

    Returns:
        A new DataFrame with the listed columns replaced by their scaled
        values.
    """
    columns_to_normalize = ['Account length', 'Area code',
                             'Number vmail messages', 'Total day minutes', 'Total day calls',
                             'Total day charge', 'Total eve minutes', 'Total eve calls',
                             'Total eve charge', 'Total night minutes', 'Total night calls',
                             'Total night charge', 'Total intl minutes', 'Total intl calls',
                             'Total intl charge', 'Customer service calls']

    # Work on a copy so the caller's data is not modified as a side effect.
    df = df.copy()

    scaler = MinMaxScaler()
    df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
    return df

# Keep only the selected feature columns.
def filtrar_columnas(df):
    """Return ``df`` restricted to the selected features, in the order below.

    Raises ``KeyError`` when any of the listed columns is missing.
    """
    selected_features = (
        'International plan',
        'Voice mail plan',
        'Number vmail messages',
        'Total day minutes',
        'Total day charge',
        'Total eve minutes',
        'Total eve charge',
        'Total night minutes',
        'Total night charge',
        'Total intl minutes',
        'Total intl calls',
    )
    return df[list(selected_features)]


## 03-CREACION DE FUNCTION_TRANSFORMERS.

In [38]:
# Wrap transf_entero in a FunctionTransformer so it can be used as a pipeline step.
Trans_entero=FunctionTransformer(transf_entero)
Trans_entero

In [39]:
# Wrap mapear_international_plan in a FunctionTransformer.
Mapeo_01=FunctionTransformer(mapear_international_plan)
Mapeo_01

In [40]:
# Wrap mapear_Voice_mail_plan in a FunctionTransformer.
Mapeo_02=FunctionTransformer(mapear_Voice_mail_plan)
Mapeo_02

In [41]:
# Wrap eliminar_state in a FunctionTransformer.
Drop_state=FunctionTransformer(eliminar_state)
Drop_state

In [42]:
# Wrap eliminar_outliers in a FunctionTransformer.
Outliers_drop=FunctionTransformer(eliminar_outliers)
Outliers_drop

In [43]:
# Wrap norma_variables in a FunctionTransformer.
Normalizacion=FunctionTransformer(norma_variables)
Normalizacion

In [44]:
# Wrap filtrar_columnas in a FunctionTransformer.
Best_caracter=FunctionTransformer(filtrar_columnas)
Best_caracter

## 04-CREACION PIPELINES.

In [45]:
# Build the preprocessing pipeline; the steps run in the order listed.
Prepro01 = Pipeline(steps=[
    ("Trans_entero", Trans_entero),  # adds the 'transf_entero' function as a pipeline step
    ("Mapeo_01", Mapeo_01),
    ("Mapeo_02", Mapeo_02),
    ("Drop_state", Drop_state),
    ("Outliers_drop", Outliers_drop),
    ("Normalizacion", Normalizacion),
    ("Best_features", Best_caracter)
])
Prepro01

In [46]:
# Chain the Prepro01 preprocessing pipeline with the random forest that was
# instantiated and fitted earlier in this script.
MODELO_PREPRO=Pipeline([
    ("PREPROCESAMIENTO",Prepro01),
    ("RANDOM_FOREST",forest_model)
])
MODELO_PREPRO

In [47]:
# QUICK SMOKE TEST TO CHECK THAT THE PIPELINE WORKS
# Load the raw (unprocessed) dataset.
file_id = "16ypxCIBr9wSGVEaXqWdZUfz9w4xzccwo"
download_link = f"https://drive.google.com/uc?id={file_id}"
# Bound the wait and stop on HTTP errors so that an error page is never
# silently parsed as if it were the CSV.
response = requests.get(download_link, timeout=30)
response.raise_for_status()
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data, encoding='utf-8')
# Predict with the full pipeline (preprocessing + model).
MODELO_PREPRO.predict(df)

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
# Save the full pipeline (preprocessing + model) to a .joblib file.
joblib.dump(MODELO_PREPRO, 'Model_prepro.joblib')

['Model_prepro.joblib']

In [25]:
# URL of the serialized pipeline (.joblib file).
url = "https://github.com/Martinerramuspe/02-DATA_SCIENCIE/raw/main/01-TELECOM_CHURN_PROJECT/05-DESPLIEGUE/Model_prepro.joblib"

# Download the file. Bound the wait and stop on HTTP errors so that an error
# page is never handed to the unpickler.
response = requests.get(url, timeout=30)
response.raise_for_status()

# SECURITY: joblib.load unpickles the payload, and unpickling can execute
# arbitrary code. Only load artifacts from a source you trust.
# NOTE(review): the pipeline wraps helper functions defined in this notebook;
# presumably they must already be defined in the session for loading to
# succeed — confirm.
Modelo_cargado = joblib.load(BytesIO(response.content))

In [26]:
# Predict with the pipeline that was loaded from the downloaded file.
Modelo_cargado.predict(df)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Load the pipeline from a local .joblib file (hard-coded absolute Windows path).
Model_prepro = joblib.load(r'C:\Users\erram\Model_prepro.joblib')