# Scripts del Proyecto

### Script 1: Preparacion de datos para el entrenamiento

In [96]:
import numpy as np
import pandas as pd

In [97]:
## Leemos la data de entrenamiento
dataset = pd.read_csv('../data/raw/Data.csv',sep=',')

In [98]:
# Convertimos la variale TotalCharges
dataset[['TotalCharges']] = dataset[['TotalCharges']].apply(pd.to_numeric, errors='coerce')

In [99]:
from sklearn import preprocessing

## selection of category variables
target = 'Churn'
exclude = ['customerID','Churn']

cols = [x for x in dataset.columns if x not in exclude + [target]]
cols_cat = dataset[cols].select_dtypes(['object']).columns.tolist()
index_categorical=[cols.index(x) for x in cols_cat]

## For Training
for i in cols_cat:
    le = preprocessing.LabelEncoder()
    le.fit(list(dataset[i].dropna()))
    dataset.loc[~dataset[i].isnull(),i]=le.transform(dataset[i].dropna())

In [100]:
# Eliminamos los valores faltantes
dataset = dataset.dropna(axis = 0)

In [101]:
# Mantener sólo las variables relevantes para el Modelo
dfp = dataset
dfp.to_csv("../data/processed/Data_train.csv")

### Script 2: Código de Entrenamiento

In [102]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
import pickle

In [103]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/Data_train.csv")
df = df.iloc[:,1:]
X_train = df.drop(['Churn','customerID'],axis=1)
y_train = df[['Churn']]

In [104]:
# Entrenamos el modelo con toda la muestra
ros = RandomOverSampler(random_state=2022)

# fit predictor and target variablex_ros, 
x_ros, y_ros = ros.fit_resample(X_train, y_train)

# Entrenamos el modelo con toda la muestra
rf = RandomForestClassifier(random_state=2022)
rf.fit(x_ros, y_ros) # Entrenando un algoritmo

  rf.fit(x_ros, y_ros) # Entrenando un algoritmo


In [105]:
# Guardamos el modelo entrenado para usarlo en produccion
filename = '../models/best_model.pkl'
pickle.dump(rf, open(filename, 'wb'))

### Script 3: Preparación de Datos de Validación

In [106]:
import numpy as np
import pandas as pd

In [107]:
## Leemos la data de entrenamiento
dataset = pd.read_csv('../data/raw/Data_new.csv',sep=',')

In [108]:
# Convertimos la variale TotalCharges
dataset[['TotalCharges']] = dataset[['TotalCharges']].apply(pd.to_numeric, errors='coerce')

In [109]:
from sklearn import preprocessing

## selection of category variables
target = 'Churn'
exclude = ['customerID','Churn']

cols = [x for x in dataset.columns if x not in exclude + [target]]
cols_cat = dataset[cols].select_dtypes(['object']).columns.tolist()
index_categorical=[cols.index(x) for x in cols_cat]

## For Training
for i in cols_cat:
    le = preprocessing.LabelEncoder()
    le.fit(list(dataset[i].dropna()))
    dataset.loc[~dataset[i].isnull(),i]=le.transform(dataset[i].dropna())

In [110]:
# Eliminamos los valores faltantes
dataset = dataset.dropna(axis = 0)

In [111]:
# Mantener sólo las variables relevantes para el Modelo
dfp = dataset
dfp.to_csv("../data/processed/Data_val.csv")

### Script 4: Código de Validación

In [112]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [113]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/Data_val.csv")
df = df.iloc[:,1:]
X_test = df.drop(['Churn','customerID'],axis=1)
y_test = df[['Churn']]

In [114]:
# Leemos el modelo entrenado!
filename = '../models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [115]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
y_pred_test=model.predict(df.drop(['Churn','customerID'],axis=1)) 

In [116]:
## Metricas de validación
def calc_metrics(y_test,y_pred_test):
    cm_test = confusion_matrix(y_test,y_pred_test)
    print("Matriz de confusion: ")
    print(cm_test)
    accuracy_test=accuracy_score(y_test,y_pred_test)
    print("Accuracy: ", accuracy_test)
    precision_test=precision_score(y_test,y_pred_test)
    print("Precision: ", precision_test)
    recall_test=recall_score(y_test,y_pred_test)
    print("Recall: ", recall_test)

In [117]:
def save_plot(title):
    plt.title(title)
    fig = plt.gcf()
    filename = title.replace(" ", "_").lower()
    fig.savefig('{}'.format(filename), dpi=500)
    plt.clf()

In [118]:
plot_confusion_matrix(model, X_test, y_test)
save_plot('Confusion Matrix')



<Figure size 432x288 with 0 Axes>

In [119]:
plot_roc_curve(model, X_test, y_test)
save_plot('ROC Curve')



<Figure size 432x288 with 0 Axes>

### Script 5: Preparación de Datos de Score (Automatización)

In [120]:
import numpy as np
import pandas as pd

In [121]:
## Leemos la data de entrenamiento
dataset = pd.read_csv('../data/raw/Data_score.csv',sep=',')

In [122]:
# Convertimos la variale TotalCharges
dataset[['TotalCharges']] = dataset[['TotalCharges']].apply(pd.to_numeric, errors='coerce')

In [123]:
from sklearn import preprocessing

## selection of category variables
target = 'Churn'
exclude = ['customerID','Churn']

cols = [x for x in dataset.columns if x not in exclude + [target]]
cols_cat = dataset[cols].select_dtypes(['object']).columns.tolist()
index_categorical=[cols.index(x) for x in cols_cat]

## For Training
for i in cols_cat:
    le = preprocessing.LabelEncoder()
    le.fit(list(dataset[i].dropna()))
    dataset.loc[~dataset[i].isnull(),i]=le.transform(dataset[i].dropna())

In [124]:
# Eliminamos los valores faltantes
dataset = dataset.dropna(axis = 0)

In [125]:
# Mantener sólo las variables relevantes para el Modelo
dfp = dataset.drop(['Churn','customerID'],axis=1)
dfp.to_csv("../data/processed/Data_score.csv")

### Scipt 6: Código de Scoring (Automatización)

In [126]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
import pickle

In [127]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/Data_score.csv")
df = df.iloc[:,1:]

In [128]:
# Leemos el modelo entrenado!
filename = '../models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [129]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
scores=model.predict(df).reshape(-1,1)

In [130]:
# Exportamos el resultado del modelo para cargarlo en el Feature Store o Data Mart de Modelos
# Le asignamos nombres a las columnas
df_score = pd.DataFrame(scores, columns=['PREDICT'])
# Exportamos la solucion
df_score.to_csv('../data/scores/final_score.csv')