# Scripts del Proyecto

### Script 1: Preparación de datos para el entrenamiento

In [1]:
import pandas as pd

In [3]:
# Leemos la data de desarrollo de modelos
df = pd.read_csv('../data/raw/insurances.csv')

In [4]:
# Leemos la data de desarrollo de modelos
df = pd.read_csv('../data/raw/insurances.csv')

In [5]:
# Recodificamos variables 
df["Gender"] = df["Gender"].replace({"Male": 0, "Female": 1}).astype("int32")
df["Region_Code"] = df["Region_Code"].astype(int)
df["Vehicle_Age"] = df["Vehicle_Age"].replace({"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}).astype("int32")
df["Vehicle_Damage"] = df["Vehicle_Damage"].replace({"No": 0, "Yes": 1}).astype("int32")
df["Annual_Premium"] = df["Annual_Premium"].astype(int)
df["Policy_Sales_Channel"] = df["Policy_Sales_Channel"].astype(int)

In [6]:
# Transformamos variables
df["Previously_Insured_Annual_Premium"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Annual_Premium"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Age"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Age"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Damage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Damage"].astype(str)
)[0]

df["Previously_Insured_Vintage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vintage"].astype(str)
)[0]

In [8]:
dfp = df[["Gender","Age","Previously_Insured","Vehicle_Age","Vehicle_Damage","Annual_Premium","Policy_Sales_Channel","Vintage","Previously_Insured_Annual_Premium","Previously_Insured_Vehicle_Damage","Previously_Insured_Vintage", "Response"]]
dfp.to_csv("../data/processed/insurance_train.csv")

### Script 2: Código de entrenamiento

In [9]:
import xgboost as xgb
import pickle

In [10]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/insurance_train.csv")
X_train = df.drop(['Response'],axis=1)
y_train = df[['Response']]

In [11]:
# Entrenamos el modelo con toda la muestra
xgb_mod=xgb.XGBClassifier(max_depth=2, n_estimators=50, objective='binary:logistic', seed=0, silent=True, subsample=.8)
xgb_mod.fit(X_train, y_train)

Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
# Guardamos el modelo entrenado para usarlo en produccion
filename = '../models/best_model.pkl'
pickle.dump(xgb_mod, open(filename, 'wb'))

### Script 3: Preparación de Datos de Validación


In [13]:
import pandas as pd

In [14]:
# Leemos la tabla de validación
df = pd.read_csv("../data/raw/insurances_new.csv")

In [None]:
# Recodificamos variables 
df["Gender"] = df["Gender"].replace({"Male": 0, "Female": 1}).astype("int32")
df["Region_Code"] = df["Region_Code"].astype(int)
df["Vehicle_Age"] = df["Vehicle_Age"].replace({"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}).astype("int32")
df["Vehicle_Damage"] = df["Vehicle_Damage"].replace({"No": 0, "Yes": 1}).astype("int32")
df["Annual_Premium"] = df["Annual_Premium"].astype(int)
df["Policy_Sales_Channel"] = df["Policy_Sales_Channel"].astype(int)

In [None]:
# Transformamos variables
df["Previously_Insured_Annual_Premium"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Annual_Premium"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Age"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Age"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Damage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Damage"].astype(str)
)[0]

df["Previously_Insured_Vintage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vintage"].astype(str)
)[0]

In [None]:
dfp = df[["Gender","Age","Previously_Insured","Vehicle_Age","Vehicle_Damage","Annual_Premium","Policy_Sales_Channel","Vintage","Previously_Insured_Annual_Premium","Previously_Insured_Vehicle_Damage","Previously_Insured_Vintage", "Response"]]
dfp.to_csv("../data/processed/insurance_val.csv")

### Script 4: Código de Validación

In [210]:
import pandas as pd
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [None]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/insurance_val.csv")
X_test = df.drop(['Response'],axis=1)
y_test = df[['Response']]

In [None]:
# Leemos el modelo entrenado!
filename = '../models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [None]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
y_pred_test=model.predict(df.drop(['Response'],axis=1)) 

In [None]:
## Metricas de validación
def calc_metrics(y_test,y_pred_test):
    cm_test = confusion_matrix(y_test,y_pred_test)
    print("Matriz de confusion: ")
    print(cm_test)
    accuracy_test=accuracy_score(y_test,y_pred_test)
    print("Accuracy: ", accuracy_test)
    precision_test=precision_score(y_test,y_pred_test)
    print("Precision: ", precision_test)
    recall_test=recall_score(y_test,y_pred_test)
    print("Recall: ", recall_test)

In [None]:
def save_plot(title):
    plt.title(title)
    fig = plt.gcf()
    filename = title.replace(" ", "_").lower()
    fig.savefig('{}'.format(filename), dpi=500)
    plt.clf()

In [None]:
plot_confusion_matrix(model, X_test, y_test)
save_plot('Confusion Matrix')

In [None]:
plot_roc_curve(model, X_test, y_test)
save_plot('ROC Curve')

### Script 5: Preparación de Datos de Score (Automatización)

In [None]:
# Leemos la tabla de entrenamiento
df = pd.read_csv("../data/raw/insurance_score.csv").set_index('ID')

In [None]:
# Recodificamos variables 
df["Gender"] = df["Gender"].replace({"Male": 0, "Female": 1}).astype("int32")
df["Region_Code"] = df["Region_Code"].astype(int)
df["Vehicle_Age"] = df["Vehicle_Age"].replace({"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}).astype("int32")
df["Vehicle_Damage"] = df["Vehicle_Damage"].replace({"No": 0, "Yes": 1}).astype("int32")
df["Annual_Premium"] = df["Annual_Premium"].astype(int)
df["Policy_Sales_Channel"] = df["Policy_Sales_Channel"].astype(int)

In [None]:

# Transformamos variables
df["Previously_Insured_Annual_Premium"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Annual_Premium"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Age"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Age"].astype(str)
)[0]

df["Previously_Insured_Vehicle_Damage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vehicle_Damage"].astype(str)
)[0]

df["Previously_Insured_Vintage"] = pd.factorize(
    df["Previously_Insured"].astype(str) + df["Vintage"].astype(str)
)[0]

In [None]:
dfp = df[["Gender","Age","Previously_Insured","Vehicle_Age","Vehicle_Damage","Annual_Premium","Policy_Sales_Channel","Vintage","Previously_Insured_Annual_Premium","Previously_Insured_Vehicle_Damage","Previously_Insured_Vintage", "Response"]]
dfp.to_csv("../data/processed/insurance_score.csv")

### Scipt 6: Código de Scoring (Automatización)

In [None]:
import pandas as pd
import xgboost as xgb
import pickle

In [None]:
# Cargar la tabla transformada
df = pd.read_csv("../data/processed/insurance_score.csv")

In [None]:
# Leemos el modelo entrenado!
filename = '../models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [None]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
scores=model.predict(df).reshape(-1,1)

In [None]:
# Exportamos el resultado del modelo para cargarlo en el Feature Store o Data Mart de Modelos
# Le asignamos nombres a las columnas
df_score = pd.DataFrame(scores, columns=['PREDICT'])
# Exportamos la solucion
df_score.to_csv('../data/scores/final_score.csv')