# Ensamble los modelos de la parte III y IV en uno solo. ¿Cuál es su score en validación y en test?

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import MeanEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle

In [2]:
# Load the raw data and separate the features from the target column 'class'.
df = pd.read_csv('data/secondary_data.csv', sep=';')
X = df.drop(columns='class')
y = df['class']

# First cut: 80% for training, 20% held out.
X_train, X_resto, y_train, y_resto = train_test_split(X, y, test_size=0.2, random_state=123)
# Second cut: halve the held-out part, giving 10% test and 10% validation overall.
X_test, X_validacion, y_test, y_validacion = train_test_split(X_resto, y_resto, test_size=0.5, random_state=123)

In [3]:
# Replace missing values with the constant placeholder '#'.
# The imputer is fitted on the training split only and then applied to all three splits.
si = SimpleImputer(strategy='constant', fill_value='#')
si.fit(X_train)
# The frames are rebuilt from the imputer output without passing an index,
# so each one gets a fresh default 0..n-1 index.
X_train, X_test, X_validacion = (
    pd.DataFrame(si.transform(particion), columns=particion.columns)
    for particion in (X_train, X_test, X_validacion)
)

# Set the continuous columns aside as floats; they are appended back, unencoded,
# once the categorical columns have been encoded.
continuas = ['cap-diameter','stem-height','stem-width']
continuas_train, continuas_test, continuas_validacion = (
    particion[continuas].astype(float)
    for particion in (X_train, X_test, X_validacion)
)

# Leave only the categorical columns in the feature frames.
X_train = X_train.drop(columns=continuas)
X_test = X_test.drop(columns=continuas)
X_validacion = X_validacion.drop(columns=continuas)

# Encode the target as a single numeric column.
# drop='first' removes the indicator of the first category and [:, 0] keeps the
# first remaining column (presumably the target is binary, so this is a 0/1 vector - TODO confirm).
# The encoder is fitted on the training labels only.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit(y_train.values.reshape(-1, 1))
y_train, y_test, y_validacion = (
    ohe.transform(etiquetas.values.reshape(-1, 1))[:, 0]
    for etiquetas in (y_train, y_test, y_validacion)
)

# One-hot encode the columns listed in ohe_columnas; the encoder is fitted on train only.
# drop='if_binary' keeps a single indicator for two-category columns.
ohe_columnas = ['does-bruise-or-bleed','gill-spacing','veil-type','has-ring','ring-type','habitat','season']
ohe = OneHotEncoder(drop='if_binary', sparse_output=False)
ohe.fit(X_train[ohe_columnas])
ohe_encodeadas_train, ohe_encodeadas_test, ohe_encodeadas_validacion = (
    pd.DataFrame(ohe.transform(particion[ohe_columnas]))
    for particion in (X_train, X_test, X_validacion)
)

# Mean-encode every remaining column: all columns of df except the one-hot ones,
# the continuous ones and the target. The order of df.columns is preserved.
excluidas = set(ohe_columnas) | set(continuas) | {'class'}
me_columnas = [columna for columna in df.columns if columna not in excluidas]
me = MeanEncoder()
# Only the training split is fitted (with its target); test and validation are just transformed.
me_encodeadas_train = me.fit_transform(X_train[me_columnas], y_train)
me_encodeadas_test = me.transform(X_test[me_columnas])
me_encodeadas_validacion = me.transform(X_validacion[me_columnas])

# Rebuild the three feature matrices with the column order:
# mean-encoded columns, then one-hot columns, then continuous columns.
# NOTE(review): presumably this order must match what the pickled models were fitted on - confirm.
columnas_me_ohe = me_columnas + list(ohe.get_feature_names_out())

# pd.concat(axis=1) aligns rows by index, so all pieces are expected to share the same row index.

# Training split.
encodeadas_train = pd.concat([me_encodeadas_train, ohe_encodeadas_train], axis=1)
encodeadas_train.columns = columnas_me_ohe
X_train = pd.concat([encodeadas_train, continuas_train], axis=1)

# Test split.
encodeadas_test = pd.concat([me_encodeadas_test, ohe_encodeadas_test], axis=1)
encodeadas_test.columns = columnas_me_ohe
X_test = pd.concat([encodeadas_test, continuas_test], axis=1)

# Validation split.
encodeadas_validacion = pd.concat([me_encodeadas_validacion, ohe_encodeadas_validacion], axis=1)
encodeadas_validacion.columns = columnas_me_ohe
X_validacion = pd.concat([encodeadas_validacion, continuas_validacion], axis=1)

## Importo los modelos de RandomForest y XGBoost

In [4]:
# Load the fitted models saved by the earlier parts.
# The files are opened in `with` blocks so the handles are closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by trusted notebooks.

# Random Forest from part 3.
with open('modelos/rf.pkl', 'rb') as archivo_rf:
    rf = pickle.load(archivo_rf)

# XGBoost from part 4.
with open('modelos/xgb.pkl', 'rb') as archivo_xgb:
    xg = pickle.load(archivo_xgb)

In [5]:
# Stack the two loaded models under a logistic-regression meta-learner.
# cv='prefit' tells StackingClassifier the base estimators are already fitted, so only
# the final estimator is trained here, on the base models' predictions for X_train.
# NOTE(review): if rf and xg were trained on these same rows, the meta-learner sees
# in-sample predictions and may overfit - confirm which data the pickled models were fitted on.
modelos_base = [('RandomForest', rf), ('XGBoost', xg)]
ensamble = StackingClassifier(
    estimators=modelos_base,
    final_estimator=LogisticRegression(),
    cv='prefit',
)
ensamble.fit(X_train, y_train)

## AUC ROC

In [6]:
# ROC AUC of the ensemble on the validation split.
# The last column of predict_proba is used as the score.
proba_validacion = ensamble.predict_proba(X_validacion)
preds_vali = proba_validacion[:, -1]
print(f'Puntaje auc_roc en validación: {roc_auc_score(y_validacion, preds_vali)}')

Puntaje auc_roc en validación: 1.0


In [7]:
# ROC AUC of the ensemble on the test split.
# The last column of predict_proba is used as the score.
proba_test = ensamble.predict_proba(X_test)
preds_test = proba_test[:, -1]
print(f'Puntaje auc_roc en test: {roc_auc_score(y_test, preds_test)}')

Puntaje auc_roc en test: 1.0
