# DiploDatos Kaggle Competition

Presentamos un código creado como ejemplo de base para la competición.

Deben:

- Explorar los datos y aprender de ellos.
- Probar diferentes modelos y ver cuáles ajustan mejor dados los datos.
- **Obtener una accuracy mejor que la que se presenta en este ejemplo.**
- Tratar de obtener la accuracy más alta posible!
- Discutir la elección de modelo.

El análisis exploratorio y el preprocesamiento de los datos quedan a libertad de cada grupo, que no debe quedarse con este simple ejemplo.

In [None]:
import itertools
import pandas as pd
import numpy as np
# gráficos
import matplotlib.pyplot as plt
import seaborn as sns
# preprocesamiento de datos
from sklearn.preprocessing import (
    StandardScaler, 
    LabelEncoder, 
    OneHotEncoder, 
    OrdinalEncoder, 
    MinMaxScaler,
)
from sklearn.impute import SimpleImputer
# aprendizaje automático supervisado
from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    StratifiedKFold, 
    cross_val_score, 
    cross_validate, 
    KFold, 
    LeaveOneOut, 
    LeavePOut,
)
# métricas
from sklearn.metrics import (
    recall_score, 
    accuracy_score, 
    precision_score, 
    f1_score, 
    roc_auc_score, 
    mean_squared_error, 
    r2_score, 
    confusion_matrix, 
    classification_report, 
    ConfusionMatrixDisplay,
)
# modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Matrix to plot; indexed as ``cm[row, column]``.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When True, every row is divided by its own sum before plotting and
        the cell annotations are shown with two decimals.
    title : str
        Text placed above the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Values above half the maximum are written in white, the rest in black.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row, col in itertools.product(range(n_rows), range(n_cols)):
        value = cm[row, col]
        text_color = "white" if value > midpoint else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('Etiqueta correcta')
    plt.xlabel('Etiqueta predicha')

### Análisis exploratorio y visualización del DataFrame

In [None]:
# Training set: turn the Google Drive share link into a direct-download URL
URL = 'https://drive.google.com/file/d/16SSOt06KitPEkAXQwfPyM16jojqCxwtl/view?usp=sharing'
file_id = URL.split('/')[-2]  # the path segment just before "view?usp=sharing"
path = f'https://drive.google.com/uc?export=download&id={file_id}'
df = pd.read_csv(path)

In [None]:
# Test set: turn the Google Drive share link into a direct-download URL
URL = 'https://drive.google.com/file/d/1EVGW3CQeKZjtkdusFIp9KjqF87rFl264/view?usp=sharing'
file_id = URL.split('/')[-2]  # the path segment just before "view?usp=sharing"
path = f'https://drive.google.com/uc?export=download&id={file_id}'
test_df = pd.read_csv(path)

In [None]:
df

In [None]:
# Analizamos las columnas
df.columns

In [None]:
# Analizamos la cantidad de datos por columna y si hay valores nulos
df.info()

In [None]:
# Analizamos la cantidad de datos faltantes
missing_values_count = df.isna().sum()
missing_values_count

### Imputación y transformación de variables

In [None]:
# Pull the target out of the rest of the variables
# NOTE(review): X is a full copy of df, so it still contains the Transported
# column at this point; that column is only dropped further down, after the
# scaling and imputation steps have already run on it.
y = df.Transported
X = df.copy()

In [None]:
# Prefix every destination value with "Destination_" so the dummy columns
# built from it later get self-describing names; missing values stay missing.
dic_dest = {name: f'Destination_{name}'
            for name in ('TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22')}
dic_dest[np.nan] = np.nan
X['Destination'] = X['Destination'].map(dic_dest)

In [None]:
# One-hot encode Destination with get_dummies.
# Rows whose Destination is missing get NaN in every dummy column (instead of
# a row of zeros) so that the imputer can fill them in later. The mask is
# applied in one vectorised step and works for any number of categories.
missing_dest = X['Destination'].isna()
x_dest = pd.get_dummies(X['Destination'], dtype=float)
x_dest.loc[missing_dest, :] = np.nan
a = x_dest.columns
X[a] = x_dest

In [None]:
# Prefix every home-planet value with "HomePlanet_" so the dummy columns
# built from it later get self-describing names; missing values stay missing.
dic_planet = {name: f'HomePlanet_{name}' for name in ('Earth', 'Europa', 'Mars')}
dic_planet[np.nan] = np.nan
X['HomePlanet'] = X['HomePlanet'].map(dic_planet)

In [None]:
# One-hot encode HomePlanet with get_dummies.
# Rows whose HomePlanet is missing get NaN in every dummy column (instead of
# a row of zeros) so that the imputer can fill them in later. The mask is
# computed from this column itself, so the cell does not depend on the
# Destination dummies having been built first.
missing_home = X['HomePlanet'].isna()
x_home = pd.get_dummies(X['HomePlanet'], dtype=float)
x_home.loc[missing_home, :] = np.nan
a = x_home.columns
X[a] = x_home

In [None]:
# Separamos en tres columnas "Cabin"
X[['Cabin_Deck','Cabin_Num','Cabin_Side']] = X['Cabin'].str.split('/',expand=True)

In [None]:
# Integer-encode the categorical columns; categories are numbered in sorted
# order, starting at 0.
# Missing values are kept as NaN instead of being turned into one more
# category code, so no later step has to guess which number meant "missing".
Label_cols = ['VIP', 'CryoSleep', 'Cabin_Deck', 'Cabin_Side' ]
for col in Label_cols:
    codes = X[col].astype('category').cat.codes  # -1 marks a missing value
    X[col] = codes.where(codes >= 0)             # -1 -> NaN (column becomes float)

In [None]:
# Turn the placeholder codes back into NaN so they can be imputed later.
# NOTE(review): 2 and 8 are presumably the codes that stood for "missing"
# after encoding (two real categories -> 0/1, eight decks -> 0..7) — confirm.
# Any genuine category that happened to receive code 2 or 8 would be blanked
# as well.
Label_col = ['VIP', 'CryoSleep', 'Cabin_Side' ]
X[Label_col] = X[Label_col].replace(2, np.nan)
X['Cabin_Deck'] = X['Cabin_Deck'].replace(8, np.nan)

In [None]:
# Limpiamos el dataset antes de imputar los valores faltantes
X = X.drop(['Cabin', 'Destination', 'HomePlanet', 'Name', 'PassengerId'],axis=1)

In [None]:
X.isna().sum()

In [None]:
X['Cabin_Num'] = X['Cabin_Num'].astype(float)
X.info()

In [None]:
# Standardize the values before imputing.
# NOTE(review): X still includes the Transported target column here (it is
# dropped only after imputation), so the target is scaled too and is visible
# to the imputer. The scaler is also fitted on every row, before the
# train/validation split — confirm this is intended.
x_names = X.columns  # kept so the column names can be restored after scaling returns an array
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Fill every missing value with a k-nearest-neighbours imputer
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=9, weights="uniform")
imputed_values = knn_imputer.fit_transform(X.copy())

# Wrap the imputed array in a DataFrame again, restoring the column names
X_knn = pd.DataFrame(imputed_values, columns=x_names)
X_knn

In [None]:
X_knn.describe()

In [None]:
X_knn.isna().sum()

In [None]:
# Analizamos la correlación entre las variables
plt.figure(figsize=(16,10))
sns.heatmap(X_knn.corr(),annot=True,fmt='.2g')
plt.title('Correlacion entre variables', fontsize=14)

La característica que más se correlaciona con nuestro target es CryoSleep, aunque la correlación es baja, al igual que la del resto.

In [None]:
X_knn.isna().sum()

In [None]:
# Elimino mi target del resto del dataset
X_knn = X_knn.drop('Transported', axis=1)

In [None]:
# Dividimos los datos para el entrenamiento y test
x_train, x_test, y_train, y_test = train_test_split(X_knn, y, train_size=0.8, random_state = 8)

In [None]:
x_train

### Prueba con varios modelos de aprendizaje supervisado

In [None]:
# Display names, in the same order as the estimators below
names = [
    'Arbol de decisión',
    'Random Forest',
    'Regresión Logística',
    'Perceptrón multicapa',
    'XGBoost',
    'Naive Bayes',
    'SVM',
]

# Baseline estimators, all with their default hyper-parameters
clfs = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MLPClassifier(),
    XGBClassifier(),
    GaussianNB(),
    LinearSVC(),
]

trained_models, accuracy_models = [], []
for name, clf in zip(names, clfs):
    print(name)
    clf.fit(x_train, y_train)

    # Accuracy on the same data the model was fitted on
    train_predictions = clf.predict(x_train)
    accuracy = accuracy_score(y_train, train_predictions)
    print(f"Accuracy train {name}: {accuracy * 100.0:.2f}%")

    # Accuracy on the held-out split; this is the value that gets stored
    test_predictions = clf.predict(x_test)
    accuracy = accuracy_score(y_test, test_predictions)
    print(f"Accuracy test {name}: {accuracy * 100.0:.2f}%")

    trained_models.append(clf)
    accuracy_models.append(accuracy)

#### Regresión Logística

In [None]:
# Hyper-parameter grid for logistic regression.
# It is split into two sub-grids because not every solver supports every
# penalty: 'newton-cg', 'lbfgs' and 'sag' only accept 'l2', while 'liblinear'
# and 'saga' also accept 'l1'. Requesting an unsupported pair makes that fit
# fail, so those combinations are simply not generated.
shared_params_re = {
    'C': [0.6, 0.7, 0.8, 0.9, 0.5, 1],
    'random_state': [43],
    'max_iter': [100, 200, 1000],
}
param_grid_re = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        **shared_params_re,
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        **shared_params_re,
    },
]

In [None]:
# Logistic regression: grid search scored by accuracy (cv left at its default)
clf_re = LogisticRegression()
cv_re = GridSearchCV(estimator=clf_re, param_grid=param_grid_re, scoring='accuracy')
cv_re.fit(x_train, y_train);

In [None]:
cv_re.best_params_

In [None]:
# Report the mean accuracy and its spread for every configuration
results = cv_re.cv_results_
# 'mean_test_score': mean cross-validated accuracy
# 'std_test_score': standard deviation of that accuracy across the folds
# Stored as df_re (not df) so the raw training DataFrame is not overwritten.
df_re = pd.DataFrame(results)
report_columns = [
    'param_penalty',
    'param_solver',
    'param_C',
    'param_random_state',
    'param_max_iter',
    'mean_test_score',
    'std_test_score',
]
df_re[report_columns].sort_values(by=['mean_test_score'], ascending=False)

#### Perceptrón multicapa 

In [None]:
# Multilayer perceptron: hyper-parameter grid for the search below
from sklearn import neural_network
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

param_grid = [dict(
    hidden_layer_sizes=[10, 15],
    activation=['logistic', 'tanh', 'relu'],
    solver=['sgd'],
    alpha=[0.0001, 0.001, 0.01],
    batch_size=[50, 20],
    random_state=[43],
    learning_rate=['constant', 'adaptive'],
    learning_rate_init=[0.001, 0.01, 0.1],
    shuffle=[True],
    verbose=[True],
    tol=[0.0022, 0.01, 0.0001, 0.0008],
    max_iter=[1000],
)]

In [None]:
# Grid search over the MLP grid, scored by accuracy.
# cv is left at None (the default), i.e. the default 5-fold cross validation.
model = neural_network.MLPClassifier()
cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy')
cv.fit(x_train, y_train);

In [None]:
cv.cv_results_.keys()

In [None]:
cv.best_params_

In [None]:
# Table of every MLP configuration, best mean accuracy first.
# 'mean_test_score' is the mean accuracy, 'std_test_score' its spread across folds.
results_neu = cv.cv_results_
df_neu = pd.DataFrame(results_neu).sort_values(by=['mean_test_score'], ascending=False)
df_neu

In [None]:
# Build the multilayer perceptron with one fixed set of hyper-parameters
from sklearn import neural_network
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

clf = neural_network.MLPClassifier(
    activation='tanh',
    solver='sgd',
    alpha=0.01,
    # One hidden layer of 10 units. Written as a real 1-tuple: "(10)" is just
    # the integer 10 in parentheses.
    hidden_layer_sizes=(10,),
    learning_rate='adaptive',
    learning_rate_init=0.1,
    batch_size=20,
    random_state=43,
    max_iter=1000,
    verbose=True,
    shuffle=True,
    tol=0.0001,
    )

In [None]:
# Fit on the training split. The labels are passed as a plain 1-D array via
# to_numpy(); Series.ravel() is deprecated in recent pandas releases.
clf.fit(x_train, y_train.to_numpy())

In [None]:
clf.score(x_test, y_test)

In [None]:
# Count the trainable parameters of the fitted network, layer by layer.
# coefs_[i] holds the weight matrix and intercepts_[i] the bias vector of
# layer i; every layer is visited, however many the network has.
total_params = 0
for layer, (weights, biases) in enumerate(zip(clf.coefs_, clf.intercepts_)):
    print(f'weights[{layer}]', '=', np.size(weights))
    print(f'bias[{layer}]', '=', np.size(biases))
    total_params += np.size(weights) + np.size(biases)

print('total params = ', total_params)

In [None]:
np.round(np.exp(clf.predict_log_proba(x_train)),1)

In [None]:
predictions = clf.predict(x_train)
print (f'Accuracy: {accuracy_score(y_train, predictions)*100:.2f}')

#### Random Forest

In [None]:
# Hyper-parameter grid for the random forest search
param_grid_rf = [dict(
    n_estimators=[180, 200, 250],
    criterion=['gini'],
    max_depth=[10, 20],
    max_features=[3, 4, 5],
    n_jobs=[-1],  # -1 means using all processors
    random_state=[0],
)]

In [None]:
# Random forest: grid search scored by accuracy (cv left at its default)
clf_rf = RandomForestClassifier()
cv = GridSearchCV(estimator=clf_rf, param_grid=param_grid_rf, scoring='accuracy')
cv.fit(x_train, y_train);

In [None]:
cv.cv_results_.keys()

In [None]:
cv.best_params_

In [None]:
# Table of every random-forest configuration, best mean accuracy first
results_rf = cv.cv_results_
df_rf = pd.DataFrame(results_rf).sort_values(by=['mean_test_score'], ascending=False)
df_rf

In [None]:
cv.score(x_train, y_train)

In [None]:
predictions = cv.predict(x_train)
print (f'Accuracy: {accuracy_score(y_train, predictions)*100:.2f}')


#### DecisionTreeClassifier


In [None]:
# Hyper-parameter grid for the decision tree search.
# The criterion names must be the exact identifiers the estimator accepts,
# so the second one is spelled 'entropy' (the accented 'entropía' is rejected
# and every fit using it fails).
param_grid_dt = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': list(range(3, 16)),  # 3, 4, ..., 15
    'min_samples_leaf': [1, 5, 10, 15, 20],
    'random_state': [0],
    'ccp_alpha': [0, 0.2, 0.6, 0.9, 1, 1.3, 2, 5],
}

In [None]:
# Decision tree: grid search scored by accuracy.
# cv is left at None (the default), i.e. the default 5-fold cross validation.
clf_dt = DecisionTreeClassifier()
cv_dt = GridSearchCV(estimator=clf_dt, param_grid=param_grid_dt, scoring='accuracy')
cv_dt.fit(x_train, y_train)

In [None]:
cv_dt.best_params_

In [None]:
# Reportamos accuracy promedio y varianza para todas las configuraciones
results_dt = cv_dt.cv_results_
df_dt = pd.DataFrame(results_dt).sort_values(by=['mean_test_score'], ascending=False)
df_dt

In [None]:
cv_dt.score(x_train, y_train)

In [None]:
predictions = cv_dt.predict(x_train)
print (f'Accuracy: {accuracy_score(y_train, predictions)*100:.2f}')

#### Naive Bayes

In [None]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
gnb.score(x_test, y_test)

In [None]:
# Metrics of the Naive Bayes model on the held-out split
y_pred = gnb.predict(x_test)
print("Métricas de validación para el mejor ajuste")
print("Accuracy = %s" % accuracy_score(y_test, y_pred))
print("Precision = %s" % precision_score(y_test, y_pred))
print("Recall = %s" % recall_score(y_test, y_pred))
print("F1 = %s" % f1_score(y_test, y_pred))

# The target is Transported, so the two classes are labelled accordingly
# (the previous labels described a different, unrelated problem).
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, ['No transportado_0', 'Transportado_1'])

#### XGBoost

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [None]:
xgb.score(x_test, y_test)

In [None]:
from scipy.stats import uniform, randint

# XGBoost: search over tree depth and number of boosting rounds
model = XGBClassifier(random_state=0)
params = dict(
    max_depth=range(1, 11, 2),
    n_estimators=range(50, 400, 50),
)

# cv is left at None (the default), i.e. the default 5-fold cross validation
cv = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy')
cv.fit(x_train, y_train)

In [None]:
cv.best_params_

In [None]:
# Print the best configuration, then mean accuracy (and its spread) for each one
results = cv.cv_results_
print(f"Best: {cv.best_score_:f} using {cv.best_params_}")
means = results['mean_test_score']
stds = results['std_test_score']
parameters = results['params']
for mean, stdev, param in zip(means, stds, parameters):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")


In [None]:
# Plot mean accuracy against n_estimators, one line per max_depth value.
# The two ranges repeat the ones used in the search grid, so the flat array of
# mean scores can be reshaped into a (max_depth, n_estimators) table.
max_depth = range(1, 11, 2)
n_estimators = range(50, 400, 50)
scores = np.array(means).reshape(len(max_depth), len(n_estimators))
for depth, depth_scores in zip(max_depth, scores):
    plt.plot(n_estimators, depth_scores, label=f'depth: {depth}')
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.savefig('n_estimators_vs_max_depth.png')

In [None]:
# Metrics of the best estimator found by the search, on the held-out split
best_model = cv.best_estimator_
y_pred = best_model.predict(x_test)
print("Métricas de validación para el mejor ajuste")
print("Accuracy = %s" % accuracy_score(y_test, y_pred))
print("Precision = %s" % precision_score(y_test, y_pred))
print("Recall = %s" % recall_score(y_test, y_pred))
print("F1 = %s" % f1_score(y_test, y_pred))

# The target is Transported, so the two classes are labelled accordingly
# (the previous labels described a different, unrelated problem).
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, ['No transportado_0', 'Transportado_1'])

#### SVM

In [None]:
# Usamos SVM, clasificador lineal basado en una máquina de soporte compacto
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

Grandes valores de C resultan en márgenes menores y pequeños valores de C resultan en márgenes más amplios

In [None]:
# Sweep the regularisation strength C of a linear SVM and record the accuracy
# obtained for each value.
# NOTE(review): the accuracy is measured on the training split itself, so the
# "best" C chosen from it may not generalise — confirm this is intended.
metrics = {}
# Plain 1-D label array; Series.ravel() is deprecated in recent pandas releases.
y_train_values = y_train.to_numpy()
for C in np.linspace(0.1, 100, 10):
    # "hinge" is the standard SVM loss
    clf = LinearSVC(C=C, loss="hinge", random_state=26)
    clf.fit(x_train, y_train_values)
    predictions = clf.predict(x_train)
    # accuracy as a percentage, keyed by the C value that produced it
    metrics['C = ' + str(round(C, 3))] = accuracy_score(y_train, predictions) * 100

print('\n=== Accuracys ===\n')
metrics

In [None]:
# Show the C value with the highest recorded accuracy
best_c = max(metrics, key=metrics.get)
best_accuracy = max(metrics.values())
print(f'=== Best Model ===\n {best_c}, Accuracy: {best_accuracy}')

### Usamos VotingClassifier para los mejores modelos

Entre los modelos entrenados que mayor accuracy nos dieron, instanciamos los mejores con los hiperparámetros encontrados durante el entrenamiento, para obtener una mejor predicción con el Voting, que es un clasificador compuesto de varios clasificadores. En este caso usamos Random Forest (Bagging), un perceptrón multicapa (red neuronal) y XGBoost (Boosting). El "Voting" simplemente elige la clase que tuvo "más votos" de los modelos que lo componen. El método de votación elegido es "soft", que considera la probabilidad que cada modelo asigna a cada clase.

In [None]:
from sklearn.ensemble import VotingClassifier

# Ensemble member 1: random forest
clf1 = RandomForestClassifier(
    criterion = 'entropy',
    n_jobs = -1,
    max_depth = 20,
    max_features = 5,
    n_estimators = 200,
    random_state = 0,
    )

# Ensemble member 2: multilayer perceptron
clf2 = neural_network.MLPClassifier(
    activation='tanh',
    solver='sgd',
    alpha=0.01,
    # One hidden layer of 10 units, written as a real 1-tuple
    hidden_layer_sizes=(10,),
    learning_rate='adaptive',
    learning_rate_init=0.1,
    batch_size=20,
    random_state=43,
    max_iter=1000,
    verbose=True,
    shuffle=True,
    tol=0.0001,
    )

# Ensemble member 3: XGBoost
clf3 = XGBClassifier(
        max_depth= 7,
        n_estimators = 150
        )

# Soft voting: the members' predicted class probabilities are combined
eclf1 = VotingClassifier(estimators=[('RF', clf1), ('PM', clf2), ('NE', clf3)], voting='soft')
eclf1.fit(x_train, y_train)
predictions = eclf1.predict(x_test)
# Report on the ensemble's own predictions. The leftover `test_predictions`
# variable belongs to the last model of the baseline loop, not to this ensemble.
print(classification_report(y_test, predictions))


## Generar la salida para entregar

#### Imputación y curación

In [None]:
test_df

In [None]:
test_df.info()

In [None]:
X_t = test_df.copy()

In [None]:
# Prefix every destination value with "Destination_" so the dummy columns
# built from it later get self-describing names. Missing values are mapped to
# NaN here; they are not replaced by the mode.
dic_dest_t = {name: f'Destination_{name}'
              for name in ('TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22')}
dic_dest_t[np.nan] = np.nan
X_t['Destination'] = X_t['Destination'].map(dic_dest_t)

In [None]:
# One-hot encode Destination with get_dummies.
# Rows whose Destination is missing get NaN in every dummy column (instead of
# a row of zeros) so that the imputer can fill them in later. The mask is
# applied in one vectorised step and works for any number of categories.
missing_dest_t = X_t['Destination'].isna()
x_dest_t = pd.get_dummies(X_t['Destination'], dtype=float)
x_dest_t.loc[missing_dest_t, :] = np.nan
a = x_dest_t.columns
X_t[a] = x_dest_t

In [None]:
# Prefix every home-planet value with "HomePlanet_" so the dummy columns
# built from it later get self-describing names. Missing values are mapped to
# NaN here; they are not replaced by the mode.
dic_planet_t = {name: f'HomePlanet_{name}' for name in ('Earth', 'Europa', 'Mars')}
dic_planet_t[np.nan] = np.nan
X_t['HomePlanet'] = X_t['HomePlanet'].map(dic_planet_t)

In [None]:
# One-hot encode HomePlanet with get_dummies.
# Rows whose HomePlanet is missing get NaN in every dummy column (instead of
# a row of zeros) so that the imputer can fill them in later. The mask is
# applied in one vectorised step and works for any number of categories.
missing_home_t = X_t['HomePlanet'].isna()
x_home_t = pd.get_dummies(X_t['HomePlanet'], dtype=float)
x_home_t.loc[missing_home_t, :] = np.nan
a = x_home_t.columns
X_t[a] = x_home_t

In [None]:
# Separamos en tres columnas "Cabin"
X_t[['Cabin_Deck','Cabin_Num','Cabin_Side']] = X_t['Cabin'].str.split('/',expand=True)

In [None]:
# Integer-encode the categorical columns; categories are numbered in sorted
# order, starting at 0.
# Missing values are kept as NaN instead of being turned into one more
# category code, so the result does not depend on how many distinct
# categories happen to be present in the test set.
# NOTE(review): the numbering is derived from the test data alone; it matches
# the training encoding only if the same categories occur in both — confirm.
Label_cols = ['VIP', 'CryoSleep', 'Cabin_Deck', 'Cabin_Side' ]
for col in Label_cols:
    codes = X_t[col].astype('category').cat.codes  # -1 marks a missing value
    X_t[col] = codes.where(codes >= 0)             # -1 -> NaN (column becomes float)

In [None]:
# Turn the placeholder codes back into NaN so they can be imputed later.
# NOTE(review): 2 and 8 are presumably the codes that stood for "missing"
# after encoding (two real categories -> 0/1, eight decks -> 0..7) — confirm
# that the test set really contains that many categories. Any genuine
# category that happened to receive code 2 or 8 would be blanked as well.
Label_col = ['VIP', 'CryoSleep', 'Cabin_Side' ]
X_t[Label_col] = X_t[Label_col].replace(2, np.nan)
X_t['Cabin_Deck'] = X_t['Cabin_Deck'].replace(8, np.nan)

In [None]:
id_t = X_t['PassengerId']

In [None]:
# Limpiamos el dataset antes de imputar los valores faltantes
X_t = X_t.drop(['Cabin', 'Destination', 'HomePlanet', 'Name', 'PassengerId'],axis=1)

In [None]:
X_t['Cabin_Num'] = X_t['Cabin_Num'].astype(float)
X_t.info()

In [None]:
# Standardize the values before imputing.
# NOTE(review): this rebinds `scaler` and `x_names` and fits a brand-new
# scaler on the test set, so the test features are standardized with the test
# set's own mean/std rather than with the statistics used for the training
# features — confirm this is intended.
x_names = X_t.columns  # kept so the column names can be restored after scaling returns an array
scaler = StandardScaler()
X_t = scaler.fit_transform(X_t)

In [None]:
# Fill every missing value with a k-nearest-neighbours imputer.
# A new imputer is created and fitted on the test set here.
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=9, weights="uniform")
imputed_values_t = knn_imputer.fit_transform(X_t.copy())

# Wrap the imputed array in a DataFrame again, restoring the column names
X_knn_t = pd.DataFrame(imputed_values_t, columns=x_names)
X_knn_t

### Generamos la salida

In [None]:
# Realizamos la predicción con el conjunto de test
test_pred = eclf1.predict(X_knn_t)

In [None]:
# Pair each passenger id with its prediction and write the submission file
submission = pd.DataFrame(
    {"PassengerId": id_t.to_numpy(), "Transported": test_pred}
)
submission.to_csv("sample_submission.csv", index=False, header=True)

In [None]:
submission

In [None]:
#from google.colab import files
#files.download('sample_submission.csv')