In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
#Load training dataset, remove missing values in Y, create X and Y matrices.

df_train = pd.read_csv('data/train.csv')

df_train.dropna(subset=['Y'], inplace=True)

X = df_train.drop('Y', axis = 1)
y = df_train[['Y']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, shuffle=True, random_state=0
)

#Load test predictors

X_test_real = pd.read_csv('data/Xtest.csv')

In [3]:
default_grid_params = dict(cv=5, n_jobs=4)

param_grid_knn = {
    'n_neighbors': [100], # 3, 5, 7, 10, 14, 20, 50, 100, 200, 500, 1000
    'weights': ['distance'], # 'uniform'
    'p': [1], # 2
}

param_grid_lr = {
    'penalty': ['l2'],
    'C': [100, 1000], # 0.001, 0.01, 0.1, 1, 10, 
    'fit_intercept': [True], # False
}

param_grid_rf = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'loss': ['log_loss'],
    'learning_rate': [0.02], # 0.1, 0.5
    'n_estimators': [300], # 100, 200 
    'criterion': ['squared_error'],
    'max_depth': [5], # None, 2, 10
}

## Pipeline for models

In [4]:
#Select predictors, apply pre-processing steps, define the model, and fit the model to the training data.
sub = ["X1", "X2", "X6", "X11", "X12"]
X_train_sub  = X_train[sub]
X_test_sub = X_test[sub]

"""
- If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.
- If “median”, then replace missing values using the median along each column. Can only be used with numeric data.
- If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data.
    If there is more than one such value, only the smallest is returned.
- If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
"""

numeric_features = sub
numeric_features.remove("X12")
numeric_features.remove("X11")
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # mean, median, most_frequent
    ('scaler', StandardScaler())])

categorical_features = ['X11' if "X11" in sub else None] + ['X12' if "X12" in sub else None]
categorical_features.remove(None)
categorical_features.remove(None)
X_train_sub = X_train_sub.astype({'X11':'category', 'X12':'category'})
X_test_sub = X_test_sub.astype({'X11':'category', 'X12':'category'})

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # obligatoire
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))]) #Same as pd.get_dummies 


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

grids = {'KNN': GridSearchCV(KNeighborsClassifier(), param_grid_knn, scoring='neg_log_loss', **default_grid_params),
        'Logistic Regression': GridSearchCV(LogisticRegression(), param_grid_lr, scoring='neg_log_loss', **default_grid_params),
        'Random Forest': GridSearchCV(RandomForestClassifier(), param_grid_rf, scoring='neg_log_loss', **default_grid_params),
        'Gradient Boosting': GridSearchCV(GradientBoostingClassifier(), param_grid_gb, scoring='neg_log_loss', **default_grid_params)
        }

## Fit all models 

In [5]:
for model_name, model in grids.items():
    model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])
    model.fit(X_train_sub, y_train.values.ravel())
    grids[model_name] = model
    print(f'{model_name} fitted')

## Compute the predicted probability for each class on the training set and evaluate on the log-loss.

In [None]:
print("Log-loss on training set...")
log_loss_test = {}
for model_name, model in grids.items():
    pred_prob_train = pd.DataFrame(model.predict_proba(X_test_sub))
    loss = log_loss(y_test, pred_prob_train)
    print(f'{model_name}: {loss}')
    log_loss_test[model_name] = loss

In [None]:
# Best model compare by the log-loss on the test set.
best_model = min(log_loss_test, key=log_loss_test.get)
print(f'Best model is {best_model} with a log-loss of {log_loss_test[best_model]}')

Best model is Random Forest with a log-loss of 0.4532672388265402


## Predict on the test predictors, and save the probabilities to a csv file. 

In [8]:
pred_prob_test = pd.DataFrame(grids[best_model].predict_proba(X_test_real))
pred_prob_test.rename(columns = {0: 'Y_1', 1: 'Y_2', 2: 'Y_3', 3: 'Y_4', 4: 'Y_5', 5:'Y_6', 6:'Y_7'}, inplace = True)
idx = pred_prob_test.index
pred_prob_test.insert(0, 'id', idx)
pred_prob_test.to_csv("Group13.csv", index=False)
pred_prob_test.head()