In [1]:
import pandas as pd
import numpy as np
import itertools
import copy

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [130]:
# Read the training data and keep only the rows whose target Y is present.
df_train = pd.read_csv('data/train.csv').dropna(subset=['Y'])

# Target is kept as a one-column DataFrame; predictors are every other column.
y = df_train[['Y']]
X = df_train.drop(columns='Y')

# Shuffled 80/20 split with a fixed seed, used later for local evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=0, shuffle=True
)

# Predictors of the real test set (scored at the end of the notebook).
X_test_real = pd.read_csv('data/Xtest.csv')

In [131]:
# Keyword arguments shared by every GridSearchCV built in this notebook.
default_grid_params = {
    'cv': 5,
    'n_jobs': 4,
    'verbose': 1,
    'scoring': 'neg_log_loss',
}

# Grid for KNeighborsClassifier; trailing comments list alternative values.
param_grid_knn = dict(
    n_neighbors=[100],     # 3, 5, 7, 10, 14, 20, 50, 100, 200, 500, 1000
    weights=['distance'],  # 'uniform'
    p=[1],                 # 2
)

# Grid for LogisticRegression.
param_grid_lr = dict(
    penalty=['l2'],
    C=[100, 1000],         # 0.001, 0.01, 0.1, 1, 10,
    fit_intercept=[True],  # False
)

# Grid handed to the tree ensemble registered under the 'Random Forest' key.
param_grid_rf = dict(
    n_estimators=[5000],
    criterion=["entropy"],
    max_depth=[400],       # 100
    min_samples_split=[2],
    min_samples_leaf=[1],
)

# Grid whose keys match GradientBoostingClassifier (presumably its target).
param_grid_gb = dict(
    loss=['log_loss'],
    learning_rate=[0.02, 0.1, 0.5],      # 0.1, 0.5
    n_estimators=[300, 500, 750, 1000],  # 100, 200
    criterion=['squared_error'],
    max_depth=[None, 2, 5, 10],          # None, 2, 10
    validation_fraction=[0.2],
    n_iter_no_change=[5],
    tol=[1e-4],
)

# Grid for HistGradientBoostingClassifier.
param_grid_hgb = dict(
    loss=['log_loss'],
    learning_rate=[0.02],
    max_iter=[420],
    max_leaf_nodes=[None],
    min_samples_leaf=[20],
    l2_regularization=[1],
    validation_fraction=[0.2],
    n_iter_no_change=[5],
    tol=[1e-4],
)

## Pipeline for models

In [132]:
class Std(TransformerMixin):
    """Limit extreme values to ``n_std`` standard deviations around the mean.

    The column means and standard deviations are learned in ``fit`` and
    re-used in ``transform``, so new data is treated with the statistics of
    the training data.

    Values outside ``mean +/- n_std * std`` are clipped to that boundary
    rather than having their rows removed: a transformer inside a
    scikit-learn ``Pipeline`` / ``ColumnTransformer`` must return exactly one
    output row per input row, otherwise X no longer lines up with y or with
    the other column branches.

    Parameters
    ----------
    n_std : float, default=4
        Half-width of the accepted interval, in standard deviations.
    """

    def __init__(self, n_std=4):
        self.n_std = n_std

    def fit(self, X, y=None, **fit_params):
        """Learn the per-column mean and standard deviation of ``X``.

        ``X`` is expected to be a 2-D numeric array-like without missing
        values (it is placed after an imputer in this notebook).
        """
        values = np.asarray(X, dtype=float)
        self.mean_ = np.mean(values, axis=0)
        self.std_ = np.std(values, axis=0)
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a float array with every column clipped.

        The output always has the same shape as the input.
        """
        values = np.asarray(X, dtype=float)
        lower = self.mean_ - self.n_std * self.std_
        upper = self.mean_ + self.n_std * self.std_
        # A constant column (std == 0) gives lower == upper == mean, so its
        # values are left at the mean instead of every row being discarded.
        return np.clip(values, lower, upper)


def preprocessing(sub, X_train_sub, X_test_sub, categorical_candidates=("X11", "X12")):
    """Build the preprocessing ColumnTransformer for a subset of columns.

    Parameters
    ----------
    sub : iterable of str
        Names of the predictor columns to use.
    X_train_sub, X_test_sub : pandas.DataFrame
        Frames containing (at least) the columns named in ``sub``.
    categorical_candidates : iterable of str, default=("X11", "X12")
        Columns that are treated as categorical when they appear in ``sub``.
        Every other column of ``sub`` is treated as numeric.

    Returns
    -------
    preprocessor : ColumnTransformer
        Unfitted transformer with a numeric branch (median imputation, the
        ``Std`` step, standard scaling) and a categorical branch
        (most-frequent imputation, one-hot encoding).
    X_train_sub, X_test_sub : pandas.DataFrame
        The input frames, with the selected categorical columns cast to the
        ``category`` dtype. They are returned unchanged when ``sub`` contains
        no categorical column.
    """
    # Categorical columns keep the order of `categorical_candidates`; numeric
    # columns keep the order in which they appear in `sub`.
    categorical_features = [col for col in categorical_candidates if col in sub]
    numeric_features = [col for col in sub if col not in categorical_features]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # alternatives: mean, most_frequent
        ('std', Std()),
        ('scaler', StandardScaler())], verbose=True)

    if categorical_features:
        category_dtypes = {col: 'category' for col in categorical_features}
        X_train_sub = X_train_sub.astype(category_dtypes)
        X_test_sub = X_test_sub.astype(category_dtypes)

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # mandatory
        # One-hot encoding (author's note: same as pd.get_dummies).
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))], verbose=True)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return preprocessor, X_train_sub, X_test_sub

### Checks whether the columns found by correlation are the same as those found by combination.

### Generic code to build our model.

In [139]:
## Fit all models
# Predictor columns used by every model in this run.
sub = ["X1", "X6", "X10", "X11", "X12"]

# Start from the train/validation split; preprocessing() returns the frames
# with the categorical columns cast to the `category` dtype.
preprocessor, X_train_sub, X_test_sub = preprocessing(sub, X_train, X_test)

# One grid search per candidate model. Only the tree ensemble is active; the
# commented entries are the other candidates.
# NOTE: the 'Random Forest' key actually holds an ExtraTreesClassifier. The key
# is kept because it is used as the model's display name further down.
# random_state is fixed so the fitted forest, and the log-loss computed from
# it, are reproducible from one run to the next.
grids = {#'KNN': GridSearchCV(KNeighborsClassifier(), param_grid_knn, **default_grid_params),
        #'Logistic Regression': GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, **default_grid_params),
        'Random Forest': GridSearchCV(ExtraTreesClassifier(random_state=0), param_grid_rf, **default_grid_params),
        #'Gradient Boosting': GridSearchCV(HistGradientBoostingClassifier(), param_grid_hgb, **default_grid_params)
        }

class DenseTransformer(TransformerMixin):
    """Convert the output of the previous pipeline step to a dense ndarray.

    ``ColumnTransformer`` returns either a SciPy sparse matrix or a dense
    array depending on the density of its output, so both cases are handled:
    sparse input is densified, anything else is passed through
    ``np.asarray``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; present to satisfy the transformer interface."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a ``numpy.ndarray``."""
        # SciPy sparse containers expose `toarray`; plain ndarrays do not.
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [140]:
# Wrap every grid search in the full pipeline (preprocessing -> dense array ->
# PCA -> classifier), fit it on the training split and store the fitted
# pipeline back in `grids` under the same name.
# Iterating over a snapshot keeps the loop independent of the in-loop updates.
for model_name, estimator in list(grids.items()):
    if isinstance(estimator, Pipeline):
        # The cell was already run: refit the existing pipeline instead of
        # nesting it inside a second preprocessing/PCA pipeline.
        pipeline = estimator
    else:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('to_dense', DenseTransformer()),
                                   # keep enough components to explain 96.5% of the variance
                                   ('pca', PCA(n_components=0.965, random_state=42, svd_solver='full', n_oversamples=100)),
                                   ('classifier', estimator)], verbose=True)
    pipeline.fit(X_train_sub, y_train.values.ravel())
    grids[model_name] = pipeline
    print(f'{model_name} fitted')

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing std, total=   0.0s
[Pipeline] ............ (step 3 of 3) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s
[Pipeline] ...... (step 1 of 4) Processing preprocessor, total=   0.1s
[Pipeline] .......... (step 2 of 4) Processing to_dense, total=   0.0s
[Pipeline] ............... (step 3 of 4) Processing pca, total=   0.1s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Pipeline] ........ (step 4 of 4) Processing classifier, total=11.6min
Random Forest fitted


## Compute the predicted probability for each class on the held-out test split and evaluate it with the log-loss.

In [141]:
# Evaluate every fitted pipeline on the held-out split (X_test_sub / y_test),
# i.e. on rows that were not used for fitting.
print("Log-loss on held-out test set...")
log_loss_test = {}
for model_name, model in grids.items():
    pred_prob_holdout = model.predict_proba(X_test_sub)
    # Pass the classes seen during fitting so the probability columns are
    # matched to the right labels even if a class is absent from y_test.
    loss = log_loss(y_test.values.ravel(), pred_prob_holdout, labels=model.classes_)
    print(f'{model_name}: {loss}')
    log_loss_test[model_name] = loss

Log-loss on training set...
Random Forest: 0.29201919470309745


In [142]:
# Pick the model name with the smallest log-loss recorded in log_loss_test.
best_model = min(log_loss_test.items(), key=lambda name_and_loss: name_and_loss[1])[0]
print(f'Best model is {best_model} with a log-loss of {log_loss_test[best_model]}')

Best model is Random Forest with a log-loss of 0.29201919470309745


## Predict on the test predictors, and save the probabilities to a csv file. 

In [None]:
# Class probabilities of the best model on the real test predictors,
# one column per class, renamed positionally to Y_1 ... Y_7.
class_column_names = {0: 'Y_1', 1: 'Y_2', 2: 'Y_3', 3: 'Y_4', 4: 'Y_5', 5:'Y_6', 6:'Y_7'}
class_probabilities = grids[best_model].predict_proba(X_test_real)
pred_prob_test = pd.DataFrame(class_probabilities).rename(columns=class_column_names)

# The row position doubles as the submission id and becomes the first column.
idx = pred_prob_test.index
pred_prob_test.insert(0, 'id', idx)

# Write the submission file and preview its first rows.
pred_prob_test.to_csv("Group13.csv", index=False)
pred_prob_test.head()

https://askcodez.com/comment-ameliorer-randomforest-la-performance.html

https://stats.stackexchange.com/questions/53240/practical-questions-on-tuning-random-forests