In [5]:
import pandas as pd
import numpy as np
import itertools
import copy

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Read the training data and keep only the rows where the target Y is present.
df_train = pd.read_csv('data/train.csv').dropna(subset=['Y'])

# Separate the target from the predictors (y stays a one-column DataFrame).
y = df_train[['Y']]
X = df_train.drop(columns='Y')

# Shuffled 80/20 split with a fixed seed: one part for fitting, one held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
    shuffle=True,
)

# Predictors of the real test set, used later for the final predictions.
X_test_real = pd.read_csv('data/Xtest.csv')

In [7]:
# Settings shared by every GridSearchCV: 5-fold CV, 4 parallel jobs, silent,
# candidates ranked by negative log-loss.
# Written as a literal instead of dict(...) so this cell keeps working even when the
# name `dict` has been rebound in the kernel (another cell of this notebook assigns to it).
default_grid_params = {'cv': 5, 'n_jobs': 4, 'verbose': 0, 'scoring': 'neg_log_loss'}

# In the grids below, the trailing comment on a line lists alternative values
# that are not part of the active grid.

# K-nearest neighbours.
param_grid_knn = {
    'n_neighbors': [100],  # 3, 5, 7, 10, 14, 20, 50, 100, 200, 500, 1000
    'weights': ['distance'],  # 'uniform'
    'p': [1],  # 2
}

# Logistic regression.
param_grid_lr = {
    'penalty': ['l2'],
    'C': [100, 1000],  # 0.001, 0.01, 0.1, 1, 10
    'fit_intercept': [True],  # False
}

# Tree ensemble registered under the 'Random Forest' key.
param_grid_rf = {
    'n_estimators': [1000, 2000, 4000],  # 1000
    'criterion': ["log_loss"],
    'max_depth': [50, 100, 300, 500],  # 100
    'min_samples_split': [2, 5],  # 2
    'min_samples_leaf': [1, 2],  # 1
}

# Gradient boosting; validation_fraction / n_iter_no_change / tol configure early stopping.
param_grid_gb = {
    'loss': ['log_loss'],
    'learning_rate': [0.02, 0.1, 0.5],  # 0.1, 0.5
    'n_estimators': [300, 500, 750, 1000],  # 100, 200
    'criterion': ['squared_error'],
    'max_depth': [None, 2, 5, 10],  # None, 2, 10
    'validation_fraction': [0.2],
    'n_iter_no_change': [5],
    'tol': [1e-4],
}

## Pipeline for models

In [None]:
def preprocessing(sub, X_train_sub, X_test_sub, categorical_columns=("X11", "X12")):
    """Build the preprocessing transformer for a subset of predictors.

    The selected columns are split into categorical ones (those listed in
    ``categorical_columns``) and numeric ones (everything else). Categorical
    columns of both frames are cast to the pandas ``category`` dtype.

    Parameters
    ----------
    sub : sequence of str
        Names of the selected predictor columns. It is not modified.
    X_train_sub, X_test_sub : pandas.DataFrame
        Training and evaluation frames restricted to ``sub``.
    categorical_columns : sequence of str, default ("X11", "X12")
        Columns to treat as categorical when they appear in ``sub``.

    Returns
    -------
    tuple
        ``(preprocessor, X_train_sub, X_test_sub)`` where ``preprocessor`` is
        an unfitted ``ColumnTransformer`` and the two frames are the inputs
        with their categorical columns cast to ``category``.
    """
    # Categorical columns follow the order of `categorical_columns`;
    # numeric columns keep the order they have in `sub`.
    categorical_features = [col for col in categorical_columns if col in sub]
    numeric_features = [col for col in sub if col not in categorical_features]

    if categorical_features:
        category_dtypes = {col: 'category' for col in categorical_features}
        X_train_sub = X_train_sub.astype(category_dtypes)
        X_test_sub = X_test_sub.astype(category_dtypes)

    # Numeric columns: fill missing values with the column mean, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # alternatives: median, most_frequent
        ('scaler', StandardScaler())])

    # Categorical columns: fill missing values with the most frequent level
    # (marked as mandatory by the original author), then one-hot encode while
    # dropping the first level of each feature.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return preprocessor, X_train_sub, X_test_sub

### Permet de vérifier si les colonnes trouvées par la corrélation sont bien les mêmes que celles trouvées par combinaison.

In [None]:
# Exhaustive feature-subset search: for every combination of predictors, preprocess the
# columns, fit a grid-searched KNN on the training split and record its log-loss on the
# hold-out split, keyed by (subset size, subset columns).
# NOTE(review): the results mapping is still called `dict`, which shadows the built-in.
# The name is kept because code outside this cell may read it; consider renaming it everywhere.
dict = {}
y_train_flat = y_train.values.ravel()  # loop-invariant 1-D target expected by fit()
for i in range(1, 13):  # subset sizes 1..12
    # Iterate the combinations lazily instead of materialising the whole list first.
    for sub in itertools.combinations(X_train.columns, i):
        sub = list(sub)
        X_train_sub = X_train[sub]
        X_test_sub = X_test[sub]

        preprocessor, X_train_sub, X_test_sub = preprocessing(sub, X_train_sub, X_test_sub)

        # A local search object is used so the notebook-level `grids` mapping
        # (which holds the fitted models of later cells) is not overwritten here.
        knn_search = GridSearchCV(KNeighborsClassifier(), param_grid_knn, **default_grid_params)
        model = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', knn_search)])
        model.fit(X_train_sub, y_train_flat)
        print("KNN fitted")

        pred_prob_holdout = model.predict_proba(X_test_sub)
        # Pass the fitted class labels so the probability columns stay aligned
        # even when a class is absent from the hold-out split.
        loss = log_loss(y_test, pred_prob_holdout, labels=model.classes_)
        print(f'KNN : {loss} for {sub}')
        dict[(i, tuple(sub))] = loss

### Code générique pour construire notre modèle.

In [8]:
## Fit all models
# Predictors used by every candidate model.
sub = ["X1", "X6", "X10", "X11", "X12"]
X_train_sub = X_train[sub]
X_test_sub = X_test[sub]

preprocessor, X_train_sub, X_test_sub = preprocessing(sub, X_train_sub, X_test_sub)

# One grid search per model family, all sharing the same CV settings.
# random_state is pinned on the stochastic ensembles so that re-running the notebook
# gives the same fitted models (same seed value as the train/test split).
# NOTE(review): the 'Random Forest' entry actually tunes an ExtraTreesClassifier;
# confirm which estimator is intended before renaming the key or swapping the class.
grids = {
    'KNN': GridSearchCV(KNeighborsClassifier(), param_grid_knn, **default_grid_params),
    'Logistic Regression': GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, **default_grid_params),
    'Random Forest': GridSearchCV(ExtraTreesClassifier(random_state=0), param_grid_rf, **default_grid_params),
    'Gradient Boosting': GridSearchCV(GradientBoostingClassifier(random_state=0), param_grid_gb, **default_grid_params),
}

In [10]:
# Wrap each grid search in a pipeline with the shared preprocessor, fit it on the
# training split, and store the fitted pipeline back in `grids` under the same name.
for model_name, estimator in list(grids.items()):
    # If this cell already ran (fully, or partially before an interrupt), the entry is
    # the Pipeline stored by that run. Take its inner search back out so it is not
    # wrapped in a second preprocessing pipeline, which would make the cell fail on re-run.
    if isinstance(estimator, Pipeline):
        estimator = estimator.named_steps['classifier']
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', estimator)])
    model.fit(X_train_sub, y_train.values.ravel())
    grids[model_name] = model
    print(f'{model_name} fitted')

Random Forest fitted


KeyboardInterrupt: 

## Compute the predicted probability for each class on the hold-out test set and evaluate the log-loss.

In [11]:
# Score every fitted pipeline in `grids` on the hold-out split (X_test_sub / y_test)
# and collect the log-loss per model name in `log_loss_test`.
print("Log-loss on hold-out test set...")
log_loss_test = {}
for model_name, model in grids.items():
    pred_prob_holdout = model.predict_proba(X_test_sub)
    # Pass the fitted class labels so the probability columns stay aligned
    # even when a class is absent from the hold-out split.
    loss = log_loss(y_test, pred_prob_holdout, labels=model.classes_)
    print(f'{model_name}: {loss}')
    log_loss_test[model_name] = loss

Log-loss on training set...
Random Forest: 0.33469724616722873


NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Pick the model with the lowest log-loss on the hold-out split.
# The lookup uses `log_loss_test`, whose keys are the model names used in `grids`,
# so `best_model` can be used directly as `grids[best_model]`.
best_model = min(log_loss_test, key=log_loss_test.get)
print(f'Best model is {best_model} with a log-loss of {log_loss_test[best_model]}')

## Predict on the test predictors, and save the probabilities to a csv file. 

In [None]:
# Map the positional probability columns 0..6 to the submission column names.
class_column_names = {
    0: 'Y_1',
    1: 'Y_2',
    2: 'Y_3',
    3: 'Y_4',
    4: 'Y_5',
    5: 'Y_6',
    6: 'Y_7',
}

# Class probabilities of the selected model on the real test predictors.
raw_probabilities = grids[best_model].predict_proba(X_test_real)
pred_prob_test = pd.DataFrame(raw_probabilities).rename(columns=class_column_names)

# Prepend the row index as the 'id' column, then write the submission file.
idx = pred_prob_test.index
pred_prob_test.insert(0, 'id', idx)
pred_prob_test.to_csv("Group13.csv", index=False)
pred_prob_test.head()

https://askcodez.com/comment-ameliorer-randomforest-la-performance.html

https://stats.stackexchange.com/questions/53240/practical-questions-on-tuning-random-forests