In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Fonctions pour affiner les modèles

Le but de ce notebook est de répertorier différentes fonctions permettant d'ajuster les paramètres des modèles afin d'obtenir la MAE (erreur absolue moyenne) la plus faible possible.

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## RandomForest 

### Cross Validation 

Paramétrage de `n_estimators` pour un nombre de plis `cv` donné

In [None]:
def get_score_RF(n_estimators, cv):
    """Cross-validated mean absolute error of an imputer + random-forest pipeline.

    The pipeline is scored on the notebook-level variables ``X`` and ``y``,
    which must already be defined when this function is called.

    Parameters
    ----------
    n_estimators
        Number of trees handed to ``RandomForestRegressor``.
    cv
        Value forwarded unchanged to the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    The mean, over the cross-validation scores, of the sign-flipped
    ``neg_mean_absolute_error`` values.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    stages = [('imputer', SimpleImputer()), ('model', forest)]
    pipeline = Pipeline(steps=stages)
    neg_mae = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_absolute_error')
    # The scorer returns the negated MAE, so flip the sign before averaging.
    mae_per_fold = -1 * neg_mae
    return mae_per_fold.mean()

### Classique MAE pour RF

In [6]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to compute the error.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    # Fixed: the keyword was misspelled ``reandom_state``, which made the
    # constructor raise TypeError before any fitting happened.
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## XG-Boost 

Paramétrage de `n_estimators`

In [2]:
def get_score_XGBoost(n_estimator):
    """Validation MAE of an ``XGBRegressor`` built with ``n_estimator`` trees.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_valid`` / ``y_valid``; all four, as well as
    ``XGBRegressor`` and ``mean_absolute_error``, must already be available
    in the notebook namespace.

    Parameters
    ----------
    n_estimator
        Value passed as ``n_estimators`` to ``XGBRegressor``.

    Returns
    -------
    The result of ``mean_absolute_error`` between the predictions and ``y_valid``.
    """
    booster = XGBRegressor(n_estimators=n_estimator, random_state=0)
    booster.fit(X_train, y_train)
    return mean_absolute_error(booster.predict(X_valid), y_valid)

Paramétrage du `learning_rate`

In [None]:
def get_score_XGB_lr(learning_rate):
    """Validation MAE of a 1000-tree ``XGBRegressor`` for a given learning rate.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_valid`` / ``y_valid``; all four, as well as
    ``XGBRegressor`` and ``mean_absolute_error``, must already be available
    in the notebook namespace.

    Parameters
    ----------
    learning_rate
        Value passed as ``learning_rate`` to ``XGBRegressor``.

    Returns
    -------
    The result of ``mean_absolute_error`` between the predictions and ``y_valid``.
    """
    booster = XGBRegressor(n_estimators=1000, learning_rate=learning_rate, random_state=0)
    booster.fit(X_train, y_train)
    return mean_absolute_error(booster.predict(X_valid), y_valid)

# 2 Fonctions de featuring

## 2.1 Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Mutual-information score of every column of ``X`` with respect to ``y``.

    Parameters
    ----------
    X
        Feature table; its ``columns`` become the index of the result.
    y
        Target passed to ``mutual_info_regression``.
    discrete_features
        Forwarded unchanged to ``mutual_info_regression``.

    Returns
    -------
    A ``pd.Series`` named ``"MI Scores"``, indexed by column name and sorted
    from the highest score to the lowest.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

In [5]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current pyplot figure.

    The scores are sorted in ascending order and drawn at positions
    ``0 .. len(scores) - 1``; each bar is labelled on the y axis with the
    matching index entry of ``scores``. Nothing is returned.

    Parameters
    ----------
    scores
        A ``pd.Series`` of scores whose index holds the bar labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

# 3 Fonctions de preprocess 

## 3.1 Categorical Data 

Fonction qui, pour un jeu de données d'entraînement donné, retourne les colonnes catégorielles : avec leur cardinalité si `card` vaut `True` (pour vraiment être flemmard), sinon uniquement la liste de leurs noms.

In [7]:
def get_obj_cols(X_train, card=False):
    """List the columns of ``X_train`` whose dtype is ``object``.

    Parameters
    ----------
    X_train
        DataFrame to inspect.
    card
        When false (default), return only the column names. When true,
        return each column together with its number of distinct values.

    Returns
    -------
    ``list`` of column names when ``card`` is false, otherwise a ``dict``
    mapping each object column name to ``X_train[col].nunique()``.
    """
    # Fixed: the original read ``X_train.col``, i.e. an attribute literally
    # named "col", instead of looking up each column.
    obj_cols = [col for col, dtype in X_train.dtypes.items() if dtype == 'object']
    if card:
        return {col: int(X_train[col].nunique()) for col in obj_cols}
    return obj_cols

Fonction qui, pour une liste de colonnes catégorielles et un seuil de cardinalité donné, répartit les colonnes entre encodage ordinal (cardinalité supérieure ou égale au seuil) et encodage One-Hot (cardinalité inférieure au seuil)

In [None]:
def get_obj_cols_strategy(obj_cols, card):
    """Split columns between ordinal and one-hot encoding by cardinality.

    Cardinalities are measured on the notebook-level ``X_train`` DataFrame,
    which must already be defined when this function is called.

    Parameters
    ----------
    obj_cols
        Iterable of column names present in ``X_train``.
    card
        Threshold: a column with at least ``card`` distinct values goes to
        the ordinal list, any other column goes to the one-hot list.

    Returns
    -------
    ``(ordinal_cols, one_hot_cols)``, two lists that keep the order of
    ``obj_cols``.
    """
    ordinal_cols = []
    one_hot_cols = []
    for col in obj_cols:
        # Fixed: the original read ``X_train.col`` (an attribute literally
        # named "col") instead of the column referenced by the loop variable.
        col_card = X_train[col].nunique()
        if col_card >= card:
            ordinal_cols.append(col)
        else:
            one_hot_cols.append(col)
    return ordinal_cols, one_hot_cols

In [None]:
# Pour les colonnes à encoder en One-Hot, retourne X_train et X_valid après encodage One-Hot

In [None]:
def col_one_hot_encode(low_cardinality_cols):
    """One-hot encode the given columns of the notebook-level train/valid frames.

    Reads the notebook-level ``X_train`` and ``X_valid`` DataFrames (neither
    is modified). The encoder is fitted on ``X_train`` only and then applied
    to ``X_valid``. The listed columns are replaced by their one-hot columns,
    appended after the remaining columns; every other column is kept as is.

    Parameters
    ----------
    low_cardinality_cols
        Iterable of column names to one-hot encode.

    Returns
    -------
    ``(OH_X_train, OH_X_valid)``: new DataFrames that keep the original
    indexes and whose column labels are all strings (the one-hot columns are
    named ``"0"``, ``"1"``, ...).
    """
    cols = list(low_cardinality_cols)

    if not cols:
        # Nothing to encode: return copies with the same string column labels
        # instead of fitting an encoder on zero columns.
        OH_X_train = X_train.copy()
        OH_X_valid = X_valid.copy()
    else:
        # handle_unknown and the other OneHotEncoder parameters can be tuned here.
        # The dense array is taken from the encoder's default sparse result
        # with .toarray(): the ``sparse=False`` keyword used previously is no
        # longer accepted by recent scikit-learn releases.
        OH_encoder = OneHotEncoder(handle_unknown='ignore')
        train_encoded = OH_encoder.fit_transform(X_train[cols]).toarray()
        valid_encoded = OH_encoder.transform(X_valid[cols]).toarray()

        # Re-attach the original indexes so the concat below aligns row by row.
        OH_cols_train = pd.DataFrame(train_encoded, index=X_train.index)
        OH_cols_valid = pd.DataFrame(valid_encoded, index=X_valid.index)

        # Fixed: drop the columns that were just encoded. The original dropped
        # ``object_cols``, a name that is not the function's parameter and is
        # not defined in the visible notebook.
        num_X_train = X_train.drop(cols, axis=1)
        num_X_valid = X_valid.drop(cols, axis=1)

        # Add one-hot encoded columns to the remaining features
        OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
        OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

    # Ensure all column labels have string type
    OH_X_train.columns = OH_X_train.columns.astype(str)
    OH_X_valid.columns = OH_X_valid.columns.astype(str)
    return OH_X_train, OH_X_valid

Même principe pour un encodage ordinal 

In [None]:
def col_ord_encode(high_cardinality_cols):
    """Ordinal-encode the given columns of the notebook-level train/valid frames.

    Reads the notebook-level ``X_train`` and ``X_valid`` DataFrames and works
    on copies, so neither is modified. The encoder is fitted on ``X_train``
    only and then applied to ``X_valid``.

    Parameters
    ----------
    high_cardinality_cols
        Iterable of column names to encode.

    Returns
    -------
    ``(ord_X_train, ord_X_valid)``: copies of the two frames in which the
    listed columns hold the encoder output.
    """
    cols = list(high_cardinality_cols)

    # Fixed: the output frames were never created in the original, so the
    # first assignment raised NameError.
    ord_X_train = X_train.copy()
    ord_X_valid = X_valid.copy()

    if cols:
        ordinal_encoder = OrdinalEncoder()
        ord_X_train[cols] = ordinal_encoder.fit_transform(X_train[cols])
        # Fixed: the validation columns must come from X_valid; the original
        # transformed X_train a second time.
        ord_X_valid[cols] = ordinal_encoder.transform(X_valid[cols])
    return ord_X_train, ord_X_valid

Si des catégories sont présentes dans le jeu de validation mais pas dans celui d'entraînement, <br>
leur encodage peut poser problème, car l'encodeur ne les a jamais vues. <br>
La fonction suivante supprime les colonnes concernées, si c'est la stratégie adoptée pour gérer ce cas.


In [None]:
def drop_categ_cols_withpb(obj_cols):
    """Drop the columns whose validation values are not all seen in training.

    Reads the notebook-level ``X_train`` and ``X_valid`` DataFrames (neither
    is modified). A column of ``obj_cols`` is considered problematic when the
    set of its values in ``X_valid`` is not a subset of the set of its values
    in ``X_train``; such columns are removed from both frames.

    Parameters
    ----------
    obj_cols
        Iterable of column names to check.

    Returns
    -------
    ``(label_X_train, label_X_valid)``: new DataFrames without the
    problematic columns.
    """
    # Fixed: iterate over the ``obj_cols`` parameter. The original read
    # ``object_cols``, a name that is neither the parameter nor defined in
    # the visible notebook.
    # dict.fromkeys removes duplicate names while keeping their order.
    bad_label_cols = [
        col for col in dict.fromkeys(obj_cols)
        if not set(X_valid[col]).issubset(set(X_train[col]))
    ]

    label_X_train = X_train.drop(bad_label_cols, axis=1)
    label_X_valid = X_valid.drop(bad_label_cols, axis=1)
    return label_X_train, label_X_valid