conda install -c anaconda py-xgboost

In [1]:
# Set seed
seed = 8

# Data manipulation
import numpy as np
import pandas as pd
from seaborn import load_dataset

# Machine learning pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


# Import des données
exclude = ['pclass', 'embarked', 'who', 'adult_male', 'alive', 'alone']
df = load_dataset('titanic').drop(columns=exclude)

# Coup d'oeil rapide
print(f"{df.shape[0]} lignes, {df.shape[1]} colonnes")
df.head()

891 lignes, 9 colonnes


Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,deck,embark_town
0,0,male,22.0,1,0,7.25,Third,,Southampton
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg
2,1,female,26.0,0,0,7.925,Third,,Southampton
3,1,female,35.0,1,0,53.1,First,C,Southampton
4,0,male,35.0,0,0,8.05,Third,,Southampton


### Etape 1 : Découpage des données : 
- Découpage en jeux d'entrainnement et de test. 
- Découpage en caractéristiques numériques et catégoriques

In [2]:
# Définition de la cible
cible = 'survived'
caract = df.drop(columns=cible).columns

# Train-Test
X_train, X_test, y_train, y_test = train_test_split(df[caract], df[cible], 
                                                    test_size=.2, random_state=seed, 
                                                    stratify=df[cible])

# Analyse rapide des données
print(f"Jeu d'entrainnement ({X_train.shape[0]} lignes): Répartition des valeurs de la cible")
print(y_train.value_counts(normalize=True))
print(f"\nJeu de test ({X_test.shape[0]} lignes): Répartition des valeurs de la cible")
print(y_train.value_counts(normalize=True))

# Groupes de caractéristiques
num = X_train.select_dtypes(['number']).columns
print(f'\nNumeriques: {num}')
cat = X_train.columns.difference(num)
X_train[cat] = X_train[cat].astype('object')
print(f'Catégoriques: {cat}')

Jeu d'entrainnement (712 lignes): Répartition des valeurs de la cible
0    0.616573
1    0.383427
Name: survived, dtype: float64

Jeu de test (179 lignes): Répartition des valeurs de la cible
0    0.616573
1    0.383427
Name: survived, dtype: float64

Numeriques: Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')
Catégoriques: Index(['class', 'deck', 'embark_town', 'sex'], dtype='object')


### Etape 2 : Prétraitement : 
Création de classes et de fonctions de prétraitement : 
- Traite_Nan: remplace les valeurs manquantes par une valeur constante et retourne les données modifiées dans un DataFrame. (développée ici pour les variables catégoriques, pour les numériques nous utiliserons plutôt la classe simpleImputer)
- Reduit_Card: agrège les catégories peu fréquentes dans la catégorie «autre» et retourne les données transformées dans un DataFrame

In [3]:
class Traite_Nan(BaseEstimator, TransformerMixin):
    """Une classe permetant de remplacer les Nan par une valeur constante.
    
    Parametres
    ----------
    val : (optionnel) La valeur à utiliser en remplacement
    """
    def __init__(self, val="manquante"):
        self.val = val

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.fillna(self.val, inplace=True)
        return X

In [4]:
class Reduit_Card(BaseEstimator, TransformerMixin):
    """Une classe qui regroupe les catégories les moins fréquentes dans une catégorie "autre".
    
    Paramétres
    ----------
    seuil: (optionnel) Un réel définissant le seuil de détérmination d'une
    catégorie peu fréquente.  
    """
    def __init__(self, seuil=.01):
        self.seuil = seuil

    def fit(self, X, y=None):
        self.top_categories = {}
        for feature in X.columns:
            frequences = pd.Series(X[feature].value_counts(normalize=True))
            top_categories = frequences[frequences>self.seuil].index
            self.top_categories[feature] = list(top_categories)
        return self

    def transform(self, X):
        for feature in X.columns:
            X[feature] = np.where(X[feature].isin(self.top_categories[feature]), 
                                  X[feature], 'autre')
        return X

### Etape 3 : Transformation des données
Prétraitement les caractéristiques numériques et catégorielles en parallèle en utilisant ColumnTransformer et Pipeline.

- diviser les données en deux groupes: catégoriques et numériques
- appliquer différents transformateurs à chaque groupe
- Assembler coller les résultats


In [5]:
# Pipeline de prétraitement
cat_pipe = Pipeline([('Traite_Nan', Traite_Nan()),
                             ('Reduit_Card', Reduit_Card()),
                             ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

num_pipe = Pipeline([('Traite_Nan', SimpleImputer()),
                           ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(transformers=[('categ', cat_pipe, cat),
                                               ('numerique', num_pipe, num)])
# Fit et transform sur le jeu d'entrainement
preprocessor.fit(X_train)
categ = preprocessor.named_transformers_['categ']['encoder'].get_feature_names(cat)
columns = np.append(categ, num)
X_train_transfo = pd.DataFrame(preprocessor.transform(X_train), columns=columns)
X_train_transfo.head()

Unnamed: 0,class_First,class_Second,class_Third,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_autre,...,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_autre,sex_female,sex_male,age,sibsp,parch,fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.031425
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.013565
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.258608,0.0,0.0,0.016461
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.363052,0.0,0.0,0.015835
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.22091,0.125,0.0,0.034743


In [6]:
X_train[num]

Unnamed: 0,age,sibsp,parch,fare
624,21.00,0,0,16.1000
825,,0,0,6.9500
652,21.00,0,0,8.4333
444,,0,0,8.1125
49,18.00,1,0,17.8000
...,...,...,...,...
11,58.00,0,0,26.5500
644,0.75,2,1,19.2583
506,33.00,0,2,26.0000
326,61.00,0,0,6.2375


In [7]:
X_train_transfo[num]

Unnamed: 0,age,sibsp,parch,fare
0,0.258608,0.000,0.000000,0.031425
1,0.363052,0.000,0.000000,0.013565
2,0.258608,0.000,0.000000,0.016461
3,0.363052,0.000,0.000000,0.015835
4,0.220910,0.125,0.000000,0.034743
...,...,...,...,...
707,0.723549,0.000,0.000000,0.051822
708,0.004147,0.250,0.166667,0.037590
709,0.409399,0.000,0.333333,0.050749
710,0.761247,0.000,0.000000,0.012175
