In [73]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score



In [67]:

X_train = pd.read_pickle('data/X_train.pkl')
y_train = pd.read_pickle('data/y_train.pkl')

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [71]:
categorical_cols = X_train.select_dtypes(include='object').columns
numeric_cols = X_train.select_dtypes(include='number').columns

preprocessor = ColumnTransformer([
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
    ('num', StandardScaler(), numeric_cols)  # <-- Standardisation ici
])

model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier())
])

param_grid_rapide = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

param_grid_lent = {
    'classifier__n_estimators': [100, 200, 300, 500],        # + d'arbres = + de stabilité
    'classifier__max_depth': [10, 20, 30, None],             # autorise des arbres plus complexes
    'classifier__min_samples_split': [2, 5, 10],             # plus de granularité sur les divisions
    'classifier__min_samples_leaf': [1, 2, 4],               # évite les feuilles trop petites
    'classifier__max_features': ['sqrt', 'log2', None],      # explore plus d'options
    'classifier__bootstrap': [True, False],                  # True = bagging classique
    'classifier__class_weight': ['balanced', None],          # utile si tes classes sont déséquilibrées
}

f1_scorer = make_scorer(f1_score, pos_label='Yes')


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid_rapide,
    cv=5,  # 3
    scoring=f1_scorer,  # 'precision', 'recall', 'roc_auc'
    n_jobs=-1,  
    verbose=2
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

In [69]:
best_param = grid_search.best_params_
model.set_params(**best_param)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label='Yes')
print(f"Best parameters: {best_param}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Best parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Accuracy: 0.9989047619047619
F1 Score: 0.43902439024390244


In [None]:
# Colonnes
categorical_cols = X_train.select_dtypes(include='object').columns
numeric_cols = X_train.select_dtypes(include='number').columns

# Préprocessing
preprocessor = ColumnTransformer([
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Pipeline avec placeholder pour le classifieur
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier())  # placeholder
])

# Grille avec plusieurs modèles et paramètres spécifiques
param_grid = [
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    },
    {
        'classifier': [AdaBoostClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.5, 1.0]
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [5, 10],
        'classifier__weights': ['uniform', 'distance']
    },
    {
        'classifier': [XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 6],
        'classifier__learning_rate': [0.1, 0.3]
    }
]

# Scorer
f1_scorer = make_scorer(f1_score, pos_label='Yes')

# GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("Meilleur modèle :", best_model)
print("Meilleurs paramètres :", grid_search.best_params_)


Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [70]:
# Extract feature importances from the classifier
feature_importances = best_model.named_steps['classifier'].feature_importances_

# Combine feature names and their importances
feature_names = numeric_cols.tolist() + categorical_cols.tolist()
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=True)

# Display the top features
print(importance_df.head(50))

                  Feature  Importance
7                latitude    0.000000
32               use_chip    0.000000
2                     zip    0.001687
6             birth_month    0.002606
3                     mcc    0.003779
24                 second    0.003884
4             current_age    0.005943
27    acct_open_date year    0.008877
5              birth_year    0.009691
35                 gender    0.009887
34                 errors    0.010685
21                    day    0.013369
36             card_brand    0.016000
33         merchant_state    0.016474
28   acct_open_date month    0.016539
14            card_number    0.017448
0                  amount    0.017870
26           expires year    0.019088
12           credit_score    0.020006
31  acct_open_date minute    0.020787
39       card_on_dark_web    0.021129
13       num_credit_cards    0.021352
37              card_type    0.022699
29     acct_open_date day    0.022958
19                   year    0.024079
30    acct_o