In [48]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score



In [49]:

X_train = pd.read_pickle('data/X_train.pkl')
y_train = pd.read_pickle('data/y_train.pkl')


X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
categorical_cols = X_train.select_dtypes(include='object').columns
numeric_cols = X_train.select_dtypes(include='number').columns

preprocessor = ColumnTransformer([
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
    ('num', 'passthrough', numeric_cols)
])

model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier())
])

param_grid_rapide = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__max_features': ['sqrt', 'log2'],
}

param_grid_lent = {
    'classifier__n_estimators': [100, 200, 300, 500],        # + d'arbres = + de stabilité
    'classifier__max_depth': [10, 20, 30, None],             # autorise des arbres plus complexes
    'classifier__min_samples_split': [2, 5, 10],             # plus de granularité sur les divisions
    'classifier__min_samples_leaf': [1, 2, 4],               # évite les feuilles trop petites
    'classifier__max_features': ['sqrt', 'log2', None],      # explore plus d'options
    'classifier__bootstrap': [True, False],                  # True = bagging classique
    'classifier__class_weight': ['balanced', None],          # utile si tes classes sont déséquilibrées
}

f1_scorer = make_scorer(f1_score, pos_label='Yes')


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid_rapide,
    cv=5,  # 3
    scoring=f1_scorer,  # 'precision', 'recall', 'roc_auc'
    n_jobs=-1,  
    verbose=2
)


grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  23.4s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  24.0s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  23.9s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  24.1s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  24.3s
[CV] END classifier__max_dep

In [55]:
best_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label='Yes')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred)

Best parameters found:  {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best cross-validation score:  0.4714834152334152
Accuracy: 0.9990
F1 Score: 0.5287


In [59]:
# Extract feature importances from the classifier
feature_importances = best_model.named_steps['classifier'].feature_importances_

# Combine feature names and their importances
feature_names = numeric_cols.tolist() + categorical_cols.tolist()
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the top features
print(importance_df.head(50))

                  Feature  Importance
1                 card_id    0.098269
14          yearly_income    0.088212
2                  amount    0.086604
11               latitude    0.060062
13      per_capita_income    0.050445
12              longitude    0.037651
32           expires year    0.028205
29                 minute    0.026654
22       num_cards_issued    0.026108
23           credit_limit    0.024176
37  acct_open_date minute    0.021565
30                 second    0.020490
27                    day    0.020292
21                    cvv    0.020270
25                   year    0.020243
10            birth_month    0.019776
20            card_number    0.019432
45               has_chip    0.019182
28                   hour    0.018618
9              birth_year    0.018466
24  year_pin_last_changed    0.017964
15             total_debt    0.017850
18                   id_y    0.015647
38               use_chip    0.015615
16           credit_score    0.015591
33    acct_o