In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split  
from sklearn.pipeline import Pipeline  
from sklearn.ensemble import RandomForestClassifier

from src.etl import get_features_by_type
from src.models.training import get_model_configs, train_model_gscv, save_model_pickle
from src.models.evaluation import predict_from_fitted_model, get_metrics
from src.models.preprocessing import create_preprocessor



In [2]:
df = pd.read_csv("https://minio.lab.sspcloud.fr/jbrablx/ai_insurance/raw/train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [4]:
df.drop(['id'], axis=1, inplace=True)

In [5]:
get_features_by_type(df, df.columns)

(['Age',
  'Driving_License',
  'Region_Code',
  'Previously_Insured',
  'Annual_Premium',
  'Policy_Sales_Channel',
  'Vintage',
  'Response'],
 ['Gender', 'Vehicle_Age', 'Vehicle_Damage'])

In [6]:
df["Annual_Premium"].describe()

count    381109.000000
mean      30564.389581
std       17213.155057
min        2630.000000
25%       24405.000000
50%       31669.000000
75%       39400.000000
max      540165.000000
Name: Annual_Premium, dtype: float64

In [7]:
df.duplicated().sum()

269

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
X = df.drop('Response', axis=1)  
y = df['Response']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
num_features = ['Age', 'Annual_Premium', 'Vintage']
cat_features = ['Gender', 'Vehicle_Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel'] 

In [13]:
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30]
}

params_skf = {
    "n_splits": 5, "shuffle": True, "random_state": 42
}

params_gscv = {
    "scoring": ["accuracy", "precision", "recall", "f1", "roc_auc"],
    "refit": "f1",
    "verbose": 3,
    "n_jobs": -1
}

preprocessor = create_preprocessor(num_features, cat_features)
model = RandomForestClassifier(random_state=42)

In [14]:
grid_search_fitted = train_model_gscv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    preprocessor=preprocessor,
    param_grid=param_grid,
    params_skf=params_skf,
    params_gscv=params_gscv,
    resampling_method=None
)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 1/5] END model__max_depth=10, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.854) total time=  20.5s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 4/5] END model__max_depth=10, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.849) total time=  21.0s
[CV 3/5] END model__max_depth=10, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.851) total time=  21.4s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 2/5] END model__max_depth=10, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.852) total time=  23.6s
[CV 5/5] END model__max_depth=10, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.850) total time=  23.4s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 1/5] END model__max_depth=10, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.855) total time=  37.5s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 4/5] END model__max_depth=10, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.850) total time=  38.3s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 3/5] END model__max_depth=10, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.851) total time=  39.8s
[CV 5/5] END model__max_depth=10, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.851) total time=  40.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 2/5] END model__max_depth=10, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.852) total time=  41.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 1/5] END model__max_depth=10, model__n_estimators=300; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.855) total time=  56.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 4/5] END model__max_depth=10, model__n_estimators=300; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.850) total time=  57.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 2/5] END model__max_depth=10, model__n_estimators=300; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.853) total time=  58.9s
[CV 5/5] END model__max_depth=10, model__n_estimators=300; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.850) total time=  59.2s
[CV 3/5] END model__max_depth=10, model__n_estimators=300; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.851) total time=  59.7s
[CV 4/5] END model__max_depth=20, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.001) precision: (test=0.444) recall: (test=0.001) roc_auc: (test=0.854) total time= 1.3min
[CV 3/5] END model__max_depth=20, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.001) precision: (test=0.500) recall: (test=0.000) roc_auc: (test=0.855) total time= 1.3min
[CV 1/5] END model__max_depth=20, model__n_estimators=100; accuracy: (test=

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV 2/5] END model__max_depth=20, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.001) precision: (test=0.333) recall: (test=0.000) roc_auc: (test=0.855) total time= 2.5min
[CV 3/5] END model__max_depth=20, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.000) recall: (test=0.000) roc_auc: (test=0.855) total time= 2.5min
[CV 1/5] END model__max_depth=20, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.333) recall: (test=0.000) roc_auc: (test=0.858) total time= 2.6min
[CV 5/5] END model__max_depth=20, model__n_estimators=200; accuracy: (test=0.877) f1: (test=0.000) precision: (test=0.111) recall: (test=0.000) roc_auc: (test=0.855) total time= 2.6min
[CV 2/5] END model__max_depth=30, model__n_estimators=100; accuracy: (test=0.877) f1: (test=0.044) precision: (test=0.482) recall: (test=0.023) roc_auc: (test=0.852) total time= 2.6min
[CV 3/5] END model__max_depth=30, model__n_estimators=100; accuracy: (test=

In [19]:
best_model = grid_search_fitted.best_estimator_
y_proba, y_pred = predict_from_fitted_model(best_model, X_test, 0.5)

In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

get_metrics(y_test, y_pred, y_proba, 0.50)

(           Metric Value
 0  Accuracy Score  0.88
 1   ROC AUC Score  0.85
 2    Best cut-off  0.50,
 '              precision    recall  f1-score   support\n\n           0       0.88      1.00      0.93     66917\n           1       0.46      0.02      0.04      9251\n\n    accuracy                           0.88     76168\n   macro avg       0.67      0.51      0.49     76168\nweighted avg       0.83      0.88      0.83     76168\n')

In [23]:
save_model_pickle(model, None, best_model)

FileNotFoundError: [Errno 2] No such file or directory: 'pickle/randomforestclassifier.pkl'

In [6]:
import pandas as pd
import pickle

with open('https://minio.lab.sspcloud.fr/jbrablx/ai_insurance/outputs/Logistic_Regression/resampling_none.pkl', 'r') as f:
    m = pickle.load(f)
m

FileNotFoundError: [Errno 2] No such file or directory: 'https://minio.lab.sspcloud.fr/jbrablx/ai_insurance/outputs/Logistic_Regression/resampling_none.pkl'

In [1]:
import pickle
import src.models.evaluation as eval
from src.app.utils import display_classif_metrics

m = "Logistic Regression"
method = "over"
with open(f'https://minio.lab.sspcloud.fr/s3/jbrablx/ai_insurance/outputs/Logistic_Regression/resampling_none.pkl', 'rb') as f:
        model = pickle.load(f)

y_proba, y_pred = eval.predict_from_fitted_model(model, X_test)
metrics_df, classif_report = eval.get_metrics(y_test, y_pred, y_proba)

FileNotFoundError: [Errno 2] No such file or directory: 'https://minio.lab.sspcloud.fr/s3/jbrablx/ai_insurance/outputs/Logistic_Regression/resampling_none.pkl'

In [2]:
import os

os.listdir('https://minio.lab.sspcloud.fr/s3/jbrablx/ai_insurance/')

FileNotFoundError: [Errno 2] No such file or directory: 'https://minio.lab.sspcloud.fr/s3/jbrablx/ai_insurance/'

In [3]:
metrics_df

NameError: name 'metrics_df' is not defined

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
if show_roc:
    st.pyplot()
, show_roc=False, show_features_importance=False, show_tree=False

st.table(metrics_df)
st.text("Classification Report:")
st.markdown(classification_report_str)

if hasattr(model_used, 'named_steps'):
        # Assurez-vous d'accéder au modèle spécifique pour obtenir les importances des caractéristiques
        specific_model = model_used.named_steps['model']
if show_features_importance and hasattr(specific_model, 'feature_importances_'):
if show_tree and hasattr(specific_model, 'estimators_'):

train_model(model, X_train, y_train, preprocessor, method=None)

model = get_model(model_name)
specific_model = model.named_steps['model']

y_proba, y_pred = predict_from_fitted_model(model, X_test, threshold)

metrics_df, classif_report = get_metrics(model, y_test, y_pred, y_proba, threshold)
top_importances, top_feature_names = get_feature_importance_names(specific_model, model)

fig_roc = get_roc_auc(y_test, y_proba)
fig_importance = plot_feature_importance(top_importances, top_feature_names)
fig_tree = plot_model_tree(specific_model, model)

In [None]:
pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')