In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV  
from sklearn.impute import SimpleImputer  
from sklearn.preprocessing import OneHotEncoder  
from sklearn.compose import ColumnTransformer  
from sklearn.pipeline import Pipeline  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, classification_report  

In [3]:
# Location of the raw training data (CSV fetched over HTTPS).
TRAIN_CSV_URL = "https://minio.lab.sspcloud.fr/jbrablx/ai_insurance/raw/train.csv"

# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(TRAIN_CSV_URL)

In [4]:
# Quick structural check: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [5]:
# Remove the row identifier; it is not listed among the model's features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'id' is already gone no longer raises
# KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# Columns fed to the model as numeric values.
num_features = [
    'Age',
    'Annual_Premium',
    'Vintage',
]

# Columns treated as categories (one-hot encoded downstream). Several of them
# are stored as numbers in the frame but are handled as discrete codes here.
cat_features = [
    'Gender',
    'Vehicle_Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]

In [7]:
# Numeric branch: fill missing values with the column median.
num_transformer = SimpleImputer(strategy='median')

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [8]:
# Separate the target column from the predictors.
y = df['Response']
X = df.drop(columns='Response')

In [9]:
# Hold out 20% of the rows for validation, with a fixed seed for
# reproducibility. Stratifying on the target keeps the class proportions of
# `Response` the same in the training and validation sets, so the validation
# metrics are not skewed by an unlucky split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Baseline estimator: a 100-tree random forest with a fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Chain preprocessing and the classifier so both are fitted together.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [11]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out validation rows.
y_pred = pipeline.predict(X=X_val)

In [None]:
# Report validation accuracy and the per-class precision/recall/F1 table.
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print(f"Accuracy Score: {val_accuracy:.2f}")
print("Classification Report:", val_report, sep="\n")

In [None]:
# Hyper-parameter candidates for the pipeline's 'model' step
# (the `model__` prefix targets that step's parameters).
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[None, 5, 10, 20],
)

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split. Every candidate is fitted once per fold, so the search is
# expensive; n_jobs=-1 runs those fits in parallel on all available cores.
# The forest's random_state is fixed, so parallelism should not change the
# scores or the selected parameters.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# pipeline's default score — confirm that is the metric you want to optimise.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

In [None]:
# Summarise the winning configuration and its mean cross-validated score.
print(
    "\nBest Parameters after Grid Search:",
    grid_search.best_params_,
    "Best Model Score:",
    grid_search.best_score_,
    sep="\n",
)

In [None]:
# Full cross-validation table, best-ranked candidate first.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values(by='rank_test_score')