# Packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import missingno as msno # for null Values


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings("ignore")


## Import dataset

In [None]:
# Load the raw claims file (semicolon-separated) and preview its first rows.
raw_csv_path = "/home/onyxia/work/PROJET_STATAPP/Data/Cleans/fraud_oracle.csv"
data = pd.read_csv(raw_csv_path, sep=";")
data.head()

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of the raw dataset.
data.info()

We have 15420 entries and 33 variables: 9 quantitative and 24 categorical.

Data explanation : 

- Month: The month in which the accident actually occurred. 

- WeekOfMonth: The week of the month in which the accident actually occurred.

- DayOfWeek: The day of the week on which the accident actually occurred.

- Make: The manufacturer of the vehicle involved in the claim.

- AccidentArea: The area where the accident occurred (urban/rural).

- DayOfWeekClaimed: The day of the week on which the insurance claim was processed.

- MonthClaimed: The month in which the insurance claim was processed.

- WeekOfMonthClaimed: The week of the month in which the insurance claim was processed.

- Sex: The gender of the policyholder.

- MaritalStatus: The marital status of the policyholder.

- Age: The age of the policyholder.

- Fault: Indicates whether the policyholder was at fault in the accident.

- PolicyType: The type of insurance policy.

- VehicleCategory: The category of the vehicle (e.g., sedan, SUV).

- VehiclePrice: The price of vehicle.

- FraudFound_P: Indicates whether fraud was detected in the insurance claim (our target variable)

- PolicyNumber: The unique identifier for the insurance policy.

- RepNumber: The unique identifier for the insurance representative handling the claim.

- Deductible: The amount that the policy holder must pay out of pocket before the insurance company pays the remaining costs.

- DriverRating: The rating of the driver, often based on driving history or other factors.

- Days_Policy_Accident: The number of days since the policy was issued until the accident occurred.

- Days_Policy_Claim: The number of days since the policy was issued until the claim was made.

- PastNumberOfClaims: The number of claims previously made by the policyholder. 

- AgeOfVehicle: The age of the vehicle involved in the claim.

- AgeOfPolicyHolder: The age of the policyholder.

- PoliceReportFiled: Indicates whether a police report was filed for the accident.

- WitnessPresent: Indicates whether a witness was present at the scene of the accident.

- AgentType: The type of insurance agent handling the policy (e.g., internal, external)

- NumberOfSuppliments: The number of supplementary documents or claims related to the main claim, categorized into ranges.

- AddressChange_Claim: Indicates whether the address of the policyholder was changed at the time of the claim, categorized into ranges.

- NumberOfCars: The number of cars insured under the policy, categorized into ranges.

- Year: The year in which the claim was made or processed.

- BasePolicy: The base policy type (e.g., Liability, Collision, All Perils).

In [None]:
# Draw missingno's matrix plot to inspect null values across all columns.
msno.matrix(data)

We don't have missing values.

In [None]:
# Count the rows that are exact duplicates of an earlier row (0 means no duplicates).
data.duplicated().sum()

No duplicated value

## Target variable

In [None]:
# Class balance of the target: count each FraudFound_P value and
# show the shares as a pie chart.
target_counts = data['FraudFound_P'].value_counts()

fig_target, ax_target = plt.subplots(figsize=(6, 6))
ax_target.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'salmon'],
)
ax_target.set_title('Distribution of FraudFound_P')
plt.show()

Our target variable "FraudFound_P" is highly imbalanced. 
Oversampling addresses this issue by balancing the dataset, which helps the model learn from both classes more effectively. This improves the model's ability to correctly predict the minority class, leading to better overall performance and more reliable evaluation metrics.

## Qualitative variables

In [None]:
# Isolate the categorical (object / category dtype) columns.
cat = data.select_dtypes(include=['object', 'category'])
cat_cols = cat.columns

# For every categorical column, report how many modalities it has and the
# share (in %) of each one; missing values are counted as their own modality.
for col in cat_cols:
    shares = data[col].value_counts(normalize=True, dropna=False) * 100
    print(f"\n==================== {col} ====================")
    print(f"Number of unique modalities: {data[col].nunique()}")
    print("\nPercentages :")
    print(shares.round(2))


- The features "Sex", "PoliceReportFiled", and "WitnessPresent" are actually Boolean. They should be converted to 0 or 1.
- The features "AccidentArea", "Fault", and "AgentType" each have only two unique values. They can also be converted to 0 or 1.
- PolicyType appears to be a concatenation of VehicleCategory and BasePolicy (so we'll drop those two).
- Anomalies: DayOfWeekClaimed has 8 modalities and MonthClaimed has 13 (both contain a '0' value); Days_Policy_Accident contains a 'none' modality.

In [None]:
# Show the rows whose DayOfWeekClaimed holds the anomalous value '0'.
data[data['DayOfWeekClaimed'] == '0']

We'll drop this row because it has several anomalies (Age = 0, MonthClaimed = 0 but a WeekOfMonthClaimed that is not null, ...).

In [None]:
# Drop the record(s) whose DayOfWeekClaimed is '0'. The explicit copy makes `data`
# an independent DataFrame, so the columns added to it later in the notebook
# (Age_Binned, Month_num, delay_weeks, ...) are written to it directly instead of
# to a slice of the original frame (no chained-assignment ambiguity).
data = data[data['DayOfWeekClaimed'] != '0'].copy()

Now we look at the rows where Days_Policy_Accident = 'none'. 

In [None]:
# Rows whose Days_Policy_Accident modality is the string 'none'; preview the first ones.
none_days=data[data['Days_Policy_Accident'] == 'none']
none_days.head()

'none' may mean that the accident occurred on the same day the policy was issued, so we'll leave it as is. 

### Relation with the target variable

In [None]:
# For each categorical variable, plot the within-category share of
# non-fraud (0) and fraud (1) claims as a 100 % stacked bar chart.
fraud_palette = {0:'skyblue', 1:'salmon'}

for cat in cat_cols:
    # Row-normalised contingency table, reshaped to long format for plotly
    prop_df = (
        pd.crosstab(data[cat], data['FraudFound_P'], normalize='index')
        .reset_index()
        .melt(id_vars=cat, value_vars=[0,1], var_name='FraudFound_P', value_name='Proportion')
    )

    fig = px.bar(
        prop_df,
        x=cat,
        y='Proportion',
        color='FraudFound_P',
        text='Proportion',
        barmode='stack',
        color_discrete_map=fraud_palette,
        labels={'FraudFound_P':'Fraud', 'Proportion':'Proportion'},
        title=f'Proportion of Fraud vs Non-Fraud by {cat}',
    )

    # Percent labels inside the bars and on the y axis
    fig.update_traces(texttemplate='%{text:.1%}', textposition='inside')
    fig.update_layout(yaxis_tickformat='.0%')
    fig.show()


- The date variables (Month, WeekOfMonth, DayOfWeek, DayOfWeekClaimed, WeekOfMonthClaimed, MonthClaimed) do not seem to influence fraud much.
- Mercedes owners are the most likely to be involved in fraud, with almost double the incidence of the second-highest group, Accura owners. On the other hand, Porche, Lexus, Jaguar and Ferrari owners have never been reported for fraud; all four makes are "high-end".
- The results suggest that males are significantly more likely than females to be involved in detected fraud cases. 
- Fraud is generally detected when the policyholder is at fault.
- Fraudulent cases were detected most frequently under the "All Perils" base policy. Among the specific policy types, "Sport - Collision" had the highest fraud detection rate.
- Fraud is most frequently detected among teenagers and retired seniors. Teenagers have little financial power because they have not yet started working, and retired seniors have reduced financial power after retirement.
- Fraudsters tend to have changed their address.

In [None]:
# Chi-square test of independence between each categorical variable and the target

from scipy.stats import chi2_contingency

separator = "-"*40

for var in cat_cols:
    # Observed counts for every (modality, fraud flag) pair
    contingency_table = pd.crosstab(data[var], data['FraudFound_P'])

    # Only the statistic and the p-value are reported below
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    print(f"Variable: {var}")
    print(f"Chi2 Statistic: {chi2:.2f}, p-value: {p:.4f}")
    print(separator)


The chi-square test shows that NumberOfCars, WitnessPresent, Days_Policy_Claim, MaritalStatus, DayOfWeekClaimed and DayOfWeek are not significant at the 5% level. 

## Quantitative variables

In [None]:
# Numeric (int64 / float64) part of the dataset and the names of its columns.
numeric_dtypes = ['int64', 'float64']
quanti = data.select_dtypes(include=numeric_dtypes)
quanti_cols = quanti.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
quanti.describe()

The minimum value of "Age" is 0, which does not make sense.  
"PolicyNumber" and "RepNumber" are merely identification numbers. Let's drop them.

Strictly speaking, among all these features, only Age is a truly numeric variable. The rest can be interpreted as categorical variables.

### Age

In [None]:
# One figure, two panels: histogram of Age on the left, boxplot on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Histogram of Age", "Boxplot of Age"))

# Each chart is built with plotly express, then its single trace is moved
# into the matching panel of the shared figure.
hist = px.histogram(data, x="Age", nbins=50)
box = px.box(data, y="Age")
for panel_col, chart in enumerate((hist, box), start=1):
    fig.add_trace(chart.data[0], row=1, col=panel_col)

fig.update_layout(showlegend=False, height=400, width=900)
fig.show()


We have some outliers, but they are plausible and they represent only a small share of the data. 
Age and AgeOfPolicyHolder actually seem to carry the same information. Let's check it.

In [None]:

# Bin edges and labels used to discretise Age so that it can be compared with
# the (already categorical) AgeOfPolicyHolder variable.
bins = [0, 17, 20, 25, 30, 35, 40, 50, 65, float('inf')]
labels = ['0-17','18-20','21-25', '26-30', '31-35', '36-40', '41-50', '51-65', '65+']

# Right-closed intervals make each bin match its label exactly:
# [0, 17] -> '0-17', (17, 20] -> '18-20', ..., (50, 65] -> '51-65', (65, inf) -> '65+'.
# With right=False every upper bound fell into the next bin (e.g. age 17 was
# labelled '18-20' and age 20 was labelled '21-25').
# include_lowest=True keeps Age == 0 inside the first bin.
data['Age_Binned'] = pd.cut(
    data['Age'], bins=bins, labels=labels, right=True, include_lowest=True
)

# Share of each bracket of the discretised Age, in category order,
# as a two-column frame (Category, Proportion).
dist_age = (
    data['Age_Binned']
    .value_counts(normalize=True)
    .sort_index()
    .reset_index()
    .set_axis(['Category', 'Proportion'], axis=1)
)

# Same table for AgeOfPolicyHolder, which is already categorical.
dist_policy = (
    data['AgeOfPolicyHolder']
    .value_counts(normalize=True)
    .sort_index()
    .reset_index()
    .set_axis(['Category', 'Proportion'], axis=1)
)

# Two bar charts side by side, sharing the y axis so the proportions are comparable.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Distribution of Age (Binned)", "Distribution of AgeOfPolicyHolder"),
    shared_yaxes=True
)

# (distribution table, bar colour, trace name) for the left and right panels
bar_panels = [
    (dist_age, 'skyblue', 'Age'),
    (dist_policy, 'salmon', 'AgeOfPolicyHolder'),
]
for panel_col, (dist, bar_colour, trace_name) in enumerate(bar_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=dist['Category'],
            y=dist['Proportion'],
            text=[f"{p:.1%}" for p in dist['Proportion']],  # percent label on each bar
            textposition='auto',
            marker_color=bar_colour,
            name=trace_name
        ),
        row=1, col=panel_col
    )

# Overall layout
fig.update_layout(
    title_text="Side-by-Side Comparison: Age vs AgeOfPolicyHolder",
    showlegend=False,
    yaxis_tickformat='.0%',
    height=500,
    width=1000
)

# Axis titles
fig.update_xaxes(title_text="Age Category", row=1, col=1)
fig.update_xaxes(title_text="Policy Holder Age Category", row=1, col=2)
fig.update_yaxes(title_text="Proportion", row=1, col=1)

fig.show()


In [None]:
from scipy.stats import chi2_contingency

# Keep only the rows where both age representations are present
subset = data[['Age_Binned', 'AgeOfPolicyHolder']].dropna()

# Cross-tabulate the two variables and test their independence
contingency = pd.crosstab(subset['Age_Binned'], subset['AgeOfPolicyHolder'])
chi2, p, dof, expected = chi2_contingency(contingency)

# Report the test results
for caption, value in (
    ("Chi2 Statistic:", round(chi2, 2)),
    ("Degrees of Freedom:", dof),
    ("p-value:", p),
):
    print(caption, value)

# Show the underlying contingency table as well
print("\nContingency Table:")
display(contingency)


There is a strong dependency between the two variables. We'll use AgeOfPolicyHolder to impute Age.

In [None]:
# Rows where Age is recorded as 0 (the notebook author observed 320 of them)
zero_age_rows = data['Age'] == 0
print('There are ', len(data[zero_age_rows]), 'records where the age is 0')

# AgeOfPolicyHolder modalities found on those rows
# (the author observed a single bracket, 16 to 17 years old)
zero_age_brackets = data.loc[zero_age_rows, 'AgeOfPolicyHolder'].unique()
print('For all this rows policy holder age is: ', zero_age_brackets)

We will replace every 0 value with 16 or 17, chosen at random.

In [None]:
# Replace every Age == 0 with 16 or 17 drawn at random.
# A dedicated, seeded generator makes the imputation (and therefore the cleaned
# dataset saved later in the notebook) reproducible from one run to the next;
# 42 is the seed used for every other random step in this notebook.
age_rng = np.random.default_rng(42)
zero_age_mask = data['Age'].eq(0)
random_ages = age_rng.choice([16, 17], size=int(zero_age_mask.sum()))
data.loc[zero_age_mask, 'Age'] = random_ages

print('Now there are ', len(data[data['Age']==0]), 'records where the age is 0')

### Variable delay

We create a quantitative variable that measures the time elapsed (in weeks, approximately) between the accident and the claim.

In [None]:
months_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Month abbreviations -> month numbers
data['Month_num'] = data['Month'].map(months_map)
data['MonthClaimed_num'] = data['MonthClaimed'].map(months_map)

# A claim cannot precede its accident, so the claim belongs to the following
# year when its month is strictly earlier than the accident month, or when it
# is the same month but an earlier week. Claims filed in the same month, in the
# same or a later week, must NOT be shifted: the former `<=` test added
# 12 months (60 "weeks") to every same-month claim.
claimed_next_year = (
    (data['MonthClaimed_num'] < data['Month_num'])
    | (
        (data['MonthClaimed_num'] == data['Month_num'])
        & (data['WeekOfMonthClaimed'] < data['WeekOfMonth'])
    )
)
data.loc[claimed_next_year, 'MonthClaimed_num'] += 12

# Approximate delay in weeks: 5 week-of-month slots per month difference,
# plus the difference between the two week-of-month values.
data['delay_weeks'] = (
    (data['MonthClaimed_num'] - data['Month_num']) * 5 +
    (data['WeekOfMonthClaimed'] - data['WeekOfMonth'])
)
data

In [None]:
# Quantitative variables to compare between fraud and non-fraud claims
vars_quanti = ["delay_weeks", "Age"]

# One boxplot per variable, split and coloured by the target
for col in vars_quanti:
    fig_box = px.box(
        data,
        x="FraudFound_P",
        y=col,
        color="FraudFound_P",
        title=f"Boxplot de {col} selon la fraude",
    )
    fig_box.show()

Correlations are low

### Suppression des variables inutiles

In [None]:
# Columns removed before saving the cleaned dataset: raw date parts (replaced by
# delay_weeks), identifiers, variables judged redundant earlier in the notebook,
# and the intermediate helper columns created during the analysis.
# 'Age_Binned' is one of those helpers: it was only built to compare Age with
# AgeOfPolicyHolder (and was computed before Age was imputed), so it must not
# leak into the saved dataset.
cols_to_drop = [
    'Month', 'WeekOfMonth', 'DayOfWeek', 'DayOfWeekClaimed', 
    'MonthClaimed', 'WeekOfMonthClaimed', 'PolicyNumber', 
    'RepNumber', 'Days_Policy_Claim', 'AgeOfPolicyHolder', 
    'BasePolicy', 'Year', "VehicleCategory", 'Month_num', 'MonthClaimed_num',
    'Age_Binned',
]

data = data.drop(columns=cols_to_drop)

In [None]:
# List the columns that remain after the drop.
print(data.columns)

In [None]:
# `df` is a second name for the same DataFrame object as `data` (no copy is made).
df = data

In [None]:
import os

# Write the cleaned dataset, without the index column, into the Cleans folder.
folder_path = "/home/onyxia/work/PROJET_STATAPP/Data/Cleans"
file_name = "data_vehicle_cleaned.csv"
file_path = os.path.join(folder_path, file_name)
df.to_csv(path_or_buf=file_path, index=False)

In [None]:
# Reload the cleaned dataset from disk (comma-separated); `df` is rebound to this new DataFrame.
df=pd.read_csv("/home/onyxia/work/PROJET_STATAPP/Data/Cleans/data_vehicle_cleaned.csv", sep=",")

# Pipeline de feature engineering

## Feature engineering

Jusqu'ici, nous avons créé la variable delay_weeks, supprimé les variables qui ont nécessité sa création et d'autres variables non pertinentes. Nous avons par ailleurs, imputé les données manquantes sur Age. 

Nous procédons à présent à l'encodage des variables qualitatives et au scaling de Age et delay_weeks. 

Puis nous verrons le feature importance de chaque variable.

In [None]:
# Print the modalities of every feature of the reloaded dataset `df`, skipping
# the two continuous variables and the target.
# The values are read from `df` itself (the frame being iterated), not from the
# pre-save `data` frame, so the output always describes the dataset used below.
exclude_cols = ['Age', 'delay_weeks', 'FraudFound_P']
for col in df.columns:
    if col not in exclude_cols:
        print(f"\n--- {col} ---")
        print(df[col].unique())  # all modalities of the column


In [None]:
# Separate the target from the features, then hold out 20 % of the rows as a
# test set, stratified on the target, with a fixed seed.
target_col = 'FraudFound_P'
y = df[target_col]
X = df.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Pipeline

# Encodage des variables Make , PolicyType et MaritalStatus : df

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import preprocessing

# Feature-hashing vectorizers: Make -> 8 binary columns, PolicyType -> 4 binary columns.
vectorizer_Make = HashingVectorizer(n_features=8, norm=None, alternate_sign=False, ngram_range=(1,1), binary=True)
vectorizer_PolicyType = HashingVectorizer(n_features=4, norm=None, alternate_sign=False, ngram_range=(1,1), binary=True)

# Not used below (MaritalStatus is one-hot encoded instead); kept so that the
# name stays defined for any other cell that may reference it.
vectorizer_MaritalStatus = HashingVectorizer(n_features=4, norm=None, alternate_sign=False, ngram_range=(1,1), binary=True)

# Hash the two columns over the whole dataset
X_Make = vectorizer_Make.fit_transform(df["Make"])
X_PolicyType = vectorizer_PolicyType.fit_transform(df['PolicyType'])

# Hashed columns as DataFrames indexed like `df`, next to the raw column
df_Make = pd.DataFrame(X_Make.toarray(), index=df.index).assign(Make = df["Make"])
df_Make.columns=['make_0','make_1','make_2','make_3','make_4', 'make_5', 'make_6', 'make_7', 'Make']
df_PolicyType = pd.DataFrame(X_PolicyType.toarray(), index=df.index).assign(PolicyType = df['PolicyType'])
df_PolicyType.columns=['PolicyType_0', 'PolicyType_1', 'PolicyType_2', 'PolicyType_3', 'PolicyType']
df_make_PolicyType = pd.concat([df_Make, df_PolicyType], axis=1)

# One-hot encode MaritalStatus on the SAME rows as `df`.
# The encoding used to be computed from X_train, a shuffled 80 % subset: its
# rows were then glued positionally onto unrelated rows of `df`, and the inner
# join below silently discarded the last 20 % of the dataset.
lb = preprocessing.LabelBinarizer()
lb.fit(df["MaritalStatus"])

MaritalStatus_one_hot_sklearn_binar = pd.DataFrame(
    lb.transform(df["MaritalStatus"]), columns=lb.classes_, index=df.index
)

# Replace the raw categorical columns by their encoded versions
df.drop(['Make','MaritalStatus', 'PolicyType'], axis=1, inplace=True)
df_make_PolicyType.drop(['Make', 'PolicyType'], axis=1, inplace=True)

df = pd.concat([df, df_make_PolicyType, MaritalStatus_one_hot_sklearn_binar], axis=1, join='inner')



In [None]:
# Names of the hashed Make / PolicyType columns followed by the MaritalStatus
# one-hot columns; later cells use this list to select them from the train/test splits.
columns_ = list(df_make_PolicyType.columns) + list(MaritalStatus_one_hot_sklearn_binar.columns)
columns_

In [None]:
# Display the dataset after the encoded columns were appended.
df

In [None]:
# Train/test split on the encoded dataset (20 % held out, fixed seed).
# The split is stratified on the target, like the first split of this notebook:
# FraudFound_P is highly imbalanced, and an unstratified split lets the fraud
# rate differ between the training and test sets.
X = df.drop(columns=['FraudFound_P'])
y = df['FraudFound_P']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# -----------------------------
# Column groups, by encoding type
# -----------------------------
# Ordered categories, encoded as integer ranks
ordinal_cols = [
    'VehiclePrice',
    'Days_Policy_Accident',
    'PastNumberOfClaims',
    'AgeOfVehicle',
    'NumberOfSuppliments',
    'AddressChange_Claim',
    'NumberOfCars',
    'DriverRating',
    'Deductible',
]

# Two-valued columns, encoded as 0 / 1
binary_cols = [
    'AccidentArea',
    'Sex',
    'Fault',
    'PoliceReportFiled',
    'WitnessPresent',
    'AgentType',
]

# Continuous columns to standardise
scale_cols = ['Age', 'delay_weeks']

# Optional groups, to be adjusted to the data at hand
label_cols = ['MaritalStatus']      # label-binarised columns
freq_cols = []                      # frequency-encoded columns (none yet)
hash_cols = ['Make', 'PolicyType']  # example columns for HashingVectorizer

# -----------------------------
# Transformers personnalisés
# -----------------------------
class HashingVectorizerWrapper(BaseEstimator, TransformerMixin):
    """Feature-hash text columns so HashingVectorizer can be used in a ColumnTransformer.

    Parameters
    ----------
    n_features : int
        Number of hashed output columns produced for each input column.

    Notes
    -----
    ``__init__`` only stores the hyper-parameter, as scikit-learn estimators
    are expected to do. The underlying ``HashingVectorizer`` is built on demand
    from the current ``n_features``, so ``set_params(n_features=...)`` (used by
    grid searches) takes effect; previously the vectorizer was frozen with the
    value given at construction time.
    """

    def __init__(self, n_features):
        self.n_features = n_features

    @property
    def vectorizer(self):
        """A HashingVectorizer configured from the current ``n_features``."""
        return HashingVectorizer(
            n_features=self.n_features,
            norm=None,
            alternate_sign=False,
            ngram_range=(1,1),
            binary=True
        )

    def fit(self, X, y=None):
        """Nothing is learned here; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Hash every column of ``X`` and return one dense array.

        ``X`` is the 2-D selection handed over by ColumnTransformer. Each
        column is converted to strings and hashed into ``n_features`` binary
        columns; the per-column results are stacked side by side. For a
        single-column input the output is the same as before; additional
        columns used to be silently ignored.
        """
        frame = pd.DataFrame(X)
        hasher = self.vectorizer
        hashed_blocks = [
            hasher.transform(frame.iloc[:, position].astype(str)).toarray()
            for position in range(frame.shape[1])
        ]
        # Dense output for compatibility with the other transformers
        return np.hstack(hashed_blocks)

class LabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    """Apply a LabelBinarizer to the first column of a DataFrame selection."""

    def __init__(self):
        # The wrapped binarizer holds the classes learned in fit()
        self.lb = LabelBinarizer()

    def fit(self, X, y=None):
        """Learn the classes from the first column of ``X``; ``y`` is ignored."""
        first_column = X.iloc[:,0]
        self.lb.fit(first_column)
        return self

    def transform(self, X):
        """Return the binarized encoding of the first column of ``X``."""
        first_column = X.iloc[:,0]
        encoded = self.lb.transform(first_column)
        return encoded

class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Map the labels of two-valued columns to 0 / 1 using one shared lookup table."""

    def __init__(self):
        # Single table for every binary column; a label absent from it
        # becomes NaN, because Series.map is used in transform().
        self.binary_map = {
            'No':0, 'Yes':1, 'Female':0, 'Male':1, 'Urban':1, 'Rural':0, 
            'Policy Holder':1, 'Third Party':0, 'External':0, 'Internal':1
        }

    def fit(self, X, y=None):
        """Nothing is learned here; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the values of ``X`` with every column passed through ``binary_map``."""
        encoded = X.apply(lambda column: column.map(self.binary_map))
        # Plain array, for compatibility with the other transformers
        return encoded.values

class OrdinalMapper(BaseEstimator, TransformerMixin):
    """Encode ordered categories with explicit, per-column lookup tables.

    ``mapping_dict`` maps a column name to a ``{label: value}`` dictionary.
    """

    def __init__(self, mapping_dict):
        self.mapping_dict = mapping_dict

    def fit(self, X, y=None):
        """Nothing is learned here; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the values of ``X`` after mapping each column listed in ``mapping_dict``.

        The input is copied first, so the caller's DataFrame is not modified.
        """
        encoded = X.copy()
        for column in self.mapping_dict:
            lookup = self.mapping_dict[column]
            encoded[column] = encoded[column].map(lookup)
        # Plain array, for compatibility with the other transformers
        return encoded.values

# Lookup tables for the ordinal columns. For the text-valued columns the
# modalities are listed from lowest to highest and numbered 0, 1, 2, ...
def _ranked(*levels):
    """Return {level: rank} for levels given in increasing order."""
    return {level: rank for rank, level in enumerate(levels)}


ordinal_mapping = {
    'VehiclePrice': _ranked(
        'less than 20000', '20000 to 29000', '30000 to 39000',
        '40000 to 59000', '60000 to 69000', 'more than 69000',
    ),
    'Days_Policy_Accident': _ranked('none', '1 to 7', '8 to 15', '15 to 30', 'more than 30'),
    'PastNumberOfClaims': _ranked('none', '1', '2 to 4', 'more than 4'),
    'AgeOfVehicle': _ranked(
        'new', '2 years', '3 years', '4 years',
        '5 years', '6 years', '7 years', 'more than 7',
    ),
    'NumberOfSuppliments': _ranked('none', '1 to 2', '3 to 5', 'more than 5'),
    'AddressChange_Claim': _ranked(
        'no change', 'under 6 months', '1 year', '2 to 3 years', '4 to 8 years',
    ),
    'NumberOfCars': _ranked('1 vehicle', '2 vehicles', '3 to 4', '5 to 8', 'more than 8'),
    # Already numeric: the listed values are kept as they are
    'DriverRating': {rating: rating for rating in (1, 2, 3, 4)},
    'Deductible': {amount: amount for amount in (300, 400, 500, 700)},
}

# One-step pipeline applying the ordinal lookup tables
ordinal_pipe = Pipeline(
    steps=[('ordinal_map', OrdinalMapper(mapping_dict=ordinal_mapping))]
)

# -----------------------------
# Column-wise preprocessing
# -----------------------------
# Binary columns -> 0 / 1, ordinal columns -> ranks, continuous columns ->
# standardised. The hashing and label-binarizer wrappers defined above are
# not plugged in here; they could be added as extra
# (name, transformer, columns) entries if needed.
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', BinaryEncoder(), binary_cols),
        ('ordinal', ordinal_pipe, ordinal_cols),
        ('scale', StandardScaler(), scale_cols),
    ]
)

# Fit on the training split only, then apply the fitted transformer to the test split
train_matrix = preprocessor.fit_transform(X_train)
X_train_processed = pd.DataFrame(train_matrix)
X_test_processed = preprocessor.transform(X_test)

#

In [None]:
# Inspect the transformed test features returned by preprocessor.transform.
X_test_processed

In [None]:
# Number of columns produced by the preprocessor (binary + ordinal + scaled).
# Computed from the column lists directly: `all_cols` is only defined in the
# following cell, so referencing it here fails when the notebook is run top to bottom.
len(binary_cols + ordinal_cols + scale_cols)

In [None]:
 #Convertir en DataFrame avec noms de colonnes
# Column names of the preprocessor output, in the order of its three transformers
all_cols = binary_cols + ordinal_cols + scale_cols

# Training set: name the transformed columns, then append the already-encoded
# columns (listed in `columns_`) taken from the raw split. Both parts get a
# fresh 0..n-1 index so that they line up row by row.
df1 = pd.DataFrame(X_train_processed).reset_index(drop=True).set_axis(all_cols, axis=1)
df2 = X_train[columns_].reset_index(drop=True)
X_train_processed = pd.concat([df1, df2], axis=1, join='inner')

# Test set: same treatment
df1_test = pd.DataFrame(X_test_processed).reset_index(drop=True).set_axis(all_cols, axis=1)
df2_test = X_test[columns_].reset_index(drop=True)
X_test_processed = pd.concat([df1_test, df2_test], axis=1, join='inner')
#X_test_processed = pd.concat([pd.DataFrame(X_test_processed, columns=all_cols), X_test[columns_]], axis=1, join='inner')

In [None]:
# Display the test features after the column names and encoded columns were attached.
X_test_processed


In [None]:
# Sanity check: the processed and raw training sets should have the same number of rows.
len(X_train_processed), len(X_train)

In [None]:
# Display the processed test features again.
X_test_processed

In [None]:
# Summary statistics of delay_weeks on the raw (unscaled) training split.
X_train['delay_weeks'].describe()

In [None]:
# Switch scikit-learn's estimator display to 'diagram' mode, then show the preprocessor.
set_config(display='diagram')
preprocessor

# Modélisation

## Feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shapes of the processed training features and of the training target.
X_train_processed.shape, y_train.shape

In [None]:
# Work on copies so the training data stays untouched
X1, y1 = X_train_processed.copy(), y_train.copy()

# Fit a seeded 200-tree random forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X1, y1)

# Importances labelled by feature name, largest first
importances = pd.Series(rf.feature_importances_, index=X1.columns).sort_values(ascending=False)

# Bar chart of the 15 largest importances
top_importances = importances.head(15)
plt.figure(figsize=(10,5))
top_importances.plot(kind='bar', color='teal')
plt.title("Top 15 Feature Importances - RandomForest")
plt.show()


### SMOTE

The Synthetic Minority Over-sampling Technique (SMOTE) is a method used in machine learning to address the issue of imbalanced datasets. Imbalanced datasets are common in classification problems, especially Fraud Detection datasets, where one class (often the minority class) has significantly fewer instances than the other class(es). SMOTE aims to balance the class distribution by generating synthetic examples from the minority class.

In [None]:
!{sys.executable} -m pip install --upgrade imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Class counts before resampling
print("Avant SMOTE :", Counter(y_train))

# sampling_strategy=0.25 asks for a minority/majority ratio of 0.25 after
# resampling, i.e. fraud makes up 20 % of the resampled training set.
smote_settings = dict(sampling_strategy=0.25, random_state=42, k_neighbors=5)
sm = SMOTE(**smote_settings)
X_train_res, y_train_res = sm.fit_resample(X_train_processed, y_train)

# Class counts after resampling
print("Après SMOTE :", Counter(y_train_res))

## Modèles

In [None]:
# -----------------------------
# Candidate classifiers (all seeded)
# -----------------------------
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

# -----------------------------
# Train on the resampled training set, save each model as "<name>.pkl" in the
# working directory, and score it on the test set
# -----------------------------
results = []

for name, model in models.items():
    print(f"\n--- Training {name} ---")

    model.fit(X_train_res, y_train_res)
    joblib.dump(model, f"{name}.pkl")

    # Hard predictions (default threshold) and positive-class probabilities (for AUC)
    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)[:,1]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
    })

# -----------------------------
# Summary table, best F1-score first
# -----------------------------
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)

print("\n=== Résultats des modèles ===")
print(results_df)


### Indice de Youden

In [None]:
from sklearn.metrics import roc_curve

# Logistic regression fitted in the benchmark above
lr_model = models["LogisticRegression"]

# Positive-class probabilities on the test set
y_proba = lr_model.predict_proba(X_test_processed)[:, 1]

# ROC curve: one (FPR, TPR) point per candidate threshold
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Youden's J = TPR - FPR; keep the threshold where it is largest
# NOTE(review): the threshold is selected and evaluated on the same test set,
# so the metrics printed below are likely optimistic; consider choosing it on
# a separate validation split.
youden_index = tpr - fpr
best_idx = youden_index.argmax()
best_threshold = thresholds[best_idx]

print("Meilleur seuil (indice de Youden) :", best_threshold)
print("TPR (Recall) à ce seuil :", tpr[best_idx])
print("FPR à ce seuil :", fpr[best_idx])

# Re-classify with the selected threshold instead of the default one
y_pred_best = (y_proba >= best_threshold).astype(int)

# Threshold-dependent metrics use the new predictions; AUC uses the probabilities
acc, prec, rec, f1 = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_proba)

print("\n=== Métriques avec seuil optimal (Youden) ===")
print(f"Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1-score: {f1:.3f}, AUC: {auc:.3f}")


## CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
from sklearn.metrics import roc_curve

# Threshold selection with Youden's J statistic
def optimal_threshold_youden(y_true, y_proba):
    """Return (threshold, sensitivity, specificity) at the maximum of TPR - FPR.

    ``y_true`` holds the true binary labels and ``y_proba`` the scores of the
    positive class; both are passed unchanged to ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_proba)
    best = np.argmax(true_pos_rate - false_pos_rate)
    sensitivity = true_pos_rate[best]
    specificity = 1 - false_pos_rate[best]
    return cutoffs[best], sensitivity, specificity


# -----------------------------
# Candidate classifiers (all seeded)
# -----------------------------
catboost_settings = dict(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=False,
    class_weights=[1, 4],  # fraud class weighted 4x (author's rationale: ~20 % fraud)
)
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "CatBoost": CatBoostClassifier(**catboost_settings),
}

results = []

# -----------------------------
# Train each model, pick its Youden threshold, and score it at that threshold
# NOTE(review): the threshold is chosen on the test set that is then used for
# scoring, so these metrics are likely optimistic.
# -----------------------------
for name, model in models.items():
    print(f"\n--- Training {name} ---")

    # Fit on the resampled training set
    model.fit(X_train_res, y_train_res)

    # Positive-class probabilities on the test set
    y_proba = model.predict_proba(X_test_processed)[:, 1]

    # Threshold maximising TPR - FPR, with its sensitivity and specificity
    best_threshold, sens, spec = optimal_threshold_youden(y_test, y_proba)

    # Predictions at that threshold
    y_pred_opt = (y_proba >= best_threshold).astype(int)

    # Metrics at the selected threshold (AUC is threshold-free)
    results.append({
        "Model": name,
        "Best Threshold (Youden)": best_threshold,
        "Sensitivity": sens,
        "Specificity": spec,
        "Accuracy": accuracy_score(y_test, y_pred_opt),
        "Precision": precision_score(y_test, y_pred_opt),
        "Recall": recall_score(y_test, y_pred_opt),
        "F1-score": f1_score(y_test, y_pred_opt),
        "AUC": roc_auc_score(y_test, y_proba),
    })


# -----------------------------
# Summary table, best F1-score first
# -----------------------------
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)

print("\n=== Résultats des modèles (au seuil optimal) ===")
print(results_df)


## Optimisation des hyperparamètres

### Catboost

| Hyperparamètres       | Description                                | 
| --------------------- | ------------------------------------------ | 
| `iterations`          | Nombre d’arbres                            | 
| `depth`               | Profondeur maximale des arbres             | 
| `learning_rate`       | Taux d’apprentissage                       | 
| `l2_leaf_reg`         | Régularisation L2                | 
| `border_count`        | Nombre de bins pour les features continues | 
| `class_weights`       | Importance des classes                     | 
| `bagging_temperature` | Pour booster la variance et régularisation | 
| `random_strength`     | Bruit aléatoire dans la construction (plage 0–1) | 

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the grid below overrides iterations, depth, learning_rate and l2_leaf_reg
cat = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=0,
    class_weights=[1, 4],  # fraud class weighted 4x (author's rationale: ~20 % fraud)
)

# 3 x 3 x 3 x 3 = 81 candidate settings
param_grid = dict(
    iterations=[100,300,500],
    depth=[6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1],
    l2_leaf_reg=[3, 5, 7],
)

# Exhaustive search scored by ROC AUC with 3-fold cross-validation, on all available cores
# NOTE(review): the folds are cut from the SMOTE-resampled data, so synthetic
# points derived from a training-fold sample can end up in the validation fold;
# the cross-validated AUC is likely optimistic. Consider resampling inside the CV loop.
search_settings = dict(scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)
grid_search = GridSearchCV(estimator=cat, param_grid=param_grid, **search_settings)

# Run the search
grid_search.fit(X_train_res, y_train_res)

# Best setting and its mean cross-validated AUC
print("Meilleurs paramètres CatBoost :", grid_search.best_params_)
print("Meilleure AUC CV :", grid_search.best_score_)


In [None]:
# Estimator refitted by the grid search with its best setting
best_cat = grid_search.best_estimator_

# Positive-class probabilities on the test set
y_proba = best_cat.predict_proba(X_test_processed)[:, 1]

# Threshold selection with Youden's J statistic
from sklearn.metrics import roc_curve
def optimal_threshold_youden(y_true, y_proba):
    """Return (threshold, sensitivity, specificity) at the maximum of TPR - FPR."""
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_proba)
    best = np.argmax(true_pos_rate - false_pos_rate)
    return cutoffs[best], true_pos_rate[best], 1 - false_pos_rate[best]

# NOTE(review): the threshold is chosen on the test set that is then used for
# scoring, so the metrics below are likely optimistic.
best_threshold, sens, spec = optimal_threshold_youden(y_test, y_proba)
y_pred_opt = (y_proba >= best_threshold).astype(int)

# Final metrics: threshold-dependent ones at the selected threshold, AUC on the probabilities
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
acc, prec, rec, f1 = (
    metric(y_test, y_pred_opt)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_proba)

print("\n=== Résultats CatBoost optimisé ===")
print(f"Best Threshold (Youden): {best_threshold:.3f}")
print(f"Sensitivity: {sens:.3f}, Specificity: {spec:.3f}")
print(f"Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")


### LightGBM

| Hyperparamètre      | Description                                                                                          |
| ------------------- | ---------------------------------------------------------------------------------------------------- |
| `num_leaves`        | Nombre maximum de feuilles par arbre (contrôle la complexité, équivalent à la profondeur effective). |
| `max_depth`         | Profondeur maximale des arbres (permet de limiter la complexité, -1 = illimité).                     |
| `learning_rate`     | Taux d’apprentissage, plus petit = modèle plus robuste mais nécessite plus d’arbres.                 |
| `n_estimators`      | Nombre total d’arbres (boosting rounds).                                                             |
| `min_data_in_leaf`  | Nombre minimum d’observations dans une feuille (augmente la régularisation).                         |
| `feature_fraction`  | Fraction de variables utilisées pour chaque arbre (équivalent au “colsample”).                       |
| `bagging_fraction`  | Fraction d’observations utilisées pour chaque arbre (sous-échantillonnage).                          |
| `bagging_freq`      | Fréquence du bagging (0 = désactivé ; k > 0 = bagging toutes les k itérations, ex. 5).               |
| `lambda_l1`         | Régularisation L1 (LASSO) sur les feuilles.                                                          |
| `lambda_l2`         | Régularisation L2 (Ridge) sur les feuilles.                                                          |
| `min_gain_to_split` | Gain minimal pour créer une nouvelle division (pruning).                                             |
| `class_weight`      | Gestion du déséquilibre : ex. `{0:1, 1:5}` ou `"balanced"`.                                          |
| `subsample_for_bin` | Taille de l’échantillon utilisé pour créer les histogrammes de binning.                              |
| `max_bin`           | Nombre maximal de bins pour les features continues (granularité des splits).                         |


In [None]:
from sklearn.model_selection import GridSearchCV

# Base LightGBM model shared by every candidate of the grid.
# NOTE(review): class_weight='balanced' is combined with X_train_res, which
# looks like an already-resampled training set -- confirm both are intended.
lgbm = LGBMClassifier(class_weight='balanced', objective='binary', random_state=42)

# Hyperparameter grid: 3 x 3 x 3 = 27 candidates.
param_grid = dict(
    n_estimators=[100, 300, 500],
    max_depth=[6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1],
)

# Exhaustive search scored by ROC AUC with 3-fold cross-validation,
# with candidates fitted in parallel on every core.
# NOTE(review): if X_train_res was produced by oversampling before this
# split, the validation folds contain synthetic rows and the CV AUC is
# likely optimistic -- verify against the resampling cell.
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)

# Fit every candidate on the training data.
grid_search.fit(X_train_res, y_train_res)

# Report the winning parameters and their mean cross-validated AUC.
print("Meilleurs paramètres LightGBM :", grid_search.best_params_)
print("Meilleure AUC CV :", grid_search.best_score_)


In [None]:
# Take the refit LightGBM winner out of the finished grid search.
best_lgbm = grid_search.best_estimator_

# Score the test rows: predict_proba yields one column per class, and the
# column at index 1 is kept as the score used downstream
# (presumably the fraud class -- confirm against best_lgbm.classes_).
lgbm_class_probas = best_lgbm.predict_proba(X_test_processed)
y_proba = lgbm_class_probas[:, 1]

# ---- Calcul du seuil optimal (Youden) ----
from sklearn.metrics import roc_curve

def optimal_threshold_youden(y_true, y_proba):
    """Pick the ROC operating point that maximises Youden's J (TPR - FPR).

    Args:
        y_true: Ground-truth binary labels, forwarded to ``roc_curve``.
        y_proba: Scores of the positive class, forwarded to ``roc_curve``.

    Returns:
        A ``(threshold, sensitivity, specificity)`` tuple, where sensitivity
        is the TPR and specificity is ``1 - FPR`` at the selected point.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)
    youden = tpr - fpr
    # roc_curve prepends a sentinel point (fpr=0, tpr=0) whose threshold is
    # not an observed score (np.inf in recent scikit-learn, max score + 1 in
    # older releases). J equals 0 there, so a plain argmax would select it
    # whenever no real threshold beats chance and return an unusable cut-off.
    # Only the real thresholds are searched.
    if youden.size > 1:
        idx = int(np.argmax(youden[1:])) + 1
    else:
        idx = 0
    return thresholds[idx], tpr[idx], 1 - fpr[idx]

# Choose the decision threshold from the ROC curve.
# NOTE(review): the threshold is tuned on y_test and the metrics below are
# computed on that same y_test, so they are likely optimistic -- consider
# tuning the threshold on a separate validation split.
best_threshold, sens, spec = optimal_threshold_youden(y_test, y_proba)

# Binarise the scores: 1 when the score reaches the threshold, else 0.
y_pred_opt = np.where(y_proba >= best_threshold, 1, 0)

# ---- Final metrics ----
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Threshold-dependent scores, evaluated in the order acc, prec, rec, f1.
acc, prec, rec, f1 = (
    score_fn(y_test, y_pred_opt)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
# AUC is computed from the raw scores, independently of the threshold.
auc = roc_auc_score(y_test, y_proba)

print("\n=== Résultats LightGBM optimisé ===")
print(f"Best Threshold (Youden): {best_threshold:.3f}")
print(f"Sensitivity: {sens:.3f}, Specificity: {spec:.3f}")
print(f"Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")
