In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 

In [53]:
# Load the dataset from a path relative to the notebook's working directory
# (the file name suggests an already-cleaned bank-marketing export).
df = pd.read_csv('dataset/bank_marketing_clean.csv')

In [54]:
# Work on a copy: `df` keeps the values as loaded while `df_process` is
# transformed by the preprocessing cells.
df_process = df.copy()

In [55]:
# Split the column names by dtype: object columns are treated as categorical,
# columns of any other dtype as numeric.
var_cat = df_process.select_dtypes(include=['object']).columns
var_num = df_process.select_dtypes(exclude=['object']).columns

In [56]:
# Scale every numeric column by its maximum absolute value.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split done later in the notebook, so test rows influence the scaling factors.
scaler = MaxAbsScaler().fit(df_process[var_num])
X_scaled = scaler.transform(df_process[var_num])

# Write the scaled values back over the original numeric columns.
df_process[var_num] = X_scaled

In [58]:
# Encode the binary yes/no categorical columns as 1/0.
#
# A column qualifies only when its non-null values are exactly {'yes', 'no'}.
# Selecting on the actual labels (instead of "has two distinct values") keeps
# two-valued columns with other labels out of the mapping, where Series.map
# would silently turn every unmapped label into NaN.
yes_no_map = {'yes': 1, 'no': 0}

# 'contact' is never mapped here; it stays an object column and is handled by
# the one-hot encoding cell. Filtering it inside the comprehension avoids the
# ValueError that list.remove() raises when 'contact' is not in the list.
# Columns that are no longer present in df_process are skipped so that
# re-running this cell does not fail with a KeyError.
bin_col = [
    col for col in var_cat
    if col != 'contact'
    and col in df_process.columns
    and set(df_process[col].dropna().unique()) == set(yes_no_map)
]

# Replace 'yes' / 'no' by 1 / 0 in each selected column.
for col in bin_col:
    df_process[col] = df_process[col].map(yes_no_map)


In [59]:
# One-hot encode the remaining (multi-class) categorical columns.
# Every column that is still of dtype object is selected; Index.difference
# removes the binary columns and returns the names sorted.
cat_cols = df_process.select_dtypes(include=['object']).columns.difference(bin_col)

# drop='first' omits the first category of each feature, so a feature with
# k categories produces k-1 indicator columns.
encoder = OneHotEncoder(sparse_output=False, drop='first')
cat_encoded = encoder.fit_transform(df_process[cat_cols])

# Build the indicator frame on df_process's own index. pd.concat(axis=1)
# aligns on the index, so a default RangeIndex here would misalign rows and
# introduce NaN whenever df_process is not indexed 0..n-1.
cat_encoded_df = pd.DataFrame(
    cat_encoded,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df_process.index,
)

# Swap the original categorical columns for their indicator columns.
df_process = pd.concat([df_process.drop(columns=cat_cols), cat_encoded_df], axis=1)

In [60]:
# Remove the limit on the number of columns pandas shows when displaying a frame.
pd.options.display.max_columns = None

In [61]:
# Display the processed frame to inspect the scaled and encoded columns.
df_process

Unnamed: 0,age,default,housing,loan,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,contact_telephone,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_married,marital_single,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_nonexistent,poutcome_success
0,0.571429,0,0,0,0.053070,0.017857,-0.037037,0.000000,0.323529,0.991843,-0.716535,0.962735,0.992904,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.581633,0,0,0,0.030297,0.017857,-0.037037,0.000000,0.323529,0.991843,-0.716535,0.962735,0.992904,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.377551,0,1,0,0.045954,0.017857,-0.037037,0.000000,0.323529,0.991843,-0.716535,0.962735,0.992904,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.408163,0,0,0,0.030704,0.017857,-0.037037,0.000000,0.323529,0.991843,-0.716535,0.962735,0.992904,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.571429,0,0,1,0.062424,0.017857,-0.037037,0.000000,0.323529,0.991843,-0.716535,0.962735,0.992904,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41171,0.744898,0,1,0,0.067914,0.017857,-0.037037,0.000000,-0.323529,1.000000,-1.000000,0.203766,0.949408,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
41172,0.469388,0,0,0,0.077877,0.017857,-0.037037,0.000000,-0.323529,1.000000,-1.000000,0.203766,0.949408,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
41173,0.571429,0,1,0,0.038430,0.035714,-0.037037,0.000000,-0.323529,1.000000,-1.000000,0.203766,0.949408,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
41174,0.448980,0,0,0,0.089874,0.017857,-0.037037,0.000000,-0.323529,1.000000,-1.000000,0.203766,0.949408,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [62]:
# Summarise column dtypes and non-null counts after preprocessing.
df_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  float64
 1   default                        41176 non-null  int64  
 2   housing                        41176 non-null  int64  
 3   loan                           41176 non-null  int64  
 4   duration                       41176 non-null  float64
 5   campaign                       41176 non-null  float64
 6   pdays                          41176 non-null  float64
 7   previous                       41176 non-null  float64
 8   emp.var.rate                   41176 non-null  float64
 9   cons.price.idx                 41176 non-null  float64
 10  cons.conf.idx                  41176 non-null  float64
 11  euribor3m                      41176 non-null  float64
 12  nr.employed                    41176 non-null 

In [64]:
# Separate the target column 'y' from the explanatory variables.
y = df_process['y']
X = df_process.drop(columns='y')

In [65]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced (see the class supports in the reports below) - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [67]:
# resultat gridsearch :

# Best Hyperparameters: {'logreg__C': 100, 'logreg__penalty': 'l1', 'logreg__solver': 'saga'}
# Best Cross-validation Score: 0.9117
#               precision    recall  f1-score   support

#            0       0.92      0.97      0.95      7265
#            1       0.66      0.40      0.49       971

#     accuracy                           0.90      8236
#    macro avg       0.79      0.68      0.72      8236
# weighted avg       0.89      0.90      0.89      8236

# La classe 1 présente des performances nettement inférieures à celles de la classe 0, en raison du déséquilibre des classes.

In [70]:
# Address the class imbalance with SMOTE oversampling.

# SMOTE sampler and logistic-regression model; the model's hyper-parameters
# are the ones reported by the grid search.
smote = SMOTE(random_state=42)

# random_state is set because the 'saga' solver shuffles the data: without a
# seed the fitted coefficients, and therefore the report below, can change
# from one run to the next.
logreg = LogisticRegression(
    C=100,
    penalty='l1',
    solver='saga',
    max_iter=1000,
    random_state=42,
)

# Chain SMOTE and the classifier in an imblearn Pipeline.
pipeline = Pipeline([
    ('smote', smote),
    ('logreg', logreg)
])

# Fit the pipeline on the training set.
pipeline.fit(X_train, y_train)

# Predict the held-out test set.
y_pred = pipeline.predict(X_test)

# Print per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92      7265
           1       0.45      0.85      0.59       971

    accuracy                           0.86      8236
   macro avg       0.71      0.86      0.75      8236
weighted avg       0.92      0.86      0.88      8236



