In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter
import warnings
warnings.filterwarnings("ignore");

In [2]:
df = pd.read_csv('subscribe_data.csv', sep = ';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,34,entrepreneur,married,high.school,unknown,no,no,telephone,may,wed,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,no
1,42,blue-collar,divorced,basic.9y,no,no,no,cellular,apr,mon,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
2,30,blue-collar,married,basic.4y,no,yes,no,cellular,apr,mon,...,3,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
3,33,services,married,high.school,no,unknown,unknown,cellular,aug,thu,...,2,999,0,nonexistent,-2.9,92.201,-31.4,0.873,5076.2,no
4,36,admin.,married,university.degree,unknown,yes,no,telephone,may,wed,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,no


In [3]:
# Changing 'unknown' and '999' mode in the 'pdays' columns as NaN
df = df.replace('unknown', np.nan)
df['pdays'] = df['pdays'].replace(999, np.nan)

In [4]:
df.isna().sum()

age                   0
job                 174
marital              40
education           870
default            4329
housing             479
loan                479
contact               0
month                 0
day_of_week           0
duration              0
campaign              0
pdays             19832
previous              0
poutcome              0
emp.var.rate          0
cons.price.idx        0
cons.conf.idx         0
euribor3m             0
nr.employed           0
y                     0
dtype: int64

In [None]:
# Percentage of NaN in the dataset
# Calcul du pourcentage de lignes contenant au moins un NaN
percentage_nan_rows = df.isna().any(axis=1).mean() * 100
print(f"Pourcentage de lignes contenant des NaN : {percentage_nan_rows:.2f}%")

# Pourcentage de NaN dans le dataset
percentage_nan = df.isna().any().mean() * 100
print(f"Pourcentage de NaN total : {percentage_nan:.2f}%")

In [5]:
# function - Train test split & Logistic Regression
def modeling(X, y) :
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    # Normalization of the X variables
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Binomial logistic regression
    LR = LogisticRegression()
    LR.fit(X_train_scaled, y_train)
    y_pred = LR.predict(X_test_scaled)
    y_proba = LR.predict_proba(X_test)[:, 1]

    # Metrics on our test sample
    print(classification_report(y_test, y_pred))
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred) #, pos_label='Minority')
    recall = recall_score(y_test, y_pred) # pos_label='Minority')
    f1 = f1_score(y_test, y_pred) # pos_label='Minority')
    roc_auc = roc_auc_score(y_test, y_proba)

    return accuracy, precision, recall, f1, roc_auc

First Model 

DVs and IV choice

In [8]:
# Our DV is whether the client subscribed a term deposit. This variable is called “y”. We choose five IVs:
# • The contact communication type: “contact”,
# • The marital status: “marital”,
# • The type of job: “job”,
# • Whether or not the client has a personal loan: “loan”,
# • The number of contacts performed during the campaign for a specific client: “campaign”.
# These are the modified variables with the fewest NA’s (“unknown” value). These are also not highly correlated variables. 
# Furthermore, these variables allow us to have information about both the marketing campaign and about the client more personally.

In [68]:
df_first_model = df[['contact', 'marital', 'job', 'loan', 'campaign', 'y']]

# NaN suppression
df_first_model = df_first_model.dropna()

# Converting object columns into categorical
encoder = LabelEncoder()

for col in df[['contact', 'marital', 'job', 'loan', 'y']] :
    df_first_model[col] = encoder.fit_transform(df_first_model[col])

# IV and DVs
y = df_first_model['y']
X = df_first_model.drop(columns = 'y', axis = 0)

In [10]:
accuracy_first_model_simple, precision_first_model_simple, recall_first_model_simple, f1_first_model_simple, roc_auc_first_model_simple = modeling(X, y)

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      3517
           1       0.00      0.00      0.00       466

    accuracy                           0.88      3983
   macro avg       0.44      0.50      0.47      3983
weighted avg       0.78      0.88      0.83      3983



In [11]:
# Under sampling

# Vérification de la répartition initiale des classes
print(f"Répartition initiale des classes : {Counter(y)}")

# Appliquer le sous-échantillonnage (undersampling)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_us, y_us = rus.fit_resample(X, y)

# Vérification de la répartition après sous-échantillonnage
print(f"Répartition après sous-échantillonnage : {Counter(y_us)}")

Répartition initiale des classes : Counter({0: 17634, 1: 2277})
Répartition après sous-échantillonnage : Counter({0: 2277, 1: 2277})


In [12]:
accuracy_first_model_us, precision_first_model_us, recall_first_model_us, f1_first_model_us, roc_auc_first_model_us = modeling(X_us, y_us)

              precision    recall  f1-score   support

           0       0.69      0.48      0.57       473
           1       0.58      0.77      0.66       438

    accuracy                           0.62       911
   macro avg       0.64      0.63      0.61       911
weighted avg       0.64      0.62      0.61       911



In [13]:
# SMOTE
# Appliquer SMOTE pour augmenter la classe minoritaire
smote = SMOTE(random_state=42)
X_os, y_os = smote.fit_resample(X, y)

# Vérification de la répartition après SMOTE
print(f"Répartition après SMOTE : {Counter(y_os)}")

Répartition après SMOTE : Counter({0: 17634, 1: 17634})


In [14]:
accuracy_first_model_os, precision_first_model_os, recall_first_model_os, f1_first_model_os, roc_auc_first_model_os = modeling(X_os, y_os)

              precision    recall  f1-score   support

           0       0.71      0.46      0.56      3521
           1       0.60      0.81      0.69      3533

    accuracy                           0.64      7054
   macro avg       0.66      0.64      0.62      7054
weighted avg       0.66      0.64      0.62      7054



In [80]:
# Hybrid Sampling

# Combinaison de SMOTE et NearMiss
smote_enn = SMOTEENN(random_state=42)
X_hs, y_hs = smote_enn.fit_resample(X, y)

print(f"Répartition après SMOTEENN : {Counter(y_hs)}")

Répartition après SMOTEENN : Counter({0: 12523, 1: 258})


In [16]:
accuracy_first_model_hs, precision_first_model_hs, recall_first_model_hs, f1_first_model_hs, roc_auc_first_model_hs = modeling(X_hs, y_hs)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2506
           1       0.14      0.02      0.03        51

    accuracy                           0.98      2557
   macro avg       0.56      0.51      0.51      2557
weighted avg       0.96      0.98      0.97      2557



Second model

DVs and IV choice

In [19]:
# We would like to focus on more external attributes to see how it affects the outcome. 
# We decided to look at other attributes, both linked to the marketing campaign itself and also the personal characteristics of the client: 
# "poutcome", "previous", "education", "age" and "day_of_week".

In [20]:
df_second_model = df[["poutcome", "previous", "education", "age", "day_of_week", "y"]]

# NaN suppression
df_second_model = df_second_model.dropna()

# Converting object columns into categorical
encoder = LabelEncoder()

for col in df[["poutcome", "education", "day_of_week", "y"]] :
    df_second_model[col] = encoder.fit_transform(df_second_model[col])

# IV and DVs
y = df_second_model['y']
X = df_second_model.drop(columns = 'y', axis = 0)

In [21]:
accuracy_second_model_simple, precision_second_model_simple, recall_second_model_simple, f1_second_model_simple, roc_auc_second_model_simple = modeling(X, y)

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      3519
           1       0.59      0.11      0.19       426

    accuracy                           0.90      3945
   macro avg       0.75      0.55      0.57      3945
weighted avg       0.87      0.90      0.86      3945



In [22]:
# Under sampling

# Vérification de la répartition initiale des classes
print(f"Répartition initiale des classes : {Counter(y)}")

# Appliquer le sous-échantillonnage (undersampling)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_us, y_us = rus.fit_resample(X, y)

# Vérification de la répartition après sous-échantillonnage
print(f"Répartition après sous-échantillonnage : {Counter(y_us)}")

Répartition initiale des classes : Counter({0: 17502, 1: 2222})
Répartition après sous-échantillonnage : Counter({0: 2222, 1: 2222})


In [23]:
accuracy_second_model_us, precision_second_model_us, recall_second_model_us, f1_second_model_us, roc_auc_second_model_us = modeling(X_us, y_us)

              precision    recall  f1-score   support

           0       0.57      0.81      0.67       458
           1       0.64      0.35      0.45       431

    accuracy                           0.59       889
   macro avg       0.60      0.58      0.56       889
weighted avg       0.60      0.59      0.56       889



In [24]:
# SMOTE
# Appliquer SMOTE pour augmenter la classe minoritaire
smote = SMOTE(random_state=42)
X_os, y_os = smote.fit_resample(X, y)

# Vérification de la répartition après SMOTE
print(f"Répartition après SMOTE : {Counter(y_os)}")

Répartition après SMOTE : Counter({0: 17502, 1: 17502})


In [25]:
accuracy_second_model_os, precision_second_model_os, recall_second_model_os, f1_second_model_os, roc_auc_second_model_os = modeling(X_os, y_os)

              precision    recall  f1-score   support

           0       0.55      0.80      0.66      3498
           1       0.64      0.36      0.46      3503

    accuracy                           0.58      7001
   macro avg       0.60      0.58      0.56      7001
weighted avg       0.60      0.58      0.56      7001



In [76]:
# Hybrid Sampling

# Combinaison de SMOTE et NearMiss
smote_enn = SMOTEENN(random_state=42)
X_hs, y_hs = smote_enn.fit_resample(X, y)

print(f"Répartition après SMOTEENN : {Counter(y_hs)}")

Répartition après SMOTEENN : Counter({0: 12523, 1: 258})


In [27]:
accuracy_second_model_hs, precision_second_model_hs, recall_second_model_hs, f1_second_model_hs, roc_auc_second_model_hs = modeling(X_hs, y_hs)

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      2599
           1       0.87      0.65      0.75       860

    accuracy                           0.89      3459
   macro avg       0.88      0.81      0.84      3459
weighted avg       0.89      0.89      0.88      3459



In [66]:
metrics = {'Accuracy' : [accuracy_first_model_simple, accuracy_first_model_us, accuracy_first_model_os, accuracy_first_model_hs,
                        accuracy_second_model_simple, accuracy_second_model_us, accuracy_second_model_os, accuracy_second_model_hs],
           
           'Precision' : [precision_first_model_simple, precision_first_model_us, precision_first_model_os, precision_first_model_hs,
                         precision_second_model_simple, precision_second_model_us, precision_second_model_os, precision_second_model_hs],
           
           'Recall' : [recall_first_model_simple, recall_first_model_us, recall_first_model_os, recall_first_model_hs,
                      recall_second_model_simple, recall_second_model_us, recall_second_model_os, recall_second_model_hs],
           
           'F1 Score' : [f1_first_model_simple, f1_first_model_us, f1_first_model_os, f1_first_model_hs,
                        f1_second_model_simple, f1_second_model_us, f1_second_model_os, f1_second_model_hs],
           
           'ROC AUC' : [roc_auc_first_model_simple, roc_auc_first_model_us, roc_auc_first_model_os, roc_auc_first_model_hs,
                       roc_auc_second_model_simple, roc_auc_second_model_us, roc_auc_second_model_os, roc_auc_second_model_hs]}

# Créer un DataFrame pour afficher les résultats
metrics_df = pd.DataFrame(metrics, index = ['First Model Simple', 'First Model Under Sampling', 'First Model Over Sampling', 'First Model Hybrid Sampling',
                                           'Second Model Simple', 'Second Model Under Sampling', 'Second Model Over Sampling', 'Second Model Hybrid Sampling'])
#metrics_df = metrics_df.apply(lambda x: f'{x*100}%', axis = 1)
# Afficher les résultats
display(metrics_df)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
First Model Simple,0.883003,0.0,0.0,0.0,0.612017
First Model Under Sampling,0.620198,0.578767,0.771689,0.661448,0.61751
First Model Over Sampling,0.63666,0.601549,0.81319,0.691539,0.6418
First Model Hybrid Sampling,0.978099,0.142857,0.019608,0.034483,0.875468
Second Model Simple,0.895817,0.594937,0.110329,0.186139,0.562694
Second Model Under Sampling,0.588301,0.635983,0.352668,0.453731,0.570756
Second Model Over Sampling,0.579346,0.642784,0.35855,0.460326,0.532083
Second Model Hybrid Sampling,0.890142,0.873832,0.652326,0.747004,0.548277


In [29]:
print('Our best model is the second one, using an hybrid sampling to equilibrate the outcome variable')

Our best model is the second one, using an hybrid sampling to equilibrate the outcome variable
