In [323]:
import pandas as pd
import numpy as np

In [324]:
# afficher les résultats en entier
pd.set_option('display.max_row',100)
pd.set_option('display.max_column', 50)

## Nettoyage des données et analyse descriptive

In [325]:
# Chargement du fichier csv
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [326]:
# Visualisation des variables, des valeurs nulles et des types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [327]:
# supprimer les lignes avec la variable "TotalCharges" nulle
df.dropna(subset=["TotalCharges"], inplace=True)

In [328]:
# détection des valeurs manquantes
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [329]:
# Visualisation des valeurs de chaque variable
for field in df:
    print(f"{field}:")
    print(df[f"{field}"].unique())

customerID:
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender:
['Female' 'Male']
SeniorCitizen:
[0 1]
Partner:
['Yes' 'No']
Dependents:
['No' 'Yes']
tenure:
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService:
['No' 'Yes']
MultipleLines:
['No phone service' 'No' 'Yes']
InternetService:
['DSL' 'Fiber optic' 'No']
OnlineSecurity:
['No' 'Yes' 'No internet service']
OnlineBackup:
['Yes' 'No' 'No internet service']
DeviceProtection:
['No' 'Yes' 'No internet service']
TechSupport:
['No' 'Yes' 'No internet service']
StreamingTV:
['No' 'Yes' 'No internet service']
StreamingMovies:
['No' 'Yes' 'No internet service']
Contract:
['Month-to-month' 'One year' 'Two year']
PaperlessBilling:
['Yes' 'No']
PaymentMethod:
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 '

In [330]:
# Suppression de la colonne "customerID" car non nécessaire
df.drop(columns=["customerID"], inplace=True)

In [331]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 
 17  

## Pré-traitement des données

In [332]:
# import des librairies pour le feature engineering
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [333]:
# Encodage de la variable 'Churn' (future target), avec 'No' = 0 et 'Yes'= 1, conversion de type en 'int'
#df['Churn'] = df['Churn'].map({"No": 0, "Yes": 1})
# df.loc[df['Churn'].isin(["No"]), 'Churn'] = 0
# df.loc[df['Churn'].isin(["Yes"]), 'Churn'] = 1
# df['Churn'] = df['Churn'].astype(int)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [334]:
df['Churn'].value_counts(normalize=True)

Churn
0    0.734215
1    0.265785
Name: proportion, dtype: float64

In [335]:
# Identification des variables numériques et catégorielles de la variable X pour le pré-traitement
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# suppression de la variable "Churn" de numerical_features car ce n'est pas une feature
numerical_features = numerical_features[:3]
numerical_features

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')

In [336]:
print(categorical_features)
print()
print(numerical_features)

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')


In [337]:
# Encodage des colonnes catégorielles avec LabelEncoder
for col in categorical_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

In [338]:
# Standardisation des colonnes numériques avec StandardScaler
standard_scaler = StandardScaler()
df[numerical_features] = standard_scaler.fit_transform(df[numerical_features])

In [339]:
df.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
158,1,-0.440327,1,1,-0.669089,1,0,0,2,0,2,0,2,0,2,0,3,-0.018223,1024.0,0
3754,1,-0.440327,0,0,-1.280248,1,0,2,1,1,1,1,1,1,0,0,3,-1.48745,20.05,0
5231,1,2.271039,1,0,-0.669089,1,2,1,0,0,2,0,2,2,0,1,2,1.128573,1587.55,1
6819,1,2.271039,0,0,0.919926,1,2,1,2,0,0,0,2,2,0,1,2,1.130235,5617.75,1
5119,0,-0.440327,0,0,-1.117272,1,0,1,0,0,0,2,0,0,0,1,3,0.360718,399.45,0


## Implémentation du modèle de deep learning

In [340]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

In [341]:
# Séparation des features (X) de la target (y)

# Filtrage des colonnes features
features = df.iloc[:, 0:-1]
# conversion de la variable features en array numpy (X) pour la manipulation avec Pytorch
X = features.to_numpy()

# Filtrage de la colonne target
target = df.iloc[:, -1]
# conversion de la variable target en array numpy (y) pour la manipulation avec Pytorch
y = target.to_numpy()

In [342]:
# Visualisation des variables créées
print("features : ", features.shape)
print("X : ", X.shape)
print("target : ", target.shape)
print("y : ", y.shape)

features :  (7032, 19)
X :  (7032, 19)
target :  (7032,)
y :  (7032,)


In [343]:
# Train / Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [344]:
# Vérification
print("X_train : ", X_train.shape)
print("y_train : ", y_train.shape)
print("X_test : ", X_test.shape)
print("y_test : ", y_test.shape)

X_train :  (5625, 19)
y_train :  (5625,)
X_test :  (1407, 19)
y_test :  (1407,)


In [345]:
# Conversion des variables X_train et y_train en Tensor (permettant la manipulation avec PyTorch)
dataset_train = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))

# Conversion des variables X_test et y_test en Tensor (permettant la manipulation avec PyTorch)
dataset_test = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))

In [346]:
print("dataset_train : ", dataset_train)
print("dataset_test : ", dataset_test)

dataset_train :  <torch.utils.data.dataset.TensorDataset object at 0x0000029FE4FF93A0>
dataset_test :  <torch.utils.data.dataset.TensorDataset object at 0x0000029FE1C293D0>


In [347]:
# Chargement des données avec DataLoader

# nombre d'échantillons inclus à chaque itération
batch_size = 32
# randomisation des données lors des itérations
shuffle = True

# Création des DataLoaders d'entrainement et de test
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=shuffle)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size)

In [None]:
# Conception du modèle de réseaux de neurones
class ChurnModel(nn.Module):
    def __init__(self, input_dim):
        super(ChurnModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

model = ChurnModel(input_dim=X.shape[1])

In [352]:
# Entrainement du modèle

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 100
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in dataloader_train:
        optimizer.zero_grad()

        y_batch = y_batch.float()
        outputs = model(X_batch).view(-1)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}")

Epoch 1/100, Loss: 100.5938
Epoch 2/100, Loss: 100.6849
Epoch 3/100, Loss: 100.1459
Epoch 4/100, Loss: 99.0366
Epoch 5/100, Loss: 99.3744
Epoch 6/100, Loss: 98.4211
Epoch 7/100, Loss: 98.3987
Epoch 8/100, Loss: 96.6351
Epoch 9/100, Loss: 95.4372
Epoch 10/100, Loss: 93.8139
Epoch 11/100, Loss: 92.5172
Epoch 12/100, Loss: 91.3028
Epoch 13/100, Loss: 90.0229
Epoch 14/100, Loss: 89.6143
Epoch 15/100, Loss: 90.1965
Epoch 16/100, Loss: 88.2259
Epoch 17/100, Loss: 87.4939
Epoch 18/100, Loss: 86.2510
Epoch 19/100, Loss: 84.5123
Epoch 20/100, Loss: 84.0996
Epoch 21/100, Loss: 84.3595
Epoch 22/100, Loss: 82.6695
Epoch 23/100, Loss: 82.7692
Epoch 24/100, Loss: 81.7379
Epoch 25/100, Loss: 82.1747
Epoch 26/100, Loss: 80.4416
Epoch 27/100, Loss: 79.8318
Epoch 28/100, Loss: 79.8604
Epoch 29/100, Loss: 79.8298
Epoch 30/100, Loss: 80.0217
Epoch 31/100, Loss: 79.9551
Epoch 32/100, Loss: 79.6167
Epoch 33/100, Loss: 79.3161
Epoch 34/100, Loss: 79.1200
Epoch 35/100, Loss: 78.5106
Epoch 36/100, Loss: 79.350

## Evaluation du modèle

In [350]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [356]:
# Evaluation du modèle
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in dataloader_test:
        outputs = model(X_batch).view(-1)
        preds = (outputs >= 0.5).int()
        y_pred.extend(preds.numpy())
        y_true.extend(y_batch.numpy())

acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)



In [357]:
# Visualisation
print(classification_report(y_true, y_pred))
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

         0.0       0.83      0.88      0.85      1033
         1.0       0.60      0.49      0.54       374

    accuracy                           0.78      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407

Confusion Matrix:
[[913 120]
 [192 182]]
