In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/raw/bank_churn.csv')

print(f"Dataset: {df.shape[0]} lignes, {df.shape[1]} colonnes")
print("\nAperçu:")
df.head()

Dataset: 10000 lignes, 13 colonnes

Aperçu:


Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,Num Of Products,Has Credit Card,Is Active Member,Estimated Salary,Churn
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Afficher toutes les colonnes
print("Colonnes disponibles:")
print(df.columns.tolist())

print("\n" + "="*50)
print("\nTypes de données:")
print(df.dtypes)

Colonnes disponibles:
['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'Num Of Products', 'Has Credit Card', 'Is Active Member', 'Estimated Salary', 'Churn']


Types de données:
CustomerId            int64
Surname              object
CreditScore           int64
Geography            object
Gender               object
Age                   int64
Tenure                int64
Balance             float64
Num Of Products       int64
Has Credit Card       int64
Is Active Member      int64
Estimated Salary    float64
Churn                 int64
dtype: object


In [4]:
# Liste des colonnes à supprimer
columns_to_drop = ['CustomerId', 'Surname']

# Créer une copie du dataframe pour le preprocessing
df_clean = df.drop(columns=columns_to_drop)

print(f"Colonnes supprimées: {columns_to_drop}")
print(f"\nNouvelle shape: {df_clean.shape}")
print(f"\nColonnes restantes: {df_clean.columns.tolist()}")

Colonnes supprimées: ['CustomerId', 'Surname']

Nouvelle shape: (10000, 11)

Colonnes restantes: ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'Num Of Products', 'Has Credit Card', 'Is Active Member', 'Estimated Salary', 'Churn']


In [8]:
# Voir les valeurs uniques des variables catégorielles
print("Valeurs de Geography:")
print(df_clean['Geography'].unique())
print(f"Nombre: {df_clean['Geography'].nunique()}")

print("\n" + "="*50)

print("\nValeurs de Gender:")
print(df_clean['Gender'].unique())
print(f"Nombre: {df_clean['Gender'].nunique()}")

Valeurs de Geography:
['France' 'Spain' 'Germany']
Nombre: 3


Valeurs de Gender:
[1 0]
Nombre: 2


In [6]:
# Label Encoding pour Gender (binaire)
# Male = 0, Female = 1
df_clean['Gender'] = df_clean['Gender'].map({'Male': 0, 'Female': 1})

print("Gender encodé:")
print(df_clean['Gender'].value_counts())
print("\nVérification:")
print(df_clean[['Gender']].head(10))

Gender encodé:
Gender
0    5457
1    4543
Name: count, dtype: int64

Vérification:
   Gender
0       1
1       1
2       1
3       1
4       1
5       0
6       0
7       1
8       0
9       0


In [10]:
# Convertir les colonnes Geography en entiers (0/1)
geography_cols = ['France', 'Germany', 'Spain']

for col in geography_cols:
    df_encoded[col] = df_encoded[col].astype(int)


print("Geography converti en 0/1:")
print(df_encoded[geography_cols].head())

print("\n" + "="*50)
print(f"\nShape finale: {df_encoded.shape}")
print(f"Colonnes: {df_encoded.columns.tolist()}")

KeyError: 'France'

In [12]:
# Encoder Geography
df_encoded = pd.get_dummies(df_encoded, columns=['Geography'], prefix='Geography')

# Convertir les colonnes Geography en int
geography_cols = ['Geography_France', 'Geography_Germany', 'Geography_Spain']

for col in geography_cols:
    df_encoded[col] = df_encoded[col].astype(int)

print(df_encoded.head())


   CreditScore  Gender  Age  Tenure    Balance  Num Of Products  \
0          619       1   42       2       0.00                1   
1          608       1   41       1   83807.86                1   
2          502       1   42       8  159660.80                3   
3          699       1   39       1       0.00                2   
4          850       1   43       2  125510.82                1   

   Has Credit Card  Is Active Member  Estimated Salary  Churn  \
0                1                 1         101348.88      1   
1                0                 1         112542.58      0   
2                1                 0         113931.57      1   
3                0                 0          93826.63      0   
4                1                 1          79084.10      0   

   Geography_France  Geography_Germany  Geography_Spain  
0                 1                  0                0  
1                 0                  0                1  
2                 1             