In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [6]:
########################
# NETTOYAGE DES DONNEES
########################

pd.set_option('display.max_columns', 500)

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.columns = df.columns.str.replace(" ", "", regex=False)

print("shape : ", df.shape)

print(df.dtypes)

print(df.select_dtypes(include="object").head())

#  colonne TotalCharges : suppression des espaces, remplacement des vides par des NaN, conversion et type float
df["TotalCharges"] = df["TotalCharges"].str.strip().replace(",", ".", regex=True).replace("", np.nan).astype(float)
df["TotalCharges"].isna().sum()
df.dropna(inplace=True)

print("shape : ", df.shape)

boolean_cols = []
cat_cols = []
numeric_cols = df.select_dtypes(include="number").columns.tolist()
target = "Churn"


# colonnes objet : 
for c in df.select_dtypes(include="object").columns :
    df[c] = df[c].str.strip().replace("", np.nan)
    print(f"# {c} \n valeurs uniques : {len(df[c].unique())}") 
    if len(df[c].unique()) == 2 : 
        boolean_cols.append(c)
    if len(df[c].unique()) > 2 : 
        cat_cols.append(c)
    print(f"nombre de NaN : {df[c].isna().sum()}")
    print(f"valeurs uniques : {df[c].unique()}")
    print()


# colonnes numériques :
for c in df.select_dtypes(exclude="object").columns :
    print(f"# {c} \n valeurs uniques : {len(df[c].unique())}") 
    if len(df[c].unique()) == 2 : 
        boolean_cols.append(c)
    print(f"nombre de NaN : {df[c].isna().sum()}")
    print()    

boolean_cols.remove("Churn")

print("#"*50)
print("colonnes yes no : ", boolean_cols)
print("colonnes numeriques : ", numeric_cols)
print("colonnes 3 valeurs minimum :", cat_cols)
print("target :", target)

shape :  (7043, 21)
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
   customerID  gender Partner Dependents PhoneService     MultipleLines  \
0  7590-VHVEG  Female     Yes         No           No  No phone service   
1  5575-GNVDE    Male      No         No          Yes                No   
2  3668-QPYBK    Male      No         No          Yes                No   
3  7795-CFOCW    Male      No         No           No  No phone service   
4  

In [None]:
#########
#ENCODAGE
#########

X = df.drop(target, axis = 1)
y = df[target]

encoder_bool = OneHotEncoder(drop='first')
bool_encod = encoder_bool.fit_transform(X[boolean_cols])
df_bool_encod = pd.DataFrame(
    bool_encod.toarray(),
    columns=encoder_bool.get_feature_names_out(boolean_cols),
    index=X.index
)

encoder_cat = OneHotEncoder(drop='first')
cat_encod = encoder_cat.fit_transform(X[cat_cols])
df_cat_encod = pd.DataFrame(
    cat_encod.toarray(), 
    columns=encoder_cat.get_feature_names_out(cat_cols), 
    index=X.index
)

label_enc = LabelEncoder()
y = label_enc.fit_transform(y)

X = X.drop(boolean_cols, axis=1)
X = X.drop(cat_cols, axis=1)


##################
# MISE A L'ECHELLE
##################

standard_scaler = StandardScaler()
standard_scaler.fit_transform(X)

print("X avant mise en échelle")
print(X)
print(X.shape)


##################
# CONCATENATION
##################

X = pd.concat([X, df_bool_encod, df_cat_encod], axis = 1)

print("Après concaténation")
print(X)
print(X.shape)

X avant mise en échelle
      tenure  MonthlyCharges  TotalCharges
0          1           29.85         29.85
1         34           56.95       1889.50
2          2           53.85        108.15
3         45           42.30       1840.75
4          2           70.70        151.65
...      ...             ...           ...
7038      24           84.80       1990.50
7039      72          103.20       7362.90
7040      11           29.60        346.45
7041       4           74.40        306.60
7042      66          105.65       6844.50

[7032 rows x 3 columns]
(7032, 3)
Après concaténation
      tenure  MonthlyCharges  TotalCharges  gender_Male  Partner_Yes  \
0          1           29.85         29.85          0.0          1.0   
1         34           56.95       1889.50          1.0          0.0   
2          2           53.85        108.15          1.0          0.0   
3         45           42.30       1840.75          1.0          0.0   
4          2           70.70        151.65   

In [None]:
# Train / Test
X1, X_test, y1, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

## Pipeline d'entrainement sous Pytorch

In [None]:
import torch

