## 1.Importing the dependencies

In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [37]:
df = pd.read_csv("..\data\processed.csv")

## 2.Train-Test-Split

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [39]:
df = df.convert_dtypes()  # Converts object columns to string dtype automatically

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   string 
 1   SeniorCitizen     7043 non-null   Int64  
 2   Partner           7043 non-null   string 
 3   Dependents        7043 non-null   string 
 4   tenure            7043 non-null   Int64  
 5   PhoneService      7043 non-null   string 
 6   MultipleLines     7043 non-null   string 
 7   InternetService   7043 non-null   string 
 8   OnlineSecurity    7043 non-null   string 
 9   OnlineBackup      7043 non-null   string 
 10  DeviceProtection  7043 non-null   string 
 11  TechSupport       7043 non-null   string 
 12  StreamingTV       7043 non-null   string 
 13  StreamingMovies   7043 non-null   string 
 14  Contract          7043 non-null   string 
 15  PaperlessBilling  7043 non-null   string 
 16  PaymentMethod     7043 non-null   string 


In [41]:
df["Churn"] = df["Churn"].replace({"Yes":1,"No":0})
#Pandas “downcasts” the column type automatically (e.g., from object to int64) when we replace string to int 

  df["Churn"] = df["Churn"].replace({"Yes":1,"No":0})


In [42]:
X = df.drop(columns =["Churn"])
y = df["Churn"]

In [43]:
# split training and test data 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## 3.Label Encoding

In [44]:
numerical_feature_list = ["tenure","MonthlyCharges","TotalCharges"]
categorical_feature_list = []
for col in X_train.columns:
    if col not in numerical_feature_list:
        categorical_feature_list.append(col)


In [45]:
print(categorical_feature_list)

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


### Label encoding of categorical column 

In [46]:
#Initialize a dictionary to save the encoders
encoders={}

# apply label encoding and store the encoders
for column in categorical_feature_list:
    if column == "SeniorCitizen":
        continue
    label_encoder = LabelEncoder()
    X_train[column] = label_encoder.fit_transform(X_train[column])
    X_test[column] = label_encoder.transform(X_test[column])
    encoders[column] = label_encoder

#save the encoders to a pickle file 
with open("../data/encoders.pkl","wb") as f:
    pickle.dump(encoders,f)

In [47]:
encoders

{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder()}

In [48]:
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2142,0,0,0,1,21,1,0,0,2,0,2,0,0,2,1,0,3,64.85,1336.8
1623,0,0,0,0,54,1,2,1,0,2,0,0,2,2,2,1,0,97.2,5129.45
6074,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,2,23.45,23.45
1362,1,0,0,0,4,1,0,1,0,0,0,0,0,0,0,1,2,70.2,237.95
6754,1,0,0,1,0,1,2,0,2,2,0,2,0,0,2,1,0,61.9,0.0


## 3.Balancing the Data

In [49]:
print(y_train.value_counts())

Churn
0    4138
1    1496
Name: count, dtype: int64


In [53]:
X_train = X_train.astype(float)

**Insights**

1.Fewer customers churned compared to those who stayed, indicating class imbalance

2.Use SMOTE for balacing dataset

In [54]:
smote = SMOTE(random_state = 42)

In [55]:
X_train_smote , y_train_smote = smote.fit_resample(X_train,y_train)

In [56]:
print(y_train_smote.value_counts())

Churn
0    4138
1    4138
Name: count, dtype: int64


In [59]:
X_train_smote

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0.000000,0.000000,0.000000,1.0,21.000000,1.000000,0.000000,0.000000,2.000000,0.000000,2.000000,0.000000,0.000000,2.000000,1.000000,0.000000,3.000000,64.850000,1336.800000
1,0.000000,0.000000,0.000000,0.0,54.000000,1.000000,2.000000,1.000000,0.000000,2.000000,0.000000,0.000000,2.000000,2.000000,2.000000,1.000000,0.000000,97.200000,5129.450000
2,1.000000,0.000000,1.000000,0.0,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,2.000000,23.450000,23.450000
3,1.000000,0.000000,0.000000,0.0,4.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,2.000000,70.200000,237.950000
4,1.000000,0.000000,0.000000,1.0,0.000000,1.000000,2.000000,0.000000,2.000000,2.000000,0.000000,2.000000,0.000000,0.000000,2.000000,1.000000,0.000000,61.900000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8271,0.000000,0.374670,0.000000,0.0,5.625330,1.000000,0.749340,1.000000,0.000000,0.000000,0.000000,0.000000,2.000000,1.250660,0.000000,1.000000,1.625330,88.121375,451.834304
8272,0.000000,0.252922,0.747078,0.0,44.470783,1.000000,0.505843,1.000000,0.505843,0.505843,1.494157,0.000000,0.505843,2.000000,0.000000,1.000000,1.494157,90.725171,4048.457431
8273,0.781286,0.218714,0.000000,0.0,35.749711,1.000000,2.000000,1.000000,0.000000,1.562572,1.562572,0.000000,2.000000,1.562572,0.000000,1.000000,2.000000,101.603909,3547.522160
8274,0.940057,0.000000,0.000000,0.0,44.520454,0.940057,0.059943,1.880113,0.940057,0.940057,0.940057,1.059943,0.940057,0.940057,0.940057,0.940057,1.940057,20.891535,933.033556


In [61]:
datasets = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "X_train_smote": X_train_smote,
    "y_train_smote": y_train_smote,
}

for name, data in datasets.items():
    with open(f"../data/{name}.pkl", "wb") as f:
        pickle.dump(data, f)