In [61]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [63]:
DATA_PATH = "../Data/Dataset_ATS_v2.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,25,Yes
1,Male,0,No,41,Yes,No,DSL,One year,25,No
2,Female,0,Yes,52,Yes,No,DSL,Month-to-month,19,No
3,Female,0,No,1,Yes,No,DSL,One year,76,Yes
4,Male,0,No,67,Yes,No,Fiber optic,Month-to-month,51,No


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   gender           7043 non-null   object
 1   SeniorCitizen    7043 non-null   int64 
 2   Dependents       7043 non-null   object
 3   tenure           7043 non-null   int64 
 4   PhoneService     7043 non-null   object
 5   MultipleLines    7043 non-null   object
 6   InternetService  7043 non-null   object
 7   Contract         7043 non-null   object
 8   MonthlyCharges   7043 non-null   int64 
 9   Churn            7043 non-null   object
dtypes: int64(3), object(7)
memory usage: 550.4+ KB


In [65]:
df.describe(include="all")

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
count,7043,7043.0,7043,7043.0,7043,7043,7043,7043,7043.0,7043
unique,2,,2,,2,2,2,3,,2
top,Male,,No,,Yes,No,DSL,Month-to-month,,No
freq,3555,,4933,,6361,4072,3947,3875,,5174
mean,,0.162147,,32.371149,,,,,64.758768,
std,,0.368612,,24.559481,,,,,30.09165,
min,,0.0,,0.0,,,,,18.0,
25%,,0.0,,9.0,,,,,36.0,
50%,,0.0,,29.0,,,,,70.0,
75%,,0.0,,55.0,,,,,90.0,


In [66]:
df.isnull().sum()

gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
dtype: int64

In [67]:
# Replace empty strings with NaN
df.replace(" ", np.nan, inplace=True)

# Drop rows where target is missing
df.dropna(subset=["Churn"], inplace=True)

# Fill numeric columns with median
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
dtype: int64

In [68]:
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,0,0,0,1,0,0,0,0,25,1
1,1,0,0,41,1,0,0,1,25,0
2,0,0,1,52,1,0,0,0,19,0
3,0,0,0,1,1,0,0,1,76,1
4,1,0,0,67,1,0,1,0,51,0


In [69]:
scaler = StandardScaler()

X = df.drop(columns=["Churn"])
y = df["Churn"]

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

X_scaled.head()

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges
0,-1.009559,-0.439916,-0.654012,-1.277445,-3.05401,-0.854176,-0.88566,-0.828207,-1.32135
1,0.990532,-0.439916,-0.654012,0.35137,0.327438,-0.854176,-0.88566,0.371271,-1.32135
2,-1.009559,-0.439916,1.529024,0.799294,0.327438,-0.854176,-0.88566,-0.828207,-1.520755
3,-1.009559,-0.439916,-0.654012,-1.277445,0.327438,-0.854176,-0.88566,0.371271,0.373593
4,0.990532,-0.439916,-0.654012,1.410099,0.327438,-0.854176,1.129102,-0.828207,-0.457261


In [70]:
print("Shape of full dataset:", df.shape)
print("Shape of scaled features:", X_scaled.shape)
print("Target distribution:\n", y.value_counts())

Shape of full dataset: (7043, 10)
Shape of scaled features: (7043, 9)
Target distribution:
 Churn
0    5174
1    1869
Name: count, dtype: int64


In [71]:
df.to_csv("../Data/cleaned_dataset.csv", index=False)
X_scaled.to_csv("../Data/X_scaled.csv", index=False)
y.to_csv("../Data/y.csv", index=False)