In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd #To process the dataset

In [11]:
#Load the dataset from the repository
df = pd.read_csv("https://raw.githubusercontent.com/Ganindu-Deshapriya/Customer_Churn_Prediction_Group_06/refs/heads/main/Customer_Churn_Prediction_Group_06/dataset/churn_data.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
# Check for duplicate rows and remove, keeping the first occurrence
print(df.duplicated().sum())

0


In [13]:
# Check for the missing values
missing_values_count = df.isnull().sum()
print(missing_values_count)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [14]:
# create a copy of the dataset
df_copy = df.copy()

In [15]:
numerical_cols = df.select_dtypes(include=['float64']).columns
print(numerical_cols)
categorical_cols = df.select_dtypes(include=['object']).columns
print(categorical_cols)

Index(['MonthlyCharges'], dtype='object')
Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges',
       'Churn'],
      dtype='object')


In [16]:
df_copy = df_copy.drop(['customerID'], axis = 1)

# Map binary variables
df_copy['Churn'] = df_copy['Churn'].map({'Yes': 1, 'No': 0})

# Map multiclass in One-hot encoding
df_copy = pd.get_dummies(df_copy, drop_first=True)

df_copy.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,1,29.85,0,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,0,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2,53.85,1,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,45,42.3,0,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2,70.7,1,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
X = df_copy.drop('Churn', axis=1) #Add the features to X set
y = df_copy['Churn'] #Add churn results to Y set

#80-20 data split with randomness seed set to 33
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33 
)

In [18]:
#Export datasets Cleaning training and testing
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)