In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [3]:
# Read the Telco customer CSV (path is relative to the notebook's working directory)
data = pd.read_csv("Telco-Customer.csv")

In [4]:
# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Preview the last five rows (tail() defaults to n=5)
data.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
# Inspect the dtype of every column. Note that 'TotalCharges' is reported as
# object (text) rather than a numeric dtype, so it has to be converted before
# any arithmetic or numeric imputation is applied to it.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

1: DATA CLEANING

In [15]:
# Handle missing values
# 'TotalCharges' is loaded as text (object dtype), so blank or whitespace-only
# entries are ordinary strings and isnull() does not flag them. Coerce the
# column to numeric first so every unparseable entry becomes NaN and is
# covered by the rule below.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Impute missing 'TotalCharges' with calculated values: MonthlyCharges * tenure
missing_total_charges = data['TotalCharges'].isnull()
data.loc[missing_total_charges, 'TotalCharges'] = (
    data.loc[missing_total_charges, 'MonthlyCharges'] * data.loc[missing_total_charges, 'tenure']
)


In [16]:
# Charge columns that must end up as clean numeric data
num_features = ['MonthlyCharges', 'TotalCharges']

# Force each charge column to a numeric dtype; anything that cannot be parsed
# (empty strings, spaces) is turned into NaN
for col in num_features:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Fill whatever is still missing with the per-column median
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(data[num_features])
data[num_features] = num_imputer.transform(data[num_features])


2: DATA TRANSFORMATION

In [17]:
# Encode categorical variables
# One-hot encode the contract/service columns. drop='first' omits one level
# per feature, so the remaining indicator columns are not perfectly collinear.
categorical_features = ['Contract', 'InternetService', 'PhoneService']
encoder = OneHotEncoder(sparse_output=False, drop='first')
categorical_encoded = encoder.fit_transform(data[categorical_features])
encoded_columns = encoder.get_feature_names_out(categorical_features)

# Build the encoded frame on data's own index: concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever data's index is not the default range.
categorical_data = pd.DataFrame(
    categorical_encoded, columns=encoded_columns, index=data.index
)

# Replace the original text columns with their indicator columns
data = pd.concat([data.drop(columns=categorical_features), categorical_data], axis=1)

In [18]:
# Transform 'tenure'
# Bucket tenure into four groups: up to 12, 13-24, 25-48 and more than 48.
# include_lowest=True closes the first bin on the left so that tenure == 0 is
# labelled 'New Customers'; with the default half-open (0, 12] bin those rows
# would fall outside every interval and become NaN.
data['Tenure-Group'] = pd.cut(
    data['tenure'],
    bins=[0, 12, 24, 48, float('inf')],
    labels=['New Customers', 'Short-term', 'Medium-term', 'Long-term'],
    include_lowest=True
)

# Expand the group into indicator columns; drop_first=True makes
# 'New Customers' the implicit baseline (all indicators zero).
data = pd.get_dummies(data, columns=['Tenure-Group'], drop_first=True)

3: DATA INTEGRATION

In [19]:
# Example: Merge with additional dataset (if provided)
# internet_plans = pd.read_csv("internet_plans.csv")
# data2 = pd.merge(data, internet_plans, on="customerID", how="left")

# # Handle missing service data in the merged dataset
# data2.fillna("Unknown", inplace=True)

4: DATA REDUCTION

In [20]:
# Remove the 'customerID' identifier column; it carries no predictive signal
data = data.drop(columns=['customerID'])

In [21]:
# Feature selection: keep the 10 features with the highest chi-squared score
# against the 'Churn' label (SelectKBest + chi2).
X = data.drop(columns=['Churn'])
y = data['Churn']

# chi2 needs numeric input, so integer-encode the remaining text columns.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so every
# mapping stays available for inspection or inverse_transform. Refitting one
# shared encoder would keep only the last column's classes.
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

selector = SelectKBest(chi2, k=10)
X_selected = selector.fit_transform(X, y)

# get_support() is a boolean mask over X's columns marking the kept features
selected_columns = X.columns[selector.get_support()]

# Rebuild `data` from the selected features plus the untouched label column
data = pd.DataFrame(X_selected, columns=selected_columns)
data['Churn'] = y.values

5: DATA IMBALANCE

In [22]:
# Separate the label from the predictors, then let SMOTE synthesize extra
# samples so the 'Churn' classes are balanced (seeded for reproducibility)
y = data['Churn']
X = data.drop(columns=['Churn'])
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [23]:
# Standardize the resampled features: learn the scaling statistics from
# X_resampled, then apply them to that same matrix
scaler = StandardScaler().fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)

In [24]:
# Split the data into train and test sets
# The split is taken from the original rows (X, y) rather than from the
# oversampled, scaled matrix. Splitting after SMOTE/scaling puts synthetic
# points interpolated from test rows, and scaling statistics computed on test
# rows, into the training data, which inflates test scores.
# stratify=y keeps the real churn ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance and standardize using the training portion only. The test set keeps
# its true class distribution and is transformed with the train-fitted scaler.
# (`smote` and `scaler` are the objects created in the cells above; calling
# fit again simply refits them on the training rows.)
X_train, y_train = smote.fit_resample(X_train, y_train)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
