In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,mutual_info_classif
from sklearn.feature_selection import f_classif

In [None]:
# Folder that holds the Telco churn CSV files. A raw string keeps every
# backslash literal: in a normal string "\M", "\C", "\d" and "\T" are invalid
# escape sequences, which newer Python versions report as a SyntaxWarning.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the project before sharing the notebook.
DATA_DIR = r"D:\Machine Learning Projects\Customer Churn Prediction\dataset"

# Original dataset (kept unchanged in `df`).
df = pd.read_csv(rf"{DATA_DIR}\Telco-Customer-Churn.csv")
# NOTE: with IPython's default settings only the last expression of a cell is
# rendered, so this preview is not shown.
df.head()

# Cleaned dataset (per its file name); this is the frame the rest of the
# notebook transforms.
df_clean = pd.read_csv(rf"{DATA_DIR}\Telco-Customer-Churn-Cleaned.csv")
df_clean.head()

In [None]:
# Two scalers are instantiated; only the min-max scaler is used in this cell.
mms = MinMaxScaler()   # normalization (min-max rescaling)
ss = StandardScaler()  # standardization

# Two-colour palette passed as `cmap` to the heatmaps further down.
colors = ['Orange', "Black"]

# Remove the two "_Group" columns from the cleaned frame (in place).
df_clean.drop(
    columns = ['MonthlyCharges_Group', 'TotalCharges_Group'],
    inplace = True,
)

# Rescale each column independently. The scaler is re-fitted on every pass,
# so after the loop `mms` holds the fit for 'TotalCharges'.
for column in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    df_clean[column] = mms.fit_transform(df_clean[[column]])

df_clean.head()

In [None]:
# Pairwise correlation matrix of every column in the cleaned frame, drawn as
# an annotated heatmap on a wide (20x5) canvas.
corr_fig, corr_ax = plt.subplots(figsize = (20, 5))
correlation_matrix = df_clean.corr()
sns.heatmap(correlation_matrix, cmap = colors, annot = True, ax = corr_ax);

In [None]:
# Correlation of every column with the 'Churn' column, strongest positive
# first, stored as a one-column frame so it can be drawn as a heatmap.
churn_correlation = df_clean.corrwith(df_clean['Churn'])
corr = churn_correlation.sort_values(ascending = False).to_frame(name = 'Correlations')

churn_fig, churn_ax = plt.subplots(figsize = (5, 5))
sns.heatmap(
    corr,
    annot = True,
    cmap = colors,
    linewidths = 0.4,
    linecolor = 'black',
    ax = churn_ax,
);
churn_ax.set_title('Correlation w.r.t Outcome');

In [None]:
# Split the columns of the cleaned frame into "numerical" and "categorical"
# by cardinality: more than 6 distinct values counts as numerical, anything
# else as categorical.
#
# The distinct-value counts are taken from `df_clean` itself, the frame the
# column names come from. Counting on the raw `df` would raise a KeyError for
# any column that exists only in the cleaned frame, and could disagree with
# the cleaned data. `dropna=False` counts NaN as a value, exactly like
# `len(series.unique())` does.
col = list(df_clean.columns)
unique_counts = df_clean.nunique(dropna = False)

numerical_features = [name for name in col if unique_counts[name] > 6]
categorical_features = [name for name in col if unique_counts[name] <= 6]

print("Categorical Features :", *categorical_features)
print("Numerical Features :" , *numerical_features)

In [None]:
# Chi-squared score of every categorical feature against the 'Churn' target.
#
# The cardinality split above also puts 'Churn' itself in
# `categorical_features` whenever it has 6 or fewer distinct values. Scoring
# the target against itself is meaningless and would swamp the colour scale,
# so it is left out of the feature matrix here.
chi2_columns = [name for name in categorical_features if name != 'Churn']
features = df_clean.loc[:, chi2_columns]
target = df_clean.loc[:, 'Churn']

# k='all' keeps every feature: the selector is used only to compute scores.
best_features = SelectKBest(score_func = chi2, k = 'all')
fit = best_features.fit(features, target)

featureScores = pd.DataFrame(
    data = fit.scores_,
    index = list(features.columns),
    columns = ['Chi Squared Score'],
)

plt.subplots(figsize = (5, 5))
sns.heatmap(
    featureScores.sort_values(ascending = False, by = 'Chi Squared Score'),
    annot = True,
    cmap = colors,
    linewidths = 0.4,
    linecolor = 'black',
    fmt = '.2f',
);
plt.title('Selection of Categorical Features');

In [None]:
# ANOVA F-score of every numerical feature against the 'Churn' target.
features = df_clean.loc[:, numerical_features]
target = df_clean.loc[:, 'Churn']

# k='all' keeps every feature: the selector is used only to compute scores.
best_features = SelectKBest(score_func = f_classif, k = 'all')
fit = best_features.fit(features, target)

featureScores = pd.DataFrame(
    {'ANOVA Score': fit.scores_},
    index = features.columns.tolist(),
)
ranked_scores = featureScores.sort_values(by = 'ANOVA Score', ascending = False)

anova_fig, anova_ax = plt.subplots(figsize = (5, 5))
sns.heatmap(
    ranked_scores,
    annot = True,
    cmap = colors,
    linewidths = 0.4,
    linecolor = 'black',
    fmt = '.2f',
    ax = anova_ax,
);
anova_ax.set_title('Selection of Numerical Features');

In [None]:
# Columns removed from the cleaned frame before resampling.
# NOTE(review): presumably chosen from the scores plotted above; confirm.
# The drop is in place, so re-running this cell without reloading the data
# raises a KeyError.
columns_to_drop = [
    'PhoneService',
    'gender',
    'StreamingTV',
    'StreamingMovies',
    'MultipleLines',
    'InternetService',
]
df_clean.drop(columns = columns_to_drop, inplace = True)
df_clean.head()

In [None]:
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Oversample with SMOTE at a sampling_strategy of 1.
# A fixed random_state makes the synthetic samples, and everything derived
# from them, identical on every Restart & Run All.
over = SMOTE(sampling_strategy = 1, random_state = 42)

# Positional split: the first 13 columns are the features and column 13 is
# the target.
# NOTE(review): presumably column 13 is 'Churn' after the drops above; confirm
# the column order, or select the target by name.
f1 = df_clean.iloc[:, :13].values
t1 = df_clean.iloc[:, 13].values

f1, t1 = over.fit_resample(f1, t1)

# Class counts after resampling.
Counter(t1)

In [None]:
# Final look at the cleaned frame. `info()` prints its summary directly, so it
# goes first; `head()` must be the last expression of the cell to be rendered
# (placed before `info()` its preview was silently discarded).
df_clean.info()
df_clean.head()

In [None]:
# Progress summary printed at the end of the notebook.
print("\n📝 NEXT STEPS:")
print("=" * 15)
for completed_step in (
    "1. ✅ Data Loading & Exploration & Cleaning - COMPLETED",
    "2. ✅ Exploratory Data Analysis (EDA) - COMPLETED",
    "3. ✅ Feature Engineering - COMPLETED",
):
    print(completed_step)

# Write both frames to the current working directory for the next notebook.
# NOTE(review): these file names match the names of the files read at the top
# of the notebook; if the working directory is the dataset folder, the inputs
# are overwritten with the transformed data. Confirm this is intended.
print("\n💾 Saving processed data for next notebook...")
for frame, filename in (
    (df, 'Telco-Customer-Churn.csv'),
    (df_clean, 'Telco-Customer-Churn-Cleaned.csv'),
):
    frame.to_csv(filename, index=False)
    print(f"✅ Data saved to '{filename}'")