In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,roc_auc_score


from sklearn.linear_model import LogisticRegression, RidgeClassifier,Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [6]:
df=pd.read_csv(r"C:\Users\yuvra\Downloads\WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [11]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
#Handle Missing Values
#df=df.drop(['customerID'],axis=1)
df['TotalCharges'] =pd.to_numeric(df['TotalCharges'],errors='coerce')
df.dropna(inplace=True)

for col in df.select_dtypes(include=['object']).columns:
    if col !='Churn':
        le=LabelEncoder()
        df[col]=le.fit_transform(df[col])
        
df['Churn']=df['Churn'].map({'No':0,'Yes':1})
    

In [19]:
x=df.drop('Churn',axis=1)
y=df['Churn']

scaler =StandardScaler()
X_scaled=scaler.fit_transform(x)


pca =PCA(n_components=2)
x_pca=pca.fit_transform(X_scaled)

x_train,x_test,y_train,y_Test= train_test_split(X_scaled,y,test_size=0.2,random_state=42)

In [28]:
df.dtypes

gender                int32
SeniorCitizen         int64
Partner               int32
Dependents            int32
tenure                int64
PhoneService          int32
MultipleLines         int32
InternetService       int32
OnlineSecurity        int32
OnlineBackup          int32
DeviceProtection      int32
TechSupport           int32
StreamingTV           int32
StreamingMovies       int32
Contract              int32
PaperlessBilling      int32
PaymentMethod         int32
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [30]:
log_reg =LogisticRegression()
log_reg.fit(x_train,y_train)
y_pred_log=log_reg.predict(x_test)

In [34]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

In [35]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train, y_train)
y_pred_dt = dtree.predict(x_test)


In [36]:
svm = SVC(probability=True)
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)


In [37]:
nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred_nb = nb.predict(x_test)


In [38]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)


In [39]:
bag = BaggingClassifier()
bag.fit(x_train, y_train)
y_pred_bag = bag.predict(x_test)


In [40]:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)


In [41]:
ab = AdaBoostClassifier()
ab.fit(x_train, y_train)
y_pred_ab = ab.predict(x_test)


In [42]:
ridge = RidgeClassifier()
ridge.fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)


In [43]:
lasso = LogisticRegression(penalty='l1', solver='liblinear')
lasso.fit(x_train, y_train)
y_pred_lasso = lasso.predict(x_test)


In [44]:
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)
df['Cluster'] = kmeans.labels_


In [46]:
def evaluate(name, y_true, y_pred):
    print(f"🔍 {name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("="*60)

models = {
    'Logistic Regression': y_pred_log,
    'KNN': y_pred_knn,
    'Decision Tree': y_pred_dt,
    'SVM': y_pred_svm,
    'Naive Bayes': y_pred_nb,
    'Random Forest': y_pred_rf,
    'Bagging': y_pred_bag,
    'Gradient Boosting': y_pred_gb,
    'AdaBoost': y_pred_ab,
    'Ridge': y_pred_ridge,
    'Lasso': y_pred_lasso
}

for name, pred in models.items():
    evaluate(name, y_Test, pred)


🔍 Logistic Regression
Accuracy: 0.7853589196872779
ROC AUC: 0.6926311402850324
[[920 113]
 [189 185]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.62      0.49      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.77      0.79      0.78      1407

🔍 KNN
Accuracy: 0.7391613361762616
ROC AUC: 0.6654337348773884
[[850 183]
 [184 190]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1033
           1       0.51      0.51      0.51       374

    accuracy                           0.74      1407
   macro avg       0.67      0.67      0.67      1407
weighted avg       0.74      0.74      0.74      1407

🔍 Decision Tree
Accuracy: 0.7213930348258707
ROC AUC: 0.6550387998198488
[[823 210]
 [182 192]]
              precision    recall  f1-score   support

       