In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import metrics

In [2]:
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

Evaluation uses relevant metrics and applies (repeated/nested) cross validation appropriately. Hyperparameter tuning is done, and models are clearly compared and interpreted.

# Loading data

In [3]:
# Use one Plotly theme for every figure created in this notebook.
pio.templates.default = "plotly_white"

# Load the Telco customer-churn table; later cells clean and encode `df`.
telco_csv_path = './dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(telco_csv_path)

# Data description

In [4]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Print column dtypes and non-null counts, then display the (rows, columns) shape.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


(7043, 21)

In [6]:
# Dtype of every column. TotalCharges is read as object even though the preview shows amounts.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
# Column names as a NumPy array.
df.columns.values

array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)

In [8]:
# Remove the customerID column and preview the result.
df = df.drop(columns='customerID')
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [1]:
# Environment diagnostic: print NumPy's build configuration (BLAS/LAPACK, compilers, machine).
# NOTE(review): this cell carries execution count 1 in the middle of the notebook, so it was
# run out of order; no later cell uses its output, so it looks like a leftover debug cell.
import numpy as np
np.show_config()

Build Dependencies:
  blas:
    detection method: pkgconfig
    found: true
    include directory: E:/Anaconda3/envs/data/Library/include
    lib directory: E:/Anaconda3/envs/data/Library/lib
    name: blas
    openblas configuration: unknown
    pc file directory: E:/Anaconda3/envs/data/Library/lib/pkgconfig
    version: 3.9.0
  lapack:
    detection method: pkgconfig
    found: true
    include directory: E:/Anaconda3/envs/data/Library/include
    lib directory: E:/Anaconda3/envs/data/Library/lib
    name: lapack
    openblas configuration: unknown
    pc file directory: E:/Anaconda3/envs/data/Library/lib/pkgconfig
    version: 3.9.0
Compilers:
  c:
    commands: cl.exe
    linker: link
    name: msvc
    version: 19.44.35217
  c++:
    commands: cl.exe
    linker: link
    name: msvc
    version: 19.44.35217
  cython:
    commands: cython
    linker: cython
    name: cython
    version: 3.1.5
Machine Information:
  build:
    cpu: x86_64
    endian: little
    family: x86_64
    sys

# Missing values

In [9]:
# TotalCharges was loaded as strings; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors="coerce")
df['TotalCharges'] = total_charges_numeric

# Missing values per column after the conversion.
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [None]:
# Pairwise correlation between the missing-value indicators (1 = missing, 0 = present).
# A column without any missing value has a constant indicator, so its correlations are NaN.
missing_flags = df.isna()
missing_corr = missing_flags.corr()

sns.heatmap(missing_corr, cmap='coolwarm', annot=True)
plt.title('Correlation of Missingness')
plt.show()

In [None]:
# Show the rows whose TotalCharges could not be parsed.
df[df['TotalCharges'].isna()]

The rows with a missing `TotalCharges` are the customers whose `tenure` is 0.

In [None]:
# Remove the customers with zero tenure (modifies `df` in place).
zero_tenure_index = df.loc[df['tenure'] == 0].index
df.drop(index=zero_tenure_index, inplace=True)

Impute any remaining missing `TotalCharges` values with the column mean.

In [None]:
# Fill only TotalCharges with its own mean. A frame-wide fillna would also write the
# TotalCharges mean into missing cells of unrelated (including categorical) columns.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [None]:
# Re-count missing values per column after cleaning.
df.isna().sum()

In [None]:
# Recode the 0/1 SeniorCitizen flag as "No"/"Yes"; the encoding cell later maps it back to 0/1.
senior_labels = {0: "No", 1: "Yes"}
df["SeniorCitizen"] = df["SeniorCitizen"].map(senior_labels)

In [None]:
# Summary of the InternetService column (count, unique, top, freq).
df["InternetService"].describe()

In [None]:
# Continuous features; `numerical_cols` is reused by a later plotting cell.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Summary statistics for the continuous features.
df[numerical_cols].describe()

# Visualization

## Gender and churn distribution

In [None]:
# Two donut charts side by side: gender split (left) and churn split (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Build each pie with plotly express and move its single trace into the subplot grid.
pie_specs = [('gender', 'Gender'), ('Churn', 'Churn')]
for position, (column, pie_title) in enumerate(pie_specs, start=1):
    pie = px.pie(df, names=column, title=pie_title)
    fig.add_trace(pie.data[0], row=1, col=position)

# A hole turns the pies into donuts; the annotations below label each hole.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name", textfont_size=16)

center_labels = [
    {'text': 'Gender', 'x': 0.18, 'y': 0.5, 'font_size': 20, 'showarrow': False},
    {'text': 'Churn', 'x': 0.82, 'y': 0.5, 'font_size': 20, 'showarrow': False},
]
fig.update_layout(
    title_text="Gender and Churn Distributions",
    title_x=0.5,
    annotations=center_labels,
    font={'size': 14},
)

fig.show()

## Churn rate by gender

In [None]:
# Gender counts among customers who did not churn.
df[df["Churn"] == "No"]["gender"].value_counts()

In [None]:
# Create a crosstab
churn_gender = pd.crosstab(df['Churn'], df['gender'], normalize='index') * 100

# Plot
ax = churn_gender.plot(kind='bar', stacked=True, color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_ylabel('Percentage (%)')
ax.set_title('Churn Rate by Gender')
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', label_type='center')
plt.legend(title='Gender')
plt.show()

In [None]:
# Absolute customer counts per churn outcome, split by gender (grouped bars).
churn_gender_counts = pd.crosstab(index=df['Churn'], columns=df['gender'])

ax = churn_gender_counts.plot.bar(color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set(ylabel='Count', title='Number of Customers by Churn and Gender')
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.legend(title='Gender')
plt.show()

## Contract distribution

In [None]:
# Customers per churn outcome, grouped by contract type.
# The title's bold tag is now closed with </b>; it was left open with a second <b>.
fig = px.histogram(df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## Payment Method Distribution

In [None]:
# Take labels and values from the same value_counts() result so each slice carries its own
# label. unique() returns order of first appearance while value_counts() sorts by frequency,
# so pairing the two mislabels the slices.
payment_counts = df['PaymentMethod'].value_counts()
labels = payment_counts.index
values = payment_counts.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

## Payment for Churn

In [None]:
# Customers per churn outcome, coloured by payment method.
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="Customer Payment Method distribution w.r.t. Churn",
)
layout_options = {'width': 700, 'height': 500, 'bargap': 0.1}
fig.update_layout(**layout_options)
fig.show()

## Internet Service and Gender of Churn

In [None]:
# Distinct InternetService categories.
df["InternetService"].unique()

In [None]:
# Counts of each (InternetService, Churn) combination among male customers.
df[df["gender"] == "Male"][["InternetService", "Churn"]].value_counts()

In [None]:
fig = go.Figure()

# Axis categories, in plotting order.
churn_labels = ['Churn:No', 'Churn:Yes']
churn_values = ['No', 'Yes']  # raw Churn values; this cell runs before Churn is encoded to 0/1
genders = ['Female', 'Male']

# Raw InternetService value -> trace name shown in the legend.
# NOTE(review): assumes the no-internet category is stored as 'No' -- confirm against
# df["InternetService"].unique(); a wrong key would silently plot zeros.
service_names = {'DSL': 'DSL', 'Fiber optic': 'Fiber optic', 'No': 'No Internet'}

# Count customers per (service, churn, gender) from the data instead of hard-coding numbers,
# so the chart stays correct when the cleaning steps above change. Reindexing on the full
# product fixes the order and fills absent combinations with 0.
full_index = pd.MultiIndex.from_product(
    [list(service_names), churn_values, genders],
    names=['InternetService', 'Churn', 'gender'],
)
counts = (
    df.groupby(['InternetService', 'Churn', 'gender'])
    .size()
    .reindex(full_index, fill_value=0)
)

# Build x-axis labels: "Churn:No-Female", "Churn:No-Male", "Churn:Yes-Female", "Churn:Yes-Male".
x_labels = [f"{churn}-{gender}" for churn in churn_labels for gender in genders]

# One bar trace per internet service; values follow the same order as x_labels.
for service, trace_name in service_names.items():
    y_values = counts.loc[service].tolist()
    fig.add_trace(go.Bar(
        x=x_labels,
        y=y_values,
        name=trace_name,
        text=y_values,
        textposition='auto'
    ))

fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

fig.show()

## Dependents churn distribution

In [None]:
# Customers per churn outcome, grouped by whether they have dependents.
color_map = dict(Yes="#FF97FF", No="#AB63FA")
fig = px.histogram(
    df,
    x="Churn",
    color="Dependents",
    barmode="group",
    title="<b>Dependents distribution</b>",
    color_discrete_map=color_map,
    text_auto=True,
)
layout_options = {'width': 700, 'height': 500, 'bargap': 0.1}
fig.update_layout(**layout_options)
fig.show()

## Partner Churn

In [None]:
# Customers per churn outcome, grouped by whether they have a partner.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="Partner", barmode="group", title="<b>Churn distribution w.r.t. Partners</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## SeniorCitizen distribution

In [None]:
# Customers per churn outcome, coloured by senior-citizen status.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="SeniorCitizen", title="<b>Churn distribution w.r.t. Senior Citizen</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## Online security churn

In [None]:
# Customers per churn outcome, grouped by online-security subscription.
color_map = dict(Yes="#FF97FF", No="#AB63FA")
fig = px.histogram(
    df,
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    title="<b>Churn w.r.t Online Security</b>",
    color_discrete_map=color_map,
    text_auto=True,
)
layout_options = {'width': 700, 'height': 500, 'bargap': 0.1}
fig.update_layout(**layout_options)
fig.show()

## paperless billing

In [None]:
# Customers per churn outcome, coloured by paperless-billing status.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="PaperlessBilling", title="<b>Churn distribution w.r.t. Paperless Billing</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

## TechSupport distribution

In [None]:
# Customers per churn outcome, grouped by tech-support subscription.
# Title typo fixed: "Chrun" -> "Churn".
fig = px.histogram(df, x="Churn", color="TechSupport", barmode="group",
                   title="<b>Churn distribution w.r.t. TechSupport</b>", text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Customers per churn outcome, coloured by phone-service status.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="PhoneService", title="<b>Churn distribution w.r.t. Phone Service</b>",
                   color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Kernel density of MonthlyCharges for retained (red) vs churned (blue) customers.
sns.set_context("paper", font_scale=1.1)

monthly_retained = df.loc[df["Churn"] == 'No', 'MonthlyCharges']
monthly_churned = df.loc[df["Churn"] == 'Yes', 'MonthlyCharges']

ax = sns.kdeplot(monthly_retained, color="Red", fill=True)
ax = sns.kdeplot(monthly_churned, ax=ax, color="Blue", fill=True)

# Legend entries follow the drawing order: retained first, churned second.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_xlabel('Monthly Charges')
ax.set_ylabel('Density')
ax.set_title('Distribution of monthly charges by churn')

In [None]:
# Kernel density of TotalCharges for retained (red) vs churned (blue) customers.
# This cell was a byte-for-byte copy of the preceding MonthlyCharges cell, so it added no
# information; it now plots the other charge column instead.
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No')],
                 color="Red", fill=True)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes')],
                 ax=ax, color="Blue", fill=True)
# Legend entries follow the drawing order: retained first, churned second.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn')

In [None]:
# Box plot of tenure for each churn outcome.
fig = px.box(df, x='Churn', y='tenure')

# Axis titles for the single subplot.
fig.update_xaxes(title_text='Churn', row=1, col=1)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)

# Figure size and title styling.
layout_options = {
    'autosize': True,
    'width': 750,
    'height': 600,
    'title_font': {'size': 25, 'family': 'Courier'},
    'title': 'Tenure vs Churn',
}
fig.update_layout(**layout_options)

fig.show()

In [None]:
plt.figure(figsize=(25, 10))

# Replace every column by integer codes, then correlate the codes.
# NOTE(review): numeric columns are factorized too, so their codes follow order of first
# appearance rather than magnitude -- confirm this is intended.
factorized = df.apply(lambda column: pd.factorize(column)[0])
corr = factorized.corr()

# Hide the upper triangle (including the diagonal); it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)

# Data preprocessing

In [None]:
# Number of distinct values in every column.
df.nunique()

In [None]:
# Print the distinct values of every column with fewer than five categories.
low_cardinality = {
    name: df[name].dropna().unique()
    for name in df.columns
    if df[name].nunique(dropna=True) < 5
}
for col, uniques in low_cardinality.items():
    print(f"{col}: {list(uniques)}")

## Encoding

In [None]:
# Yes/No columns become 0/1.
# NOTE(review): not idempotent -- re-running this cell maps the already-encoded 0/1 values
# to NaN, because Series.map leaves unmatched values missing.
yes_no_codes = {'No': 0, 'Yes': 1}
binary_cols = ['Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn', 'SeniorCitizen']
df[binary_cols] = df[binary_cols].apply(lambda column: column.map(yes_no_codes))

gender_codes = {'Female': 0, 'Male': 1}
df['gender'] = df['gender'].map(gender_codes)

# Multi-category columns are one-hot encoded.
categorical_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'
]

# drop_first=False keeps every category (no information is dropped); set it to True if
# collinearity between the dummy columns is a concern.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

In [None]:
# Get Churn correlations
churn_corr = df.corr()['Churn'].sort_values(ascending=False)

# Remove 'Churn' itself if you don't want it (it will be 1.0)
churn_corr = churn_corr.drop('Churn')

# Create figure
plt.figure(figsize=(16, 9))

# Use a diverging colormap: red (positive), white (0), blue (negative)
colors = plt.cm.RdYlBu_r((churn_corr + 1) / 2)  # Normalize to [0,1] for colormap

# Plot
bars = plt.bar(churn_corr.index, churn_corr.values, color=colors, edgecolor='black', linewidth=0.5)

# Rotate x-axis labels for readability
plt.xticks(rotation=60, ha='right', fontsize=11)
plt.yticks(fontsize=11)

# Labels and title
plt.ylabel('Correlation with Churn', fontsize=13)
plt.title('Feature Correlation with Churn (Higher = More Likely to Churn)', fontsize=16, weight='bold')

# Add horizontal line at 0
plt.axhline(0, color='gray', linewidth=0.8)

# Optional: Add value labels on bars (only for strong correlations to avoid clutter)
for bar, corr in zip(bars, churn_corr.values):
    if abs(corr) > 0.1:  # Only label if |correlation| > 0.1
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + (0.01 if bar.get_height() >= 0 else -0.02),
            f'{corr:.2f}',
            ha='center', va='bottom' if bar.get_height() >= 0 else 'top',
            fontsize=9, fontweight='bold'
        )

plt.tight_layout()
plt.show()

## Split train/test set

In [None]:
# Separate the feature matrix from the Churn target.
target_column = 'Churn'
X = df.drop(target_column, axis=1)
y = df[target_column].values

In [None]:
# 70/30 split; stratifying on y keeps the churn ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
    stratify=y,
)

In [None]:
def distplot(feature, frame, color='r'):
    """Draw a histogram of ``frame[feature]`` on a new 8x3-inch figure.

    Args:
        feature: Column name to plot; also used in the figure title.
        frame: DataFrame that contains the column.
        color: Matplotlib colour for the bars (default red).
    """
    values = frame[feature]
    plt.figure(figsize=(8, 3))
    plt.title(f"Distribution for {feature}")
    sns.histplot(values, color=color)

In [None]:
# Continuous features to be standardised; plot their raw distributions first.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols:
    distplot(feat, df)

In [None]:
# Standardise the continuous features on the full frame, for visualisation only; the
# modelling scaler is fitted on the training split in a later cell.
scaled_values = StandardScaler().fit_transform(df[num_cols].astype('float64'))
df_std = pd.DataFrame(scaled_values, columns=num_cols)

# Iterate over num_cols, the list that defines df_std's columns, rather than over
# `numerical_cols` from a distant earlier cell.
for feat in num_cols:
    distplot(feat, df_std, color='c')

In [None]:
scaler = StandardScaler()

# train_test_split returns frames sliced from X; take explicit copies so the column
# assignments below write to independent frames and do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then apply the same transform to the test split, so no
# test-set statistics leak into training.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Model

## KNN

In [None]:
# k-nearest-neighbours baseline with k = 11, evaluated on the held-out test split.
knn_model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
predict_knn_y = knn_model.predict(X_test)

accuracy_knn = accuracy_score(y_test, predict_knn_y)
print("KNN accuracy:",accuracy_knn)
print(classification_report(y_test, predict_knn_y))

In [None]:
# Confusion Matrix
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, predict_knn_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("KNN CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ROC Curve
y_pred_prob = knn_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label='KNN', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC Curve', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.show()

## SVM

In [None]:
# Support-vector classifier; probability=True enables predict_proba for the ROC curve.
svc_model = SVC(probability=True, random_state=1).fit(X_train, y_train)

# Test-set predictions and accuracy.
prediction_svc_y = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_test, prediction_svc_y)

print("SVM accuracy is:", accuracy_svc)
print(classification_report(y_test, prediction_svc_y))

In [None]:
# Confusion Matrix
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, prediction_svc_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("SVM CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# ROC Curve (requires predict_proba → enabled by probability=True)
y_pred_prob = svc_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label='SVM', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVM ROC CURVE', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## Random forest

In [None]:
# Random forest: 500 trees, each limited to 30 leaves, sqrt(n_features) candidates per split.
rf_params = {
    'n_estimators': 500,
    'oob_score': True,
    'n_jobs': -1,
    'random_state': 50,
    'max_features': "sqrt",
    'max_leaf_nodes': 30,
}
rt_model = RandomForestClassifier(**rf_params)
rt_model.fit(X_train, y_train)

# Test-set predictions.
prediction_rf_y = rt_model.predict(X_test)

# Accuracy followed by the per-class report.
print(accuracy_score(y_test, prediction_rf_y))
print(classification_report(y_test, prediction_rf_y))

In [None]:
# Confusion Matrix
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, prediction_rf_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("RANDOM FOREST CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ROC Curve
y_pred_prob = rt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label='Random Forest', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RANDOM FOREST ROC CURVE', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.show()

## logistic regression

In [None]:
# Logistic regression; max_iter is raised to 1000 iterations for the solver.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
prediction_lr_y = lr_model.predict(X_test)

accuracy_lr = accuracy_score(y_test, prediction_lr_y)
print("Logistic Regression accuracy is :", accuracy_lr)

report = classification_report(y_test, prediction_lr_y)
print(report)

In [None]:
# Confusion Matrix
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, prediction_lr_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("LOGISTIC REGRESSION CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# ROC Curve
lr_y_pred_prob = lr_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, lr_y_pred_prob)

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label='Logistic Regression', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('LOGISTIC REGRESSION ROC CURVE', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## Decision tree

In [None]:
# Decision tree with default settings, evaluated on the held-out test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
prediction_dt_y = dt_model.predict(X_test)

accuracy_dt = accuracy_score(y_test, prediction_dt_y)
print("Decision Tree accuracy is:", accuracy_dt)
print(classification_report(y_test, prediction_dt_y))

In [None]:
# Confusion Matrix
plt.figure(figsize=(4, 3))
sns.heatmap(
    confusion_matrix(y_test, prediction_dt_y),
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
    cmap="Blues"
)
plt.title("DECISION TREE CONFUSION MATRIX", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# ROC Curve
y_pred_prob = dt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.figure(figsize=(6, 5))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.plot(fpr, tpr, label='Decision Tree', color="r", linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('DECISION TREE ROC CURVE', fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Compare models

In [None]:
# Fitted models keyed by display name; insertion order sets the order in the comparison plots.
models = dict([
    ('Logistic Regression', lr_model),
    ('KNN', knn_model),
    ('Random Forest', rt_model),
    ('Decision Tree', dt_model),
    ('SVM', svc_model),
])

In [None]:
# One row of test-set metrics per fitted model.
results = []

for name, model in models.items():
    # Hard predictions for the threshold metrics, positive-class probability for AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # The AUC value is stored as `roc_auc`: naming it `auc` would overwrite the `auc`
    # function imported from sklearn.metrics at the top of the notebook.
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc
    })

# Build the table once from the list of records; ending the cell on the frame renders it
# as a rich table instead of plain text.
results_df = pd.DataFrame(results).round(4)
results_df

In [None]:
# plotly.express is already imported as `px` in the first cell, so it is not re-imported here.

# Reshape to long form (one row per model/metric pair) for a grouped bar chart.
melted = results_df.melt(id_vars='Model',
                         value_vars=['Accuracy', 'Recall', 'F1-Score', 'AUC-ROC'],
                         var_name='Metric', value_name='Score')

fig = px.bar(melted, x='Model', y='Score', color='Metric', barmode='group',
             title='Model Performance Comparison',
             height=500, text_auto=True)
# All plotted metrics lie in [0, 1]; fix the axis so bars are comparable across models.
fig.update_layout(yaxis_range=[0, 1])
fig.show()

In [None]:
# Overlay the test-set ROC curve of every model on one axes.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    model_auc = roc_auc_score(y_test, y_pred_proba)
    roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {model_auc:.3f})')

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparison')
roc_ax.legend()
roc_ax.grid(alpha=0.3)
plt.show()

In [None]:
# 2x3 grid of confusion matrices, one panel per model (at most six panels).
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

# zip stops at the shorter sequence, so models beyond the sixth panel are not drawn.
for panel, (name, model) in zip(axes, models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', ax=panel, cmap='Blues')
    panel.set_title(f'{name}\nAccuracy: {accuracy_score(y_test, y_pred):.2f}')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

# Switch off the panels that received no model.
for spare_panel in axes[len(models):]:
    spare_panel.axis('off')

plt.tight_layout()
plt.show()