EDA

# Load Library

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, auc, precision_recall_curve
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score, ConfusionMatrixDisplay
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from collections import Counter
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')



# Load Datasets

In [None]:
# Load datasets
# Every workbook is read from a single directory. Set the CHURN_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location this notebook was originally written against.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('CHURN_DATA_DIR', 'C:/Users/xgygr/Desktop'))

df_customer_churn = pd.read_excel(DATA_DIR / 'CustomerChurn.xlsx')
df_telco_customer_churn = pd.read_excel(DATA_DIR / 'Telco_customer_churn.xlsx')
df_telco_customer_churn_demographics = pd.read_excel(DATA_DIR / 'Telco_customer_churn_demographics.xlsx')
df_telco_customer_churn_location = pd.read_excel(DATA_DIR / 'Telco_customer_churn_location.xlsx')
df_telco_customer_churn_population = pd.read_excel(DATA_DIR / 'Telco_customer_churn_population.xlsx')
df_telco_customer_churn_services = pd.read_excel(DATA_DIR / 'Telco_customer_churn_services.xlsx')
df_telco_customer_churn_status = pd.read_excel(DATA_DIR / 'Telco_customer_churn_status.xlsx')

# Rename the key column so that every frame shares the primary key 'Customer ID'
df_telco_customer_churn = df_telco_customer_churn.rename(columns={'CustomerID': 'Customer ID'})

# Drop the 'Count' column from the five frames that are merged later.
# Assigning the result (instead of inplace=True) keeps the cell safe to re-read
# and makes the data lineage explicit.
df_telco_customer_churn = df_telco_customer_churn.drop(columns=['Count'])
df_telco_customer_churn_demographics = df_telco_customer_churn_demographics.drop(columns=['Count'])
df_telco_customer_churn_location = df_telco_customer_churn_location.drop(columns=['Count'])
df_telco_customer_churn_services = df_telco_customer_churn_services.drop(columns=['Count'])
df_telco_customer_churn_status = df_telco_customer_churn_status.drop(columns=['Count'])


In [None]:
# Gather the loaded frames once so both summaries walk them in the same order
loaded_frames = (
    df_customer_churn,
    df_telco_customer_churn,
    df_telco_customer_churn_demographics,
    df_telco_customer_churn_location,
    df_telco_customer_churn_population,
    df_telco_customer_churn_services,
    df_telco_customer_churn_status,
)

# Print the columns of each dataset
for frame in loaded_frames:
    print(frame.columns)

# Print the (rows, columns) shape of each dataset
for frame in loaded_frames:
    print(frame.shape)

# Dataset Clean

In [None]:
# Merge every dataset except df_telco_customer_churn_population into one wide
# frame, left-joining each one onto df_telco_customer_churn by 'Customer ID'.
df = df_telco_customer_churn
for extra_frame in (
    df_telco_customer_churn_demographics,
    df_telco_customer_churn_location,
    df_telco_customer_churn_services,
    df_telco_customer_churn_status,
):
    df = pd.merge(df, extra_frame, on='Customer ID', how='left')

# pandas appends '_x' / '_y' to column names that exist on both sides of a
# merge. Strip the marker only when it is a trailing suffix, so a column that
# merely contains '_x' or '_y' inside its name is not renamed by accident.
df.columns = df.columns.str.replace(r'_[xy]$', '', regex=True)

# Stripping the suffixes leaves repeated names; keep the first occurrence of each
df = df.loc[:, ~df.columns.duplicated()]

# Drop unnecessary columns: (too granular, don't provide additional information, or similar to other category)
# (df.nunique() can be run in a scratch cell to review per-column cardinality.)
df = df.drop(columns=[
    'Customer ID', 'Country', 'Lat Long', 'Latitude', 'City', 'State', 'Zip Code',
    'Location ID', 'Service ID', 'Status ID', 'Longitude', 'Churn Label', 'Quarter',
    'Internet Type', 'Churn Reason',
])

In [None]:
# Check for missing values in df
# missing_values: number of nulls per column
# rows_with_missing_values: number of rows containing at least one null
missing_values = df.isnull().sum()
rows_with_missing_values = df.isnull().any(axis=1).sum()

# Lift the row-display limit so the count for every column is printed
with pd.option_context('display.max_rows', None):
    print("Categories with Missing Values:")
    print(missing_values)

print(f"Rows with at least one missing value: {rows_with_missing_values}")


In [None]:
# Columns with missing values that are handled here: Total Charges, Offer, Churn Category
#   * Total Charges  -> rows are dropped (the original analysis counted only 11 of them)
#   * Offer          -> 'None' when a customer didn't receive any offers
#   * Churn Category -> 'Did Not Churn' when no category was recorded
df = (
    df
    .dropna(subset=['Total Charges'])
    .assign(**{
        'Offer': lambda frame: frame['Offer'].fillna('None'),
        'Churn Category': lambda frame: frame['Churn Category'].fillna('Did Not Churn'),
    })
)

In [None]:
# Display the frame after the missing-value handling
df

# EDA

In [None]:

# Column groups used by the plots and the preprocessing steps.
# NOTE(review): 'Total Refunds', 'Total Extra Data Charges' and 'Tenure in Months'
# are listed here but appear to be deliberately left out of numerical_cols - confirm.
numerical_cols = [
    'Tenure Months',
    'Monthly Charges',
    'Total Charges',
    'CLTV',
    'Age',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Long Distance Charges',
    'Total Revenue',
]

categorical_cols = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents',
    'Phone Service', 'Multiple Lines', 'Internet Service',
    'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
    'Streaming TV', 'Streaming Movies',
    'Contract', 'Paperless Billing', 'Payment Method',
    'Under 30', 'Married', 'Referred a Friend', 'Offer',
    'Device Protection Plan', 'Premium Tech Support', 'Streaming Music', 'Unlimited Data',
    'Satisfaction Score', 'Number of Dependents', 'Number of Referrals',
]

# Columns that describe the churn outcome itself
Other_Churn_Flags = [
    'Churn Score',
    'Churn Value',
    'Customer Status',
    'Churn Category',
]

In [None]:
# Replace the numeric target with readable labels for the plots: 1 becomes
# 'Churned', every other value becomes 'Not Churned'.
# Values that already carry one of the two labels are left untouched, so
# re-running this cell no longer turns every row into 'Not Churned'.
churn_labels = ('Churned', 'Not Churned')
df['Churn Value'] = df['Churn Value'].apply(
    lambda x: x if x in churn_labels else ('Churned' if x == 1 else 'Not Churned')
)
df['Churn Value']

In [None]:
# Bar chart and pie chart of the target distribution.
# value_counts() orders classes by frequency, so the class order is fixed
# explicitly here; otherwise the positional colours and tick labels would be
# swapped whenever 'Churned' happened to be the more frequent class.
churn_order = ['Not Churned', 'Churned']
churn_colors = [sns.color_palette('Set1')[1], sns.color_palette('Set1')[0]]
churn_counts = df['Churn Value'].value_counts().reindex(churn_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(15, 10))

axes[0].bar(churn_counts.index, churn_counts.values, color=churn_colors)
axes[0].set_title("Distribution of Churn Value", fontsize=16)
axes[0].set_xlabel("Churn Value", fontsize=14)
axes[0].set_ylabel("Count", fontsize=14)
# Pin the tick positions before relabelling them so labels and bars stay aligned
axes[0].set_xticks(range(len(churn_order)))
axes[0].set_xticklabels(['Did Not Churned (0)', 'Churned (1)'])

axes[1].pie(
    churn_counts.tolist(),
    labels=churn_counts.index.tolist(),
    colors=churn_colors,
    autopct='%.1f%%',
    startangle=90,
)
axes[1].set_title("Pie Chart of Churn Value Distribution")

plt.show()

# Categorical

In [None]:
# Count plots of every categorical column, split by churn, on a 2-column grid
num_categorical_cols = len(categorical_cols)
num_rows_cat = math.ceil(num_categorical_cols / 2)
# Shared colour mapping for the two target labels (reused by later plots)
custom_palette = {'Not Churned': sns.color_palette('Set1')[1], 'Churned': sns.color_palette('Set1')[0]}

fig = plt.figure(figsize=(14, num_rows_cat * 4))
for idx, col in enumerate(categorical_cols, start=1):
    ax = fig.add_subplot(num_rows_cat, 2, idx)
    sns.countplot(data=df, x=col, hue='Churn Value', palette=custom_palette, ax=ax)
    ax.set_title(f'Distribution of {col} by Churn')
    ax.tick_params(axis='x', labelrotation=0)

# Lay the grid out once, after all subplots exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# https://www.statsmodels.org/stable/generated/statsmodels.graphics.mosaicplot.mosaic.html

# Mosaic plot of every categorical column against the churn label.
# The grid size is derived from the number of columns so the cell keeps working
# when categorical_cols changes (a fixed 9x3 grid only fits exactly 27 columns).
mosaic_cols_per_row = 3
mosaic_rows = math.ceil(len(categorical_cols) / mosaic_cols_per_row)

fig, axes = plt.subplots(mosaic_rows, mosaic_cols_per_row, figsize=(30, num_rows_cat * 4), squeeze=False)
mosaic_axes = axes.ravel()

for ax, col in zip(mosaic_axes, categorical_cols):
    # {(category value, churn label): row count} is the input format used for mosaic()
    col_churn = df.groupby([col, 'Churn Value']).size().to_dict()
    mosaic(col_churn, ax=ax, title=f'Mosaic Plot of {col} by Churn', axes_label=True,
           labelizer=lambda k: "")  # empty labelizer: no text inside the tiles

# Hide grid cells that have no column to show
for ax in mosaic_axes[len(categorical_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# One pie chart per outcome-related column, side by side
pie_category = ['Churn Category', 'Satisfaction Score', 'Customer Status']
fig, axes = plt.subplots(1, 3, figsize=(20, 20))

for ax, category in zip(axes, pie_category):
    level_counts = df[category].value_counts()
    ax.pie(
        level_counts.tolist(),
        labels=level_counts.index.tolist(),
        colors=sns.color_palette('bright'),
        autopct='%.0f%%',
        startangle=90,
    )
    ax.set_title(f"Pie Chart of {category} Distribution")

plt.show()


# Numerical

In [None]:
# Function to clean numeric columns
def clean_numeric_columns(df, columns):
    """Coerce the given columns of ``df`` to a numeric dtype.

    Every value that cannot be parsed as a number (blank strings such as " ",
    other text, None) becomes NaN.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns: Iterable of column labels to convert.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
    """
    for col in columns:
        # errors='coerce' already turns blanks into NaN. A separate
        # replace(" ", None) step is not needed and, on older pandas versions,
        # a None value makes replace() forward-fill the blanks with the
        # previous row's value instead of clearing them.
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Turn +inf / -inf into NaN everywhere in the dataframe
df = df.replace([np.inf, -np.inf], np.nan)

# Coerce the numerical columns to a numeric dtype
df = clean_numeric_columns(df, numerical_cols)

In [None]:
# Display the numerical columns of df
df[numerical_cols]

In [None]:
# Plot distribution for numerical columns
# Density histograms per churn label (common_norm=False normalises each label
# separately), arranged on a 2-column grid.
num_numerical_cols = len(numerical_cols)
num_rows_num = math.ceil(num_numerical_cols / 2)

fig = plt.figure(figsize=(14, num_rows_num * 4))
for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(num_rows_num, 2, idx)
    sns.histplot(data=df, x=col, hue='Churn Value', element="step", stat="density",
                 common_norm=False, palette=custom_palette, ax=ax)
    ax.set_title(f'Distribution of {col} by Churn')

# Lay the grid out once, after all subplots exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Box plots of every numerical column, one box per churn label, on a 2-column grid
num_numerical_cols = len(numerical_cols)
num_rows_num = math.ceil(num_numerical_cols / 2)

fig = plt.figure(figsize=(14, num_rows_num * 4))
for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(num_rows_num, 2, idx)
    sns.boxplot(data=df, x="Churn Value", y=col, hue="Churn Value", palette=custom_palette, ax=ax)
    ax.set_title(f'Box Plot Distribution of {col} by Churn')

# Lay the grid out once, after all subplots exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
# Display the numerical columns of df
df[numerical_cols]

In [None]:
# Cap outliers in every numerical column at the 1.5 * IQR fences.
# NOTE(review): the quartiles are computed on the full dataset, before the
# train/test split - confirm this is acceptable for the evaluation.
for col in numerical_cols:
    print(f"Before clipping, {col}: {df[col].describe()}")
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Vectorised clip: same capping as a per-element min(max(...)) but without
    # a Python-level call for every row; NaN values stay NaN.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    print(f"After clipping, {col}: {df[col].describe()}")

In [None]:
# Display the frame after outlier clipping
df

In [None]:
from sklearn.preprocessing import LabelEncoder


def object_to_int(dataframe_series):
    """Integer-encode a column when its dtype is ``object``.

    Args:
        dataframe_series: One DataFrame column.

    Returns:
        The LabelEncoder codes for an object-dtype column; any other column is
        returned unchanged.
    """
    if dataframe_series.dtype != 'object':
        return dataframe_series
    return LabelEncoder().fit_transform(dataframe_series)


# Encode every object column of df, including the 'Churn Value' label column.
# NOTE(review): LabelEncoder numbers classes in sorted order, so 'Churned' should
# become 0 and 'Not Churned' 1 - confirm, since later metrics use pos_label=0.
df = df.apply(object_to_int)


# Data Splitting and Pre-processing

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Correlation of every column with 'Churn Value', largest value first.
# The result is shown as a table, so no matplotlib figure is opened here
# (opening one would only add an empty plot to the cell output).
df.corr()['Churn Value'].sort_values(ascending=False)

In [None]:
# Separate the target column from the feature matrix
df_y = df['Churn Value']
df_x = df.drop(columns='Churn Value')

In [None]:
# 80/20 development/test split, stratified on the target, with a fixed seed
x_dev, x_test, y_dev, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    stratify=df_y,
    random_state=42,
)


In [None]:
# Report the shape of each split; a blank line follows each dev/test pair
for split_name, split_data, trailer in (
    ("x_dev", x_dev, ()),
    ("x_test", x_test, ("\n",)),
    ("y_dev", y_dev, ()),
    ("y_test", y_test, ("\n",)),
):
    print(f"{split_name} Shape: ", split_data.shape, *trailer)

In [None]:
# This was used to help decide what feature goes to what encoder
# for categorical in categorical_cols:
#     print("Category: ", categorical)
#     print(len(sorted(df[categorical].unique())))
#     print(sorted(df[categorical].unique()), "\n")

# for categorical in Other_Churn_Flags:
#     print("Category: ", categorical)
#     print(len(sorted(df[categorical].unique())))
#     print(sorted(df[categorical].unique()), "\n")

In [None]:
# Columns that are passed through OrdinalEncoder
ordinal_categorical = ['Contract', 'Offer', 'Satisfaction Score', 'Number of Dependents', 'Number of Referrals', 'Customer Status']

# Every remaining categorical column, plus 'Churn Category', is one-hot encoded
one_hot_categorical = [name for name in categorical_cols if name not in ordinal_categorical]
one_hot_categorical += ['Churn Category']


In [None]:
# Ordinal-encode the ordinal columns. The category order of each column is the
# sorted list of values seen in the development split; the encoder is fitted
# on the development split only and then applied to the test split.
ordinal_levels = [sorted(x_dev[name].unique()) for name in ordinal_categorical]
ordinal_encoder = OrdinalEncoder(categories=ordinal_levels)

dev_codes = ordinal_encoder.fit_transform(x_dev[ordinal_categorical])
test_codes = ordinal_encoder.transform(x_test[ordinal_categorical])

# Store the codes next to the source columns as '<name>_ord'
for position, name in enumerate(ordinal_categorical):
    x_dev[f"{name}_ord"] = dev_codes[:, position]
    x_test[f"{name}_ord"] = test_codes[:, position]

# The un-encoded source columns are no longer needed
x_dev = x_dev.drop(columns=ordinal_categorical)
x_test = x_test.drop(columns=ordinal_categorical)

In [None]:
# Shapes after ordinal encoding
for split_name, split_data in (("x_dev", x_dev), ("x_test", x_test)):
    print(f"{split_name} Shape: ", split_data.shape)

In [None]:
# One-hot encode the nominal columns.
# Each column is first cast to a categorical dtype whose levels come from the
# development split. Both splits then produce exactly the same dummy columns and
# drop the same baseline level, even when a level is absent from one split.
# A test value that never occurs in the development split becomes NaN and is
# therefore encoded as all zeros.
for nominal_col in one_hot_categorical:
    dev_levels = pd.CategoricalDtype(sorted(x_dev[nominal_col].dropna().unique()))
    x_dev[nominal_col] = x_dev[nominal_col].astype(dev_levels)
    x_test[nominal_col] = x_test[nominal_col].astype(dev_levels)

x_dev = pd.get_dummies(x_dev, columns = one_hot_categorical, drop_first = True, dtype=int)
x_test = pd.get_dummies(x_test, columns = one_hot_categorical, drop_first = True, dtype=int)

assert list(x_dev.columns) == list(x_test.columns), "dev/test feature columns differ after one-hot encoding"

In [None]:
# Shapes after one-hot encoding
for split_name, split_data in (("x_dev", x_dev), ("x_test", x_test)):
    print(f"{split_name} Shape: ", split_data.shape)

In [None]:
# From here on 'Churn Score' is handled together with the numerical columns.
# The membership check keeps the list free of duplicates when this cell is
# executed more than once.
if "Churn Score" not in numerical_cols:
    numerical_cols.append("Churn Score")

# Make sure every numerical column is numeric in both splits (invalid values -> NaN)
x_dev[numerical_cols] = x_dev[numerical_cols].apply(pd.to_numeric, errors='coerce')
x_test[numerical_cols] = x_test[numerical_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
from sklearn.impute import KNNImputer

# Fill missing numerical values with a 5-nearest-neighbour imputer that is
# fitted on the development split and then reused for the test split
imputer = KNNImputer(n_neighbors=5)
dev_imputed = imputer.fit_transform(x_dev[numerical_cols])
test_imputed = imputer.transform(x_test[numerical_cols])

x_dev[numerical_cols] = dev_imputed
x_test[numerical_cols] = test_imputed

In [None]:
from sklearn.preprocessing import PowerTransformer

# Apply a Yeo-Johnson power transform to the numerical features to make them
# more normally distributed; fitted on the development split only.
# (np.log1p on the same columns is the alternative kept here as a note.)
pt = PowerTransformer(method='yeo-johnson')
dev_transformed = pt.fit_transform(x_dev[numerical_cols])
test_transformed = pt.transform(x_test[numerical_cols])

x_dev[numerical_cols] = dev_transformed
x_test[numerical_cols] = test_transformed

In [None]:
# Scale the numerical features with MinMaxScaler, fitted on the development
# split only. The models used are a decision tree baseline, Random Forest and
# XG Boost.
# NOTE(review): tree-based models are generally insensitive to feature scaling;
# confirm whether the scaling is needed for another step (for example SMOTE).
minMax_Scaler = MinMaxScaler()
dev_scaled = minMax_Scaler.fit_transform(x_dev[numerical_cols])
test_scaled = minMax_Scaler.transform(x_test[numerical_cols])

x_dev[numerical_cols] = dev_scaled
x_test[numerical_cols] = test_scaled

In [None]:
# Check for any correlation
# Pairwise correlation between all development-split feature columns;
# the matrix is displayed as the cell output.
correlation_matrix = x_dev.corr()
correlation_matrix

In [None]:
# https://stackabuse.com/applying-filter-methods-in-python-for-feature-selection/
# Flag every feature whose absolute correlation with an EARLIER column of the
# correlation matrix reaches the threshold, so that one feature of each highly
# correlated pair can be dropped.
# NOTE(review): an earlier version of this note said 0.8 while the code used
# 0.78; the code's value is kept - confirm which one is intended.
CORRELATION_THRESHOLD = 0.78

# Keep only the entries below the diagonal (column index < row index) so each
# pair is tested once and a feature is never compared with itself
abs_corr_below_diagonal = np.tril(correlation_matrix.abs().to_numpy(), k=-1)
exceeds_threshold = (abs_corr_below_diagonal >= CORRELATION_THRESHOLD).any(axis=1)
correlated_features = set(correlation_matrix.columns[exceeds_threshold])

print(correlated_features)
print(len(correlated_features))

In [None]:
# Remove the flagged features from both splits so their columns stay identical
columns_to_drop = list(correlated_features)
x_dev = x_dev.drop(columns=columns_to_drop)
x_test = x_test.drop(columns=columns_to_drop)

In [None]:
# Recompute the correlation matrix on the current development features
# and display it
correlation_matrix = x_dev.corr()
correlation_matrix

In [None]:
# Heat map of the feature correlation matrix
fig, ax = plt.subplots()
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm')

# Colour scale next to the heat map
fig.colorbar(heatmap, ax=ax)

# One tick per feature, labelled with the column name
tick_labels = list(correlation_matrix.columns)
tick_positions = list(range(len(correlation_matrix)))

ax.set_title("Correlation matrix for features")
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=50, ha='right', fontsize=7)
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels, fontsize=7)

plt.show()

In [None]:
# Final shapes of the preprocessed splits; a blank line follows each dev/test pair
for split_name, split_data, trailer in (
    ("x_dev", x_dev, ()),
    ("x_test", x_test, ("\n",)),
    ("y_dev", y_dev, ()),
    ("y_test", y_test, ("\n",)),
):
    print(f"{split_name} Shape: ", split_data.shape, *trailer)

# Model Building and Training

## Decision Tree Baseline Model - Random Oversampling

In [None]:
# Balance the development set by randomly duplicating minority-class rows;
# the final tree is fitted on this resampled data.
ros = RandomOverSampler(random_state=42)
x_resampled_ros, y_resampled_ros = ros.fit_resample(x_dev, y_dev)

print("Class Distribution After Oversampling:", Counter(y_resampled_ros))

# Shallow, pruned decision tree used as the baseline model
classifier = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=60,
    min_samples_leaf=50,
    max_features="sqrt",
    random_state=42,
    ccp_alpha=0.1
)

# Cross-validate on the ORIGINAL development data with the oversampler inside
# an imblearn pipeline, so resampling is applied to each training fold only.
# Oversampling before the folds are made would place copies of the same row in
# both the training and the validation part and inflate the scores.
scores = cross_validate(
    imb_make_pipeline(RandomOverSampler(random_state=42), classifier),
    x_dev,
    y_dev,
    cv=5,
    scoring=['roc_auc', 'average_precision']
)

print("Decision Tree (Random Oversampling) CV Results:")
print(f"Mean ROC AUC: {scores['test_roc_auc'].mean():.4f}")
print(f"Mean Average Precision: {scores['test_average_precision'].mean():.4f}")

classifier.fit(x_resampled_ros, y_resampled_ros)

# Performance on the (resampled) training data
y_train_pred = classifier.predict(x_resampled_ros)
y_train_prob = classifier.predict_proba(x_resampled_ros)[:, 1]

roc_auc_train = roc_auc_score(y_resampled_ros, y_train_prob)
print(f"\nTraining ROC AUC: {roc_auc_train:.4f}")

print("\nClassification Report on Training Set:")
print(classification_report(y_resampled_ros, y_train_pred))

# Performance on the untouched test split
y_test_pred = classifier.predict(x_test)
y_test_prob = classifier.predict_proba(x_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_test_prob)
print(f"Test ROC AUC: {roc_auc:.4f}")

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

conf_matrix = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classifier.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Random Oversampling Model with Regularization")
plt.show()

# Class 0 is scored as the positive class.
# NOTE(review): presumably 0 is 'Churned' after label encoding - confirm.
precision = precision_score(y_test, y_test_pred, pos_label=0)
recall = recall_score(y_test, y_test_pred, pos_label=0)
f1 = f1_score(y_test, y_test_pred, pos_label=0)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

## Decision Tree Baseline Model - SMOTE

In [None]:
# Balance the development set with SMOTE (synthetic minority samples);
# the final tree is fitted on this resampled data.
smote = SMOTE(random_state=42)
x_resampled_smote, y_resampled_smote = smote.fit_resample(x_dev, y_dev)
print("SMOTE class distribution:", Counter(y_resampled_smote))

# Very shallow, pruned decision tree used as the SMOTE baseline
dt_smote = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=15,
    min_samples_leaf=10,
    max_features="sqrt",
    random_state=42,
    ccp_alpha=0.05
)

# Cross-validate on the ORIGINAL development data with SMOTE inside an imblearn
# pipeline, so synthetic samples are generated from each training fold only.
# Resampling before the folds are made would let validation rows influence the
# synthetic training samples and inflate the scores.
scores = cross_validate(
    imb_make_pipeline(SMOTE(random_state=42), dt_smote),
    x_dev,
    y_dev,
    cv=5,
    scoring=['roc_auc', 'average_precision']
)

print("Decision Tree (SMOTE) CV Results:")
print(f"Mean ROC AUC: {scores['test_roc_auc'].mean():.4f}")
print(f"Mean Average Precision: {scores['test_average_precision'].mean():.4f}")

dt_smote.fit(x_resampled_smote, y_resampled_smote)

# Performance on the (resampled) training data
y_train_pred = dt_smote.predict(x_resampled_smote)
y_train_prob = dt_smote.predict_proba(x_resampled_smote)[:, 1]
roc_auc_train = roc_auc_score(y_resampled_smote, y_train_prob)

print(f"Training ROC AUC: {roc_auc_train:.4f}")
print("Classification Report on Training Set:\n", classification_report(y_resampled_smote, y_train_pred))

# Performance on the untouched test split
y_pred_smote = dt_smote.predict(x_test)
y_prob_smote = dt_smote.predict_proba(x_test)[:, 1]

print("\nDecision Tree with SMOTE Results on Test Set:")
# Computed once and reused for both the printed matrix and the plot
conf_matrix = confusion_matrix(y_test, y_pred_smote)
print("Confusion Matrix on Test Set:\n", conf_matrix)
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_smote))

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=dt_smote.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix: Decision Tree with SMOTE")
plt.show()

# Class 0 is scored as the positive class.
# NOTE(review): presumably 0 is 'Churned' after label encoding - confirm.
roc_auc = roc_auc_score(y_test, y_prob_smote)
precision = precision_score(y_test, y_pred_smote, pos_label=0)
recall = recall_score(y_test, y_pred_smote, pos_label=0)
f1 = f1_score(y_test, y_pred_smote, pos_label=0)

print(f"Test ROC AUC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

## Random Forest Model Baseline Model - RandomOverSampler

In [None]:
from sklearn.ensemble import RandomForestClassifier

# RandomOverSampler: balance the development set by duplicating minority-class
# rows; the final forest is fitted on this resampled data.
ros = RandomOverSampler(random_state=42)
x_resampled_ros, y_resampled_ros = ros.fit_resample(x_dev, y_dev)

# initial model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42
)

# cross validation
# Run on the ORIGINAL development data with the oversampler inside an imblearn
# pipeline, so resampling is applied to each training fold only. Oversampling
# before the folds are made would place copies of the same row in both the
# training and the validation part and inflate the scores.
scores_rf = cross_validate(
    imb_make_pipeline(RandomOverSampler(random_state=42), rf_model),
    x_dev,
    y_dev,
    cv=5,
    scoring=['roc_auc', 'average_precision']
)

print("Random Forest CV Results:")
print(f"Mean ROC AUC: {scores_rf['test_roc_auc'].mean():.4f}")
print(f"Mean Average Precision: {scores_rf['test_average_precision'].mean():.4f}")

# model train
rf_model.fit(x_resampled_ros, y_resampled_ros)

# make prediction on development set (resampled training data)
y_dev_pred_rf = rf_model.predict(x_resampled_ros)
y_dev_prob_rf = rf_model.predict_proba(x_resampled_ros)[:, 1]

roc_auc_train_rf = roc_auc_score(y_resampled_ros, y_dev_prob_rf)
print(f"\nTraining ROC AUC: {roc_auc_train_rf:.4f}")

print("\nClassification Report on Training Set:")
print(classification_report(y_resampled_ros, y_dev_pred_rf))

# make prediction on test set
y_test_pred_rf = rf_model.predict(x_test)
y_test_prob_rf = rf_model.predict_proba(x_test)[:, 1]

roc_auc_rf = roc_auc_score(y_test, y_test_prob_rf)
print(f"\nTest ROC AUC: {roc_auc_rf:.4f}")

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred_rf))

# confusion matrix
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_rf, display_labels=rf_model.classes_)
disp_rf.plot(cmap=plt.cm.Blues)
plt.title("Random Forest Model")
plt.show()

# performance
# Class 0 is scored as the positive class.
# NOTE(review): presumably 0 is 'Churned' after label encoding - confirm.
precision_rf = precision_score(y_test, y_test_pred_rf, pos_label=0)
recall_rf = recall_score(y_test, y_test_pred_rf, pos_label=0)
f1_rf = f1_score(y_test, y_test_pred_rf, pos_label=0)
print(f"Precision: {precision_rf:.4f}, Recall: {recall_rf:.4f}, F1-Score: {f1_rf:.4f}")


### feature importance 
The initial model performs extremely well on both the development set and the test set: the ROC AUC score is 1 and the confusion matrix shows no misclassifications. A perfect score like this suggests that the model is overfitting or relying on features that leak the target. Therefore, we inspect the feature importances to decide which features to remove or re-evaluate before retraining the model.

In [None]:
# Pair every development feature with its importance in rf_model,
# then list the ten most important ones
feature_importances = pd.DataFrame({
    'Feature': x_dev.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

top_ten_features = feature_importances.head(10)
print(top_ten_features)



In [None]:
# Correlation between the three highest-importance features and the churn value
top_feature_names = ['Customer Status_ord', 'Satisfaction Score_ord', 'Churn Score']
top_features_with_target = x_dev[top_feature_names].join(y_dev)
correlation_with_target = top_features_with_target.corr()
print(correlation_with_target['Churn Value'])

# Mean encoded churn value within each churn category
churn_value_by_category = df[['Churn Category', 'Churn Value']].groupby('Churn Category').mean()
print(churn_value_by_category)


In [None]:
# Remove high-importance features (chosen from the correlation check) to
# reduce the risk of overfitting.
# 'Churn Category_1' is removed as well; the original note says it is not
# highly correlated with churn.
features_to_remove = ['Customer Status_ord', 'Churn Score', 'Churn Category_1']

x_dev_dropped = x_dev.drop(columns=features_to_remove)
x_test_dropped = x_test.drop(columns=features_to_remove)


In [None]:
# re-train model on the reduced feature set
# The reduced development set is oversampled first, so this model really is the
# RandomOverSampler variant: it belongs to the RandomOverSampler section and its
# test probabilities are plotted under the "ROS" label in the ROC comparison.
ros = RandomOverSampler(random_state=42)
x_resampled_ros_dropped, y_resampled_ros_dropped = ros.fit_resample(x_dev_dropped, y_dev)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42
)

rf_model.fit(x_resampled_ros_dropped, y_resampled_ros_dropped)

# predict test set
y_test_pred_rf = rf_model.predict(x_test_dropped)
y_test_prob_rf = rf_model.predict_proba(x_test_dropped)[:, 1]

print("Test ROC AUC:", roc_auc_score(y_test, y_test_prob_rf))
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred_rf))

# confusion matrix
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_rf, display_labels=rf_model.classes_)
disp_rf.plot(cmap=plt.cm.Blues)
plt.title("Random Forest Model after Feature Adjustment")
plt.show()


## Random Forest Model - SMOTE (comparison)

In [None]:
# Random forest trained on the SMOTE-resampled development set (all features)
smote = SMOTE(random_state=42)
x_resampled_smote, y_resampled_smote = smote.fit_resample(x_dev, y_dev)

rf_smote_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
)
rf_smote_model = RandomForestClassifier(**rf_smote_params)
rf_smote_model.fit(x_resampled_smote, y_resampled_smote)

# Predicted labels and class-1 probabilities for the test split
y_test_pred_rf_smote = rf_smote_model.predict(x_test)
y_test_prob_rf_smote = rf_smote_model.predict_proba(x_test)[:, 1]

test_auc_rf_smote = roc_auc_score(y_test, y_test_prob_rf_smote)
print("Random Forest with SMOTE Test ROC AUC:", test_auc_rf_smote)
print("\nClassification Report on Test Set (Random Forest with SMOTE):")
print(classification_report(y_test, y_test_pred_rf_smote))

# Confusion matrix on the test split
conf_matrix_rf_smote = confusion_matrix(y_test, y_test_pred_rf_smote)
disp_rf_smote = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_rf_smote,
    display_labels=rf_smote_model.classes_,
)
disp_rf_smote.plot(cmap=plt.cm.Blues)
plt.title("Random Forest Model with SMOTE")
plt.show()

In [None]:
# Pair every development feature with its importance in rf_smote_model,
# then list the ten most important ones
feature_importances = pd.DataFrame({
    'Feature': x_dev.columns,
    'Importance': rf_smote_model.feature_importances_,
})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

top_ten_features = feature_importances.head(10)
print(top_ten_features)



The SMOTE model performs the same as the random-oversampling model, so we will use `x_dev_dropped` and `x_test_dropped` for all later training.

In [None]:
# Re-train the SMOTE random forest on the reduced feature set
smote = SMOTE(random_state=42)
x_resampled_smote_dropped, y_resampled_smote_dropped = smote.fit_resample(x_dev_dropped, y_dev)

rf_smote_drop_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
)
rf_smote_drop_model = RandomForestClassifier(**rf_smote_drop_params)
rf_smote_drop_model.fit(x_resampled_smote_dropped, y_resampled_smote_dropped)

# Predicted labels and class-1 probabilities for the reduced test split
y_test_pred_drop = rf_smote_drop_model.predict(x_test_dropped)
y_test_prob_drop = rf_smote_drop_model.predict_proba(x_test_dropped)[:, 1]

test_auc_drop = roc_auc_score(y_test, y_test_prob_drop)
print("Test ROC AUC after Removing Leak Features:", test_auc_drop)
print("\nClassification Report after Removing Leak Features:")
print(classification_report(y_test, y_test_pred_drop))

# Confusion matrix on the test split
conf_matrix_cleaned = confusion_matrix(y_test, y_test_pred_drop)
disp_cleaned = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_cleaned,
    display_labels=rf_smote_drop_model.classes_,
)
disp_cleaned.plot(cmap=plt.cm.Blues)
plt.title("Random Forest Model after Removing Leak Features")
plt.show()

In [None]:
# double check if the model wont highly dependent on one feature
# recalculate the feature importance
# Recompute the importances for the model trained on the reduced feature set
feature_importances_smote_drop = pd.DataFrame({
    'Feature': x_dev_dropped.columns,
    'Importance': rf_smote_drop_model.feature_importances_,
})
feature_importances_smote_drop = feature_importances_smote_drop.sort_values(
    by='Importance', ascending=False
)

print("Feature Importances after Removing Leak Features:")
top_ten_after_drop = feature_importances_smote_drop.head(10)
print(top_ten_after_drop)
 

### ROC curve - compared

In [None]:

# ROC points and AUC for the SMOTE forest trained on the reduced feature set
fpr_rf_smote, tpr_rf_smote, _ = roc_curve(y_test, y_test_prob_drop)
roc_auc_rf_smote = auc(fpr_rf_smote, tpr_rf_smote)

# ROC points and AUC for the forest whose test probabilities are in y_test_prob_rf
fpr_rf_ros, tpr_rf_ros, _ = roc_curve(y_test, y_test_prob_rf)
roc_auc_rf_ros = auc(fpr_rf_ros, tpr_rf_ros)

# Draw both curves on one axes, plus the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(10, 6))
roc_series = (
    (fpr_rf_smote, tpr_rf_smote, f"Random Forest- SMOTE ROC AUC = {roc_auc_rf_smote:.4f}"),
    (fpr_rf_ros, tpr_rf_ros, f"Random Forest- ROS ROC AUC = {roc_auc_rf_ros:.4f}"),
)
for false_positive_rate, true_positive_rate, legend_text in roc_series:
    ax.plot(false_positive_rate, true_positive_rate, label=legend_text, linewidth=2)
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label="Random Guess")

ax.set_xlabel("False Positive Rate", fontsize=12)
ax.set_ylabel("True Positive Rate", fontsize=12)
ax.set_title("Random Forest ROC Curve Comparison: SMOTE vs ROS", fontsize=14)
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
plt.show()


## XG Boost Model