In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report
)

In [None]:
# Read the cleaned dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/Cleaned-dataset.csv')

# Report the (rows, columns) tuple, then preview the first five records.
n_rows, n_cols = df.shape
print('Shape:', (n_rows, n_cols))
df.head()

In [None]:
# Show the dtype of every column in the loaded frame.
df.dtypes

In [None]:
# Target vector, and the feature matrix made of every column except the target.
y = df['Churn']
x = df.drop(columns='Churn')

# Partition the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numeric.
cat_features = x.select_dtypes(include='object').columns
num_features = x.select_dtypes(exclude='object').columns

print('Numeric:', num_features.tolist())
print('Categorical:', cat_features.tolist())

In [None]:
# Split the categorical columns by cardinality, measured on x: exactly two
# distinct values (a missing value counts as a value) -> binary, otherwise multi.
binary_category = [
    feature for feature in cat_features
    if x[feature].nunique(dropna=False) == 2
]
multi_category = [
    feature for feature in cat_features
    if feature not in binary_category
]

print('Binary:', binary_category)
print('Multi: ', multi_category)

In [None]:
# Encode the Yes/No binary features of x as 1/0.
# 'gender' is skipped here because it is one-hot encoded by get_dummies later on.
yes_no_map = {'Yes': 1, 'No': 0}
for col in binary_category:
    if col == 'gender':
        continue

    observed = set(x[col].dropna().unique())
    if observed <= set(yes_no_map.values()):
        # Already 0/1 (e.g. this cell was re-run): mapping again would turn
        # every value into NaN, so leave the column untouched.
        continue

    unexpected = observed - set(yes_no_map)
    if unexpected:
        # Series.map silently yields NaN for labels missing from the mapping;
        # fail loudly here instead of letting NaNs reach model fitting.
        raise ValueError(
            f"Column {col!r} has values outside Yes/No: "
            f"{sorted(map(str, unexpected))}"
        )

    x[col] = x[col].map(yes_no_map)

In [None]:
# One-hot encode the listed nominal columns in a single pass. drop_first=True
# omits the first level of each column from the generated indicators.
onehot_columns = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
x = pd.get_dummies(x, columns=onehot_columns, drop_first=True)

# Cast any boolean columns (such as the generated indicators) to integers.
bool_cols = x.select_dtypes(include='bool').columns
x[bool_cols] = x[bool_cols].astype(int)

# Numbered listing of the final feature set.
print('Features:')
for position, name in enumerate(x.columns, start=1):
    print(f'  {position}. {name}')
print(f'Total: {x.shape[1]}')

In [None]:
# Sanity check: list any feature columns that are still object-typed after encoding.
x.select_dtypes(include='object').columns

In [None]:
# Review the dtypes of the encoded feature matrix.
x.dtypes

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned across both partitions; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the continuous features and persist the fitted scaler.
numerical_columns = ['tenure', 'TotalCharges', 'MonthlyCharges']

# Work on explicit copies so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning on the frames returned by train_test_split.
x_train = x_train.copy()
x_test = x_test.copy()

# Always read the raw values from the unscaled frame `x`, using the row labels
# the splits inherited from it, so re-running this cell never re-scales data
# that has already been scaled. Assumes x has a unique index (true for the
# default RangeIndex produced by read_csv).
raw_train = x.loc[x_train.index, numerical_columns]
raw_test = x.loc[x_test.index, numerical_columns]

# Fit on the training rows only, then apply the same transform to both splits.
scaler = StandardScaler()
x_train[numerical_columns] = scaler.fit_transform(raw_train)
x_test[numerical_columns] = scaler.transform(raw_test)

# Save scaler for app use
with open('../scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print('scaler.pkl saved')

In [None]:
# x_test was already transformed in the cell that fits the scaler. Applying
# scaler.transform a second time here would standardise already-standardised
# values and distort every test-set metric computed below, so this cell only
# inspects the scaled columns.
x_test[numerical_columns].describe()

## Logistic Regression

In [None]:
# Logistic regression with the solver's iteration cap set to 1000.
# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
lr_model

In [None]:
# Hard class predictions for the held-out rows.
y_pred = lr_model.predict(x_test)

# Per-class probabilities; column index 1 is the second entry of lr_model.classes_.
lr_probabilities = lr_model.predict_proba(x_test)
y_prob = lr_probabilities[:, 1]

In [None]:
# Each row: (label, metric function, model output it is scored against).
# The threshold-based metrics use the hard predictions; ROC AUC uses the probabilities.
for label, metric, scores in [
    ('Accuracy Score:  ', accuracy_score, y_pred),
    ('Precision Score: ', precision_score, y_pred),
    ('Recall Score:    ', recall_score, y_pred),
    ('ROC AUC Score:   ', roc_auc_score, y_prob),
]:
    print(f'{label}{metric(y_test, scores):.4f}')

In [None]:
# Print the confusion matrix, then the text classification report,
# both computed from the logistic-regression predictions.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred))

In [None]:
# Signed logistic-regression coefficients labelled by feature name,
# ordered from the largest (most positive) value downwards.
lr_coefficients = pd.Series(lr_model.coef_[0], index=x_train.columns)
feature_importance = lr_coefficients.sort_values(ascending=False)
feature_importance.head(10)

## Random Forest Classifier

In [None]:
# 200-tree random forest with a fixed seed and class_weight='balanced'.
# fit() returns the estimator itself, so construction and training are chained.
rfc = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
).fit(x_train, y_train)
rfc

In [None]:
# Score the forest's hard predictions on the held-out rows.
y_pred_rfc = rfc.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred_rfc))
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred_rfc))

In [None]:
# Probability of the second entry of rfc.classes_ for each held-out row.
y_prob_rfc = rfc.predict_proba(x_test)[:, 1]

# Label a row 1 when that probability reaches the custom cut-off, else 0.
# NOTE(review): comparing these labels with y_test assumes Churn is encoded as 0/1 — confirm.
threshold = 0.35
y_pred_custom = np.where(y_prob_rfc >= threshold, 1, 0)

for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred_custom))

In [None]:
# Random-forest feature importances labelled by feature name, highest first.
rfc_importances = pd.Series(rfc.feature_importances_, index=x_train.columns)
feature_importance_rfc = rfc_importances.sort_values(ascending=False)
feature_importance_rfc.head(10)

In [None]:
# Horizontal bar chart of the ten most important forest features, drawn on an
# explicit Axes. The y-axis is inverted so the top-ranked feature is at the top.
fig, ax = plt.subplots()
feature_importance_rfc.head(10).plot(kind='barh', ax=ax)
ax.invert_yaxis()
ax.set_title('Top 10 Factors Influencing Churn')
fig.tight_layout()
plt.show()

In [None]:


# Pickle each artifact next to the notebook's parent directory.
# Each row: (destination path, object to pickle, confirmation message).
artifacts = [
    ('../model.pkl', rfc, 'model.pkl saved'),
    ('../features.pkl', x_train.columns.tolist(), ' features.pkl saved'),
    ('../scaler.pkl', scaler, ' scaler.pkl saved'),
]
for path, payload, message in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(payload, f)
    print(message)

print('\n Model building complete!')
print(f'Features saved: {x_train.columns.tolist()}')