In [None]:
import pandas as pd
from sklearn.preprocessing  import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import joblib 
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training and testing CSV exports into two separate frames.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "customer_churn_dataset-training-master.csv",
        "customer_churn_dataset-testing-master.csv",
    )
)

In [None]:
# Remove the row identifier from both frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
train_df = train_df.drop(columns=['CustomerID'], errors='ignore')
test_df = test_df.drop(columns=['CustomerID'], errors='ignore')

In [None]:
train_df.tail(40)

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
train_df.shape

In [None]:
train_df.describe()

In [None]:
train_df.duplicated().sum()

In [None]:
# Rows without a Churn label cannot be used for supervised training,
# so they are removed (the rest of each row is left untouched).
churn_has_gaps = train_df['Churn'].isna().any()
if churn_has_gaps:
    print("Missing values found in target (Churn). Filling or dropping them.")
    train_df = train_df.dropna(subset=['Churn'])

In [None]:
## Count Plot for Gender
# Explicit figure/axes pair so the plot does not depend on pyplot's implicit state.
gender_fig, gender_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Gender', data=train_df, ax=gender_ax)
gender_ax.set_title('Count of Gender')
plt.show()

## Preprocessing

In [None]:
# Integer-encode the categorical columns. Each column gets its own encoder,
# fitted on the training frame and then applied to the test frame so both
# share the same category -> integer mapping.
# The fitted encoders are kept in `label_encoders` (keyed by column name):
# a single shared encoder is overwritten on every fit, which would leave only
# the last column's mapping available for inverse_transform or for encoding
# new data later.
label_encoders = {}
for col in ['Gender', 'Subscription Type', 'Contract Length']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

In [None]:
train_df.head()

In [None]:
## Count Plot for Churn
# Bar per target class, drawn on an explicitly created axes.
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=train_df, ax=churn_ax)
churn_ax.set_title('Churn Distribution')
plt.show()

## Feature Selection

In [None]:
# Target vector is the Churn column; the feature matrix is every other column.
y = train_df['Churn']
X = train_df.drop(columns=['Churn'])

In [None]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is reproducible. No `stratify` argument is passed, so class proportions in
# the two parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# plt.figure(figsize=(12,12))
# sns.heatmap(train_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
# plt.title('Correlation Heatmap')

In [None]:
## Pair Plot
# Pairwise scatter/histogram grid over the rows that have no missing values.
complete_rows = train_df.dropna()
sns.pairplot(complete_rows)
plt.suptitle('Pair Plot of Numerical Features', y=1.02)
plt.show()

In [None]:
## Histogram for Age
# 30-bin histogram of the non-missing Age values.
age_fig, age_ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_df['Age'].dropna(), bins=30, ax=age_ax)
age_ax.set_title('Distribution of Age')
plt.show()

In [None]:
train_df["Churn"].value_counts()

In [None]:
# Pie chart of the target class shares.
# Wedge colours must be passed as `colors` (the matplotlib pie keyword);
# `color` is not a pie argument, so depending on the pandas version it is
# either forwarded and rejected with a TypeError or silently dropped.
train_df["Churn"].value_counts().plot(kind='pie', colors=['blue', 'orange'])
plt.title("Churn Distribution [Yes/No]")
plt.ylabel("")
plt.show()

In [None]:
train_df.groupby("Churn")["Usage Frequency"].mean()

In [None]:
train_df.groupby(["Churn", "Age"])["Tenure"].mean()

In [None]:
train_df.columns

## Scaling

In [None]:
# Scale the data (optional for XGBoost but good for consistency).
# The scaler is fitted on the training split only and then applied to the
# held-out split, so no statistics from X_test are used during fitting.
# Both names are rebound to the scaler's return values, so from here on
# X_train / X_test are no longer the objects produced by train_test_split.
# NOTE(review): re-running this cell would refit the scaler on already-scaled
# data - run it exactly once per kernel session.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
joblib.dump(scaler, 'scaler.pkl')

In [None]:
def modelperformance(predictions):
    """Print the accuracy of `predictions` measured against the notebook-level `y_test`.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, passed to `accuracy_score` together with `y_test`.
    """
    score = accuracy_score(y_test, predictions)
    print("Accuracy Score on model is: {}".format(score))

## Logistic Regression

In [None]:
log_model = LogisticRegression()

In [None]:
log_model.fit(X_train, y_train)

In [None]:
y_pred_log_simple = log_model.predict(X_test)

In [None]:
modelperformance(y_pred_log_simple)

## KNN

In [None]:
# simple run without grid search
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
y_pred_knn_simple = knn_model.predict(X_test)

In [None]:
modelperformance(y_pred_knn_simple)

In [None]:
# knn with grid search
# Hyperparameter tuning for KNN
param_grid = {
    "n_neighbors": [3, 5, 7, 9],
    "weights": ["uniform", "distance"],
}

In [None]:
# 5-fold cross-validated search over the KNN grid defined in the previous cell.
gridkn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)
gridkn.fit(X_train, y_train)

In [None]:
gridkn.best_params_

In [None]:
y_pred_kn = gridkn.predict(X_test)

In [None]:
modelperformance(y_pred_kn)

## SVM

In [None]:
# SVM without grid search
svm = SVC()

In [None]:
svm.fit(X_train, y_train)

In [None]:
y_pred_svc_simple = svm.predict(X_test)

In [None]:
modelperformance(y_pred_svc_simple)

In [None]:
# SVM with grid search
param_grid = {
    'C': [0.01, 0.1, 0.5, 1],
    'kernel': ['linear', 'rbf', 'poly'],
}

In [None]:
gridsvc = GridSearchCV(svm, param_grid, cv = 5)

In [None]:
gridsvc.fit(X_train, y_train)

In [None]:
gridsvc.best_params_

In [None]:
y_pred_svc = gridsvc.predict(X_test)

In [None]:
modelperformance(y_pred_svc)

## Decision Tree

In [None]:
# Decision Tree without grid search.
# The seed is fixed (same value as the random forest and XGBoost cells):
# scikit-learn permutes candidate features at every split, so an unseeded
# tree - and the feature importances / confusion matrix derived from it
# later in the notebook - can differ from run to run.
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
dt_model.fit(X_train, y_train)

In [None]:
y_pred_dtree_simple = dt_model.predict(X_test)

In [None]:
modelperformance(y_pred_dtree_simple)

In [None]:
# Decision tree with grid search
param_grid = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [None]:
# 5-fold grid search over the decision-tree grid defined in the previous cell.
# The base estimator is seeded because the grid includes splitter="random";
# without a seed the selected parameters and the fitted tree (which is later
# persisted as model.pkl) are not reproducible.
gridtree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)

In [None]:
gridtree.fit(X_train, y_train)

In [None]:
gridtree.best_params_

In [None]:
y_pred_dtree = gridtree.predict(X_test)

In [None]:
modelperformance(y_pred_dtree)

## Random Forest

In [None]:
# Random Forest without grid search
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
y_pred_rfc_simple = rf_model.predict(X_test)

In [None]:
modelperformance(y_pred_rfc_simple)

In [None]:
# Random Forest with grid search
param_grid = {
    "n_estimators": [32, 64, 128, 256],
    "max_features": [2, 3, 4],
    "bootstrap": [True, False]
}

In [None]:
gridrfc = GridSearchCV(rf_model, param_grid, cv=5)

In [None]:
gridrfc.fit(X_train, y_train)

In [None]:
gridrfc.best_params_

In [None]:
y_pred_rfc = gridrfc.predict(X_test)

In [None]:
modelperformance(y_pred_rfc)

## XGBoost

In [None]:
# Calculate imbalance ratio: count of label 0 divided by count of label 1
# in the training target (the counts are computed once and reused).
class_counts = y_train.value_counts()
imbalance_ratio = class_counts[0] / class_counts[1]

In [None]:
# Train XGBoost
# scale_pos_weight=imbalance_ratio is intentionally left out of the settings below.
xgb_settings = {
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "random_state": 42,
}
model_xgb = XGBClassifier(**xgb_settings)
model_xgb.fit(X_train, y_train)

In [None]:
# Predictions and evaluation
y_pred_xgb_simple = model_xgb.predict(X_test)
y_proba = model_xgb.predict_proba(X_test)[:, 1]

In [None]:
modelperformance(y_pred_xgb_simple)

## Choosing the best-performing model and saving it in pkl format

In [None]:
# The decision-tree grid search is hard-coded as the model to export; nothing
# in this cell compares it against the other models' scores.
# `best_model` is the whole fitted GridSearchCV wrapper, not just its refit
# estimator - the alternative is kept below for reference:
# best_model = gridtree.best_estimator_
best_model = gridtree

In [None]:
joblib.dump(best_model, 'model.pkl')

## Storing other models' predictions

In [None]:
# Persist the logistic-regression predictions for the hold-out split.
# The logistic predictions (y_pred_log_simple) are used here: y_pred_kn holds
# the KNN grid-search output, so storing it under the "logistic" name would
# write the wrong model's results to model_logistic.pkl.
# Note: the pickle holds the prediction array, not a fitted estimator.
model_logistic = y_pred_log_simple
joblib.dump(model_logistic, 'model_logistic.pkl')

In [None]:
# Persist the grid-searched SVC's hold-out predictions.
# Despite the "model_" prefix, the pickle holds the prediction array
# (y_pred_svc), not a fitted estimator.
model_svc = y_pred_svc 
joblib.dump(model_svc, 'model_svc.pkl')

In [None]:
# `model_xgboost` is bound to the XGBoost hold-out predictions, but the dump
# below writes `model_xgb` (the fitted XGBClassifier), so the prediction array
# is never saved by this cell.
# NOTE(review): the two sibling cells above pickle prediction arrays; confirm
# whether this file is meant to hold the estimator or the predictions.
model_xgboost = y_pred_xgb_simple
joblib.dump(model_xgb, 'model_xgb.pkl')

In [None]:
X.columns

In [None]:
train_df.head(20)

In [None]:
# Count of rows per "Last Interaction" value.
# The title names the column that is actually plotted; it previously read
# "Support Calls Distribution", which did not match the x variable.
plt.figure(figsize=(12, 12))
sns.countplot(x="Last Interaction", data=train_df)
plt.title("Last Interaction Distribution")
plt.show()

In [None]:
# Extract importances from the un-tuned decision tree
feature_importances = dt_model.feature_importances_

# X_train was rebound to the StandardScaler output in the scaling cell and so
# no longer carries column labels; the feature names (same order as the
# training columns) are taken from the unscaled feature frame X instead.
feature_names = X.columns.tolist()

# Build a Feature/Importance table, most important feature first
feature_importances_df = (
    pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    })
    .sort_values('Importance', ascending=False)
)

In [None]:
# Plot: horizontal bars, one per feature, drawn on an explicit axes
imp_fig, imp_ax = plt.subplots(figsize=(12, 8))
imp_ax.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
imp_ax.set_xlabel('Importance')
imp_ax.set_ylabel('Feature')
imp_ax.set_title('Feature Importances (Decision Tree)')
# Flip the y axis so the first row of the sorted table is drawn at the top
imp_ax.invert_yaxis()
imp_fig.tight_layout()
plt.show()

# Inspect the sorted importances
print(feature_importances_df)

In [None]:
from collections import Counter
print(Counter(y_train))  


In [None]:
import numpy as np


# Side-by-side bars comparing how many hold-out rows fall in each class
# according to the true labels versus the grid-searched decision tree.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dtree})

# Per-class counts, ordered by class value
actual_counts = results['Actual'].value_counts().sort_index()
# Align the predicted counts to the actual classes: a class the model never
# predicts gets an explicit 0 instead of being absent, which would otherwise
# leave the two bar groups with different lengths and make ax.bar fail.
predicted_counts = (
    results['Predicted']
    .value_counts()
    .reindex(actual_counts.index, fill_value=0)
)

# Create a bar plot
# NOTE(review): assumes the sorted class values map to (No Churn, Churn) in
# that order - confirm against the target encoding.
labels = ['No Churn', 'Churn']
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, actual_counts, width, label='Actual')
rects2 = ax.bar(x + width/2, predicted_counts, width, label='Predicted')

# Axis labels, title and custom x-axis tick labels
ax.set_xlabel('Churn')
ax.set_ylabel('Count')
ax.set_title('Actual vs Predicted Churn')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


def add_labels(rects):
    """Annotate every bar in `rects` with its count, centred just above the bar."""
    for rect in rects:
        height = rect.get_height()
        # Counts are whole numbers; int() keeps the label free of a ".0" suffix
        ax.annotate('{}'.format(int(height)),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


add_labels(rects1)
add_labels(rects2)

fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix of the un-tuned decision tree on the hold-out split
class_names = ['No Churn', 'Churn']
cm = confusion_matrix(y_test, y_pred_dtree_simple)

# Plot the confusion matrix as an annotated heatmap on an explicit axes
cm_fig, cm_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

In [None]:

print(classification_report(y_test, y_pred_dtree_simple))


In [None]:
train_df.to_csv("preprocessed_data.csv", index=False)