# Import Required Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib


# Load Data Files

In [3]:
customers = pd.read_csv("DATA/customers.csv")
usage = pd.read_csv("DATA/usage_data.csv")
complaints = pd.read_csv("DATA/complaints.csv")
billing = pd.read_csv("DATA/billing.csv")

print("Files Loaded Successfully!")


Files Loaded Successfully!


# Display Sample Data

In [4]:
print(customers.head())
print(usage.head())
print(complaints.head())
print(billing.head())


   customer_id         name plan_type   region
0         1001   Asha Mehta   Prepaid    Delhi
1         1002   Ravi Kumar  Postpaid   Mumbai
2         1003    Sneha Rao   Prepaid  Chennai
3         1004  Manoj Singh  Postpaid    Delhi
4         1005   Divya Jain   Prepaid  Kolkata
   customer_id  data_used_gb  calls_made  revenue_inr
0         1001           5.2          25          180
1         1002          12.5          40          280
2         1003           7.8          32          210
3         1004          15.6          55          320
4         1005           3.4          18          120
   customer_id  category        created_at  status
0         1002   Billing  2025-09-25 10:45    Open
1         1004   Network  2025-09-25 09:30    Open
2         1005  Recharge  2025-09-25 14:00  Closed
3         1002   Network  2025-09-26 20:40    Open
4         1003   Support  2025-09-26 11:10    Open
   customer_id  tenure   contract_type  monthly_charges churn
0         1001      12  Mo

# Merge All Datasets

In [5]:
df = customers.merge(usage, on="customer_id", how="left")
df = df.merge(complaints, on="customer_id", how="left")
df = df.merge(billing, on="customer_id", how="left")

print("Merged dataset shape:", df.shape)
df.head()


Merged dataset shape: (11, 14)


Unnamed: 0,customer_id,name,plan_type,region,data_used_gb,calls_made,revenue_inr,category,created_at,status,tenure,contract_type,monthly_charges,churn
0,1001,Asha Mehta,Prepaid,Delhi,5.2,25,180,,,,12,Month-to-Month,180,Yes
1,1002,Ravi Kumar,Postpaid,Mumbai,12.5,40,280,Billing,2025-09-25 10:45,Open,24,One Year,280,No
2,1002,Ravi Kumar,Postpaid,Mumbai,12.5,40,280,Network,2025-09-26 20:40,Open,24,One Year,280,No
3,1003,Sneha Rao,Prepaid,Chennai,7.8,32,210,Support,2025-09-26 11:10,Open,8,Month-to-Month,210,Yes
4,1004,Manoj Singh,Postpaid,Delhi,15.6,55,320,Network,2025-09-25 09:30,Open,30,One Year,320,No


# Check for Missing Values

In [6]:
df.isnull().sum()


customer_id        0
name               0
plan_type          0
region             0
data_used_gb       0
calls_made         0
revenue_inr        0
category           1
created_at         1
status             1
tenure             0
contract_type      0
monthly_charges    0
churn              0
dtype: int64

# Handle Missing Values and Create Features

In [7]:
df["category"] = df["category"].fillna("No Complaint")
df["status"] = df["status"].fillna("None")

df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")

df["has_complaint"] = df["created_at"].notnull().astype(int)

latest_date = df["created_at"].max()

df["complaint_recency_days"] = (latest_date - df["created_at"]).dt.days
df["complaint_recency_days"] = df["complaint_recency_days"].fillna(999)

print("Complaint features created successfully!")
df.head()

Complaint features created successfully!


Unnamed: 0,customer_id,name,plan_type,region,data_used_gb,calls_made,revenue_inr,category,created_at,status,tenure,contract_type,monthly_charges,churn,has_complaint,complaint_recency_days
0,1001,Asha Mehta,Prepaid,Delhi,5.2,25,180,No Complaint,NaT,,12,Month-to-Month,180,Yes,0,999.0
1,1002,Ravi Kumar,Postpaid,Mumbai,12.5,40,280,Billing,2025-09-25 10:45:00,Open,24,One Year,280,No,1,3.0
2,1002,Ravi Kumar,Postpaid,Mumbai,12.5,40,280,Network,2025-09-26 20:40:00,Open,24,One Year,280,No,1,1.0
3,1003,Sneha Rao,Prepaid,Chennai,7.8,32,210,Support,2025-09-26 11:10:00,Open,8,Month-to-Month,210,Yes,1,2.0
4,1004,Manoj Singh,Postpaid,Delhi,15.6,55,320,Network,2025-09-25 09:30:00,Open,30,One Year,320,No,1,3.0


# Create Master Dataset

In [8]:
master = df.groupby("customer_id").agg({
    "category": "count",
    "has_complaint": "max",
    "complaint_recency_days": "min",

    "data_used_gb": "first",
    "calls_made": "first",
    "revenue_inr": "first",

    "tenure": "first",
    "monthly_charges": "first",

    "plan_type": "first",
    "region": "first",
    "contract_type": "first",

    "churn": "first"
}).reset_index()

master.rename(columns={"category": "total_complaints"}, inplace=True)

print("Master dataset ready!")
master.head()

master.to_csv("telecom_master.csv", index=False)

Master dataset ready!


# Encode Target Variable

In [9]:
master["churn"] = master["churn"].astype(str).str.strip().str.title()
master["churn"] = master["churn"].map({"Yes": 1, "No": 0})

master = master.dropna(subset=["churn"])

print("Churn Encoding Done!")
print(master["churn"].value_counts())

Churn Encoding Done!
churn
1    5
0    5
Name: count, dtype: int64


# Create Dummy Variables

In [10]:
df_ml = pd.get_dummies(
    master,
    columns=["plan_type", "region", "contract_type"],
    drop_first=True
)

df_ml.head()


Unnamed: 0,customer_id,total_complaints,has_complaint,complaint_recency_days,data_used_gb,calls_made,revenue_inr,tenure,monthly_charges,churn,...,region_Chandigarh,region_Chennai,region_Delhi,region_Hyderabad,region_Jaipur,region_Kolkata,region_Mumbai,region_Pune,contract_type_One Year,contract_type_Two Year
0,1001,1,0,999.0,5.2,25,180,12,180,1,...,False,False,True,False,False,False,False,False,False,False
1,1002,2,1,1.0,12.5,40,280,24,280,0,...,False,False,False,False,False,False,True,False,True,False
2,1003,1,1,2.0,7.8,32,210,8,210,1,...,False,True,False,False,False,False,False,False,False,False
3,1004,1,1,3.0,15.6,55,320,30,320,0,...,False,False,True,False,False,False,False,False,True,False
4,1005,1,1,2.0,3.4,18,120,6,120,1,...,False,False,False,False,False,True,False,False,False,False


# Prepare Features and Target

In [11]:
X = df_ml.drop(["customer_id", "churn"], axis=1)
y = df_ml["churn"]

print("Feature matrix shape:", X.shape)


Feature matrix shape: (10, 19)


# Split Data into Training and Testing Sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print("Train/Test Split Successful!")


Train/Test Split Successful!


# Train Logistic Regression Model

In [13]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Accuracy: 1.0


# Train Decision Tree Model

In [14]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 1.0


# Train Random Forest Model

In [15]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.5


# Hyperparameter Tuning with Grid Search

In [16]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5]
}

grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring="accuracy"
)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Accuracy: 0.7222222222222222


# Evaluate Best Model

In [18]:
y_pred_best = best_model.predict(X_test)

print("Final Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_best))


Final Accuracy: 0.5

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         0

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Save Model and Feature Names

In [19]:
joblib.dump(best_model, "churn_model.pkl")
joblib.dump(X.columns, "feature_names.pkl")

print("Model saved as churn_model.pkl")
print("Feature names saved as feature_names.pkl")


Model saved as churn_model.pkl
Feature names saved as feature_names.pkl


# Load Model and Make Predictions on New Data

In [21]:
import joblib

model = joblib.load("churn_model.pkl")
feature_names = joblib.load("feature_names.pkl")

new_df = df_ml[df_ml['customer_id'].isin(X_test.index.map(lambda idx: df_ml.iloc[idx]['customer_id']))][['customer_id', 'churn']].copy()

X_new = X_test.copy()

bool_cols = X_new.select_dtypes(include=['bool']).columns
X_new[bool_cols] = X_new[bool_cols].astype(int)

for col in feature_names:
    if col not in X_new.columns:
        X_new[col] = 0

X_new = X_new[feature_names]

new_df["predicted_churn"] = model.predict(X_new)
new_df["churn_probability"] = model.predict_proba(X_new)[:, 1]

print(new_df[["customer_id", "churn", "predicted_churn", "churn_probability"]])


   customer_id  churn  predicted_churn  churn_probability
1         1002      0                1              0.635
8         1009      0                0              0.315


# Calculate Prediction Accuracy

In [22]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(new_df["churn"], new_df["predicted_churn"])

print(f"Overall Accuracy: {accuracy * 100:.2f}%")

Overall Accuracy: 50.00%


# Convert Dummy Columns and Save Predictions

In [23]:
import datetime

today = datetime.date.today()

region_cols = [col for col in new_df.columns if col.startswith('region_')]
if region_cols:
    new_df['region'] = new_df[region_cols].idxmax(axis=1).str.replace('region_', '')
    new_df = new_df.drop(columns=region_cols)

contract_cols = [col for col in new_df.columns if col.startswith('contract_type_')]
if contract_cols:
    contract_values = new_df[contract_cols].idxmax(axis=1).str.replace('contract_type_', '')
    new_df['contract_type'] = contract_values.where(new_df[contract_cols].any(axis=1), 'Month-to-Month')
    new_df = new_df.drop(columns=contract_cols)

filename = f"churn_predictions.csv"
new_df.to_csv(filename, index=False)
print(f"Saved to {filename}")

Saved to churn_predictions.csv


In [24]:
def get_feature_names(model, fallback=None):
    if hasattr(model, "feature_names_in_"):
        return list(model.feature_names_in_)
    return list(fallback) if fallback is not None else None