In [8]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load data
demographics_df = pd.read_csv('data/telecom_demographics.csv')
usage_df = pd.read_csv('data/telecom_usage.csv')

In [3]:
# Merge data
churn_df = demographics_df.merge(usage_df, on='customer_id', how = 'inner')

In [4]:
# EDA
churn_rate = churn_df['churn'].value_counts(normalize=True)
print("Churn Rate:\n", churn_rate)

print("\nData Types and Null Counts:")
print(churn_df.info())

Churn Rate:
 churn
0    0.799538
1    0.200462
Name: proportion, dtype: float64

Data Types and Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
 10  calls_made          6500 non-null   int64 
 11  sms_sent            6500 non-null   int64 
 12  data_used           6500 non-null   int64 
 13  churn               6500 non-null   int64 
dtypes: int64(9)

In [5]:
# Preprocess Data
# One-hot encode categorical features
categorical_features = ['telecom_partner', 'gender', 'state', 'city', 'registration_event']
churn_df_encoded = pd.get_dummies(churn_df, columns=categorical_features, drop_first=True)

# Feature scaling (excluding identifier and target columns)
features = churn_df_encoded.drop(['customer_id', 'churn'], axis=1)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Target variable
target = churn_df_encoded['churn']

In [6]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

In [9]:
# Model: Logistic Regression
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train, y_train)

log_reg_preds = log_reg_model.predict(X_test)

print("\nLogistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test, log_reg_preds))
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, log_reg_preds))


Logistic Regression - Confusion Matrix:
[[912 115]
 [243  30]]

Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.79      0.89      0.84      1027
           1       0.21      0.11      0.14       273

    accuracy                           0.72      1300
   macro avg       0.50      0.50      0.49      1300
weighted avg       0.67      0.72      0.69      1300



In [10]:
# Model: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)

print("\nRandom Forest - Confusion Matrix:")
print(confusion_matrix(y_test, rf_preds))
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, rf_preds))


Random Forest - Confusion Matrix:
[[1027    0]
 [ 272    1]]

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       1.00      0.00      0.01       273

    accuracy                           0.79      1300
   macro avg       0.90      0.50      0.45      1300
weighted avg       0.83      0.79      0.70      1300

