In [None]:
# 📦 Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             confusion_matrix, roc_curve)

import warnings
# Silence library warnings so the notebook output stays readable.
# NOTE(review): this blanket filter hides every warning category, including
# any emitted during cross-validation; consider narrowing it.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old aliases, so a hard-coded "seaborn-darkgrid" raises
# OSError on current installs. Use whichever darkgrid name is available; if
# neither is, the default style is left untouched.
for _style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break


In [None]:
# 🧪 Sample data (5 lines only to keep it executable)
from io import StringIO

# Inline CSV excerpt (header + 5 rows) so the notebook runs without an
# external data file. The leading blank line is skipped by the CSV parser.
sample_data = """
customer_id,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,churned,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
a9b75100-82a8-427a-a208-72f24052884a,51,Other,Basic,14.73,29,Africa,TV,8.99,1,Gift Card,1,0.49,Action
49a5dfd9-7e69-4022-a6ad-0a1b9767fb5b,47,Other,Standard,0.7,19,Europe,Mobile,13.99,1,Gift Card,5,0.03,Sci-Fi
4d71f6ce-fca9-4ff7-8afa-197ac24de14b,27,Female,Standard,16.32,10,Asia,TV,13.99,0,Crypto,2,1.48,Drama
d3c72c38-631b-4f9e-8a0e-de103cad1a7d,53,Other,Premium,4.51,12,Oceania,TV,17.99,1,Crypto,2,0.35,Horror
4e265c34-103a-4dbb-9553-76c9aa47e946,56,Other,Standard,1.89,13,Africa,Mobile,13.99,1,Crypto,2,0.13,Action
"""

# Parse the excerpt and append the engineered feature in one chained step:
# watch_hours divided by (last_login_days + 1); the +1 means a value of 0
# days does not divide by zero.
df = (
    pd.read_csv(StringIO(sample_data))
    .assign(
        watch_per_login=lambda frame: frame['watch_hours'] / (frame['last_login_days'] + 1)
    )
)


In [None]:
# 🎯 Preprocessing
# Separate the `churned` target from the model inputs; `customer_id` is
# removed from the inputs together with the target column.
y = df['churned']
X = df.drop(columns=['customer_id', 'churned'])

# Column groups routed to the two preprocessing branches below.
numeric_features = [
    'age',
    'watch_hours',
    'last_login_days',
    'monthly_fee',
    'number_of_profiles',
    'avg_watch_time_per_day',
    'watch_per_login',
]
categorical_features = [
    'gender',
    'subscription_type',
    'region',
    'device',
    'payment_method',
    'favorite_genre',
]

# Numeric branch: standard scaling.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding with the first level of each
# feature dropped; categories unseen at fit time are ignored at transform time.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [None]:
# 🔍 Modeling
def build_model(clf):
    """Wrap a classifier in a pipeline behind the notebook's preprocessor.

    Parameters
    ----------
    clf : estimator
        Classifier instance placed in the final ``'classifier'`` step.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``'preprocessor'`` (the module-level
        ``preprocessor`` object itself, not a copy) followed by
        ``'classifier'``.
    """
    pipeline_steps = [
        ('preprocessor', preprocessor),
        ('classifier', clf),
    ]
    return Pipeline(pipeline_steps)

# Candidate pipelines: logistic regression and a seeded random forest.
log_pipe = build_model(LogisticRegression(max_iter=1000))
rf_pipe = build_model(RandomForestClassifier(random_state=42))

# Hyper-parameter grids; keys address the pipeline's 'classifier' step.
param_grid_lr = {'classifier__C': [0.1, 1, 10]}
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10],
}

# Both searches share one seeded, shuffled 3-fold stratified splitter and
# are scored by ROC AUC.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
gs_lr = GridSearchCV(
    estimator=log_pipe,
    param_grid=param_grid_lr,
    cv=kf,
    scoring='roc_auc',
    n_jobs=-1,
)
gs_rf = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid_rf,
    cv=kf,
    scoring='roc_auc',
    n_jobs=-1,
)

# Run the logistic-regression search first, then the random-forest search.
for _search in (gs_lr, gs_rf):
    _search.fit(X, y)

print(f"Best LR: {gs_lr.best_params_}, AUC: {gs_lr.best_score_:.4f}")
print(f"Best RF: {gs_rf.best_params_}, AUC: {gs_rf.best_score_:.4f}")


## 📉 PHASE 7: Hypothesis Testing & A/B Testing

In [None]:
from scipy.stats import chi2_contingency, ttest_ind

# Chi-squared test of independence: payment method vs. churn.
contingency = pd.crosstab(index=df['payment_method'], columns=df['churned'])
chi2, p, dof, _ = chi2_contingency(contingency)
print("Chi-squared test for churn vs payment_method")
print("Chi2 =", chi2, " | p-value =", p)

# Two-sample t-test on watch_hours, Premium vs. Basic subscribers, without
# assuming equal variances (equal_var=False).
is_premium = df['subscription_type'] == 'Premium'
is_basic = df['subscription_type'] == 'Basic'
premium = df.loc[is_premium, 'watch_hours']
basic = df.loc[is_basic, 'watch_hours']
t_stat, p_val = ttest_ind(premium, basic, equal_var=False)
print("\nT-test: Premium vs Basic")
print("t =", t_stat, " | p =", p_val)


## 📈 PHASE 8: Model Tuning & Explainability

In [None]:
# Final model & feature importance
# Choose the search with the higher cross-validated AUC. A NaN score is
# treated as worse than any real score: a bare `>` against NaN is always
# False, which would pick LR even when only the RF score is valid.
rf_auc, lr_auc = gs_rf.best_score_, gs_lr.best_score_
rf_is_better = (not np.isnan(rf_auc)) and (np.isnan(lr_auc) or rf_auc > lr_auc)
best_model = gs_rf if rf_is_better else gs_lr

# No second .fit() here: both searches were built with the default
# refit=True, so best_estimator_ is already fitted on all of X, y. Calling
# best_model.fit again would only repeat the entire grid search.
fitted_pipeline = best_model.best_estimator_
fitted_preprocessor = fitted_pipeline.named_steps['preprocessor']
classifier = fitted_pipeline.named_steps['classifier']

# Read the encoder from the *fitted* pipeline. GridSearchCV fits clones, so
# the module-level `preprocessor` is never fitted and has no
# `named_transformers_` attribute.
cat_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_names = cat_encoder.get_feature_names_out(categorical_features)
# Same order as the ColumnTransformer output: numeric block, then one-hot block.
all_feature_names = np.concatenate([numeric_features, cat_names])

# Tree ensembles expose feature_importances_; linear models expose coef_.
# Support both so the cell works whichever search won.
if hasattr(classifier, 'feature_importances_'):
    importances = classifier.feature_importances_
    importance_label = 'Feature importance'
elif hasattr(classifier, 'coef_'):
    # coef_ has one row per class (a single row for binary problems);
    # average the magnitudes across rows to get one value per feature.
    importances = np.abs(classifier.coef_).mean(axis=0)
    importance_label = 'Mean |coefficient| (standardised inputs)'
else:
    raise AttributeError(
        f"{type(classifier).__name__} exposes neither feature_importances_ nor coef_"
    )

feat_imp = pd.Series(importances, index=all_feature_names).sort_values(ascending=True)
ax = feat_imp.plot(kind='barh', figsize=(8, 6), title='Feature Importance')
ax.set_xlabel(importance_label)
plt.tight_layout()
plt.show()
