In [1]:
import os
import sys

# Imported unconditionally so `pd` is always defined after this cell runs,
# not only when the raw file happens to exist.
import pandas as pd

# Project root. Set the CELL2CELL_PROJECT_ROOT environment variable to run the
# notebook on another machine; the default is the original local checkout.
project_root = os.environ.get(
    "CELL2CELL_PROJECT_ROOT",
    r'c:\Users\anuda\Desktop\cell2cell_churn_drift',
)
os.chdir(project_root)
# Guarded so re-running the cell does not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from src.config import RAW_DATA

# Check once and reuse, so the printed answer and the branch below agree.
raw_data_exists = os.path.exists(RAW_DATA)

print("Project Root:", os.getcwd())
print("Looking for:", RAW_DATA)
print("File Exists?", raw_data_exists)

# Quick peek if it exists. `df` is only defined when the file is found.
if raw_data_exists:
    df = pd.read_csv(RAW_DATA)
    print("Success! Shape:", df.shape)
    print("Columns:", df.columns.tolist()[:10])
else:
    print("File not found — double-check download/path!")

    

Project Root: c:\Users\anuda\Desktop\cell2cell_churn_drift
Looking for: data/raw/cell2cellholdout.csv
File Exists? True
Success! Shape: (51047, 58)
Columns: ['CustomerID', 'Churn', 'MonthlyRevenue', 'MonthlyMinutes', 'TotalRecurringCharge', 'DirectorAssistedCalls', 'OverageMinutes', 'RoamingCalls', 'PercChangeMinutes', 'PercChangeRevenues']


In [2]:

import os
import sys

# Project root. Set the CELL2CELL_PROJECT_ROOT environment variable to run the
# notebook on another machine; the default is the original local checkout.
project_root = os.environ.get(
    "CELL2CELL_PROJECT_ROOT",
    r'c:\Users\anuda\Desktop\cell2cell_churn_drift',
)
os.chdir(project_root)
# Guarded so re-running the cell does not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from src.config import TARGET
from src.data.preprocessing import load_and_preprocess, generate_batches

# load_and_preprocess() returns three values; only the processed frame is used
# in this cell. The names `scaler` and `le` are kept for later cells.
df_processed, scaler, le = load_and_preprocess()
print(f"Processed shape: {df_processed.shape}")

# Make sure the output folder exists before writing, so a fresh checkout
# does not fail on the first to_csv call.
batches_dir = "data/batches"
os.makedirs(batches_dir, exist_ok=True)

# Split the processed data into 5 batches and persist each one as CSV,
# reporting its shape and the mean of the target column.
batches = generate_batches(df_processed, n_batches=5)
for i, batch in enumerate(batches):
    batch.to_csv(f"{batches_dir}/batch_{i}.csv", index=False)
    print(f" Batch {i} shape: {batch.shape} | Churn rate: {batch[TARGET].mean():.2%}")

Raw shape: (51047, 58)
Raw Churn sample: ['Yes', 'Yes', 'No', 'No', 'Yes']
Processed Churn distribution:
Churn
0    0.711815
1    0.288185
Name: proportion, dtype: float64
Using 8 numerics
MonthlyRevenue dtype after fix: float64
MonthlyMinutes dtype after fix: float64
TotalRecurringCharge dtype after fix: float64
DirectorAssistedCalls dtype after fix: float64
OverageMinutes dtype after fix: float64
RoamingCalls dtype after fix: float64
PercChangeMinutes dtype after fix: float64
PercChangeRevenues dtype after fix: float64
Processed saved: data/processed/churn_processed.csv
Processed head (Churn + first few):
   Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge
0      1       -0.783096       -0.578622             -1.042504
1      1       -0.940828       -0.973610             -1.252478
2      0       -0.468083       -0.977390             -0.370584
3      0        0.528260        1.487037              1.183229
4      1       -0.937453       -0.992509             -1.252478
Process

In [3]:
# Imports are declared here so the cell does not depend on `pd` having been
# imported as a side effect of an earlier cell.
import pandas as pd

from src.model.drift_detector import detect_drift

# Fraction of each value added back to itself to fake a distribution shift.
DRIFT_FRACTION = 0.2

# Simulate Drift: scale 'OverageMinutes' in batch 2 by (1 + DRIFT_FRACTION).
# NOTE(review): the batch preview printed below shows negative OverageMinutes
# values, so the data looks standardized; this bump then pushes values away
# from zero rather than raising all of them. Confirm that is the intended
# kind of drift.
batch2 = pd.read_csv("data/batches/batch_2.csv")
print("batch",batch2.head(5))
batch2['OverageMinutes'] += batch2['OverageMinutes'] * DRIFT_FRACTION  # Fake shift
batch2.to_csv("data/batches/batch_2_driftsim.csv", index=False)
print("✅ Drift simulated in batch_2_driftsim.csv — now test detection!")

# Quick Test: Detect drift between baseline (batch_0) and drifted batch_2.
# The drifted frame is re-read from disk so the detector sees exactly what
# was written to batch_2_driftsim.csv.
baseline = pd.read_csv("data/batches/batch_0.csv")
drifted = pd.read_csv("data/batches/batch_2_driftsim.csv")

# detect_drift returns a results object plus an overall flag.
drifts, has_drift = detect_drift(baseline, drifted)
print("Drift Results:", drifts)
print(f"Overall Drift Detected: {has_drift}")

batch    Churn  MonthlyRevenue  MonthlyMinutes  TotalRecurringCharge  \
0      0       -0.940828       -0.984950             -1.252478   
1      0        0.275574        0.307741              0.553305   
2      0       -0.312151        0.224585             -0.076620   
3      0       -0.110767       -0.916914              0.763280   
4      0        0.442306        3.214404              1.687169   

   DirectorAssistedCalls  OverageMinutes  RoamingCalls  PercChangeMinutes  \
0              -0.401392       -0.413790     -0.125718           0.068207   
1              -0.068867        0.125288     -0.125718          -0.220196   
2               1.153384       -0.227186     -0.125718           0.403378   
3              -0.401392       -0.413790     -0.125718          -0.099379   
4               0.043472       -0.403423     -0.125718           1.556989   

   PercChangeRevenues  DroppedCalls  ...  ReferralsMadeBySubscriber  \
0            0.030066           0.0  ...                       

In [5]:
import datetime
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier
from src.config import NUMERIC_FEATURES, CATEGORICAL_FEATURES, TARGET, MODEL_PARAMS

# Shared settings for both training runs.
FEATURE_COLUMNS = NUMERIC_FEATURES + CATEGORICAL_FEATURES
TEST_SIZE = 0.2
SEED = 42
MODELS_DIR = "models"

# Create the output folder up front so joblib.dump does not fail on a fresh checkout.
os.makedirs(MODELS_DIR, exist_ok=True)


def _fit_and_score(X_train, X_test, y_train, y_test):
    """Fit an XGBClassifier on the training split and score it on the test split.

    Returns a tuple ``(model, y_pred, f1, auc)`` where ``y_pred`` are the
    predicted labels for ``X_test``, ``f1`` is the F1 score of those labels,
    and ``auc`` is the ROC AUC of the predicted probabilities in column 1.
    """
    model = XGBClassifier(**MODEL_PARAMS, random_state=SEED, eval_metric='logloss')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return model, y_pred, f1, auc


# 1. Baseline on batch_0
baseline_data = pd.read_csv("data/batches/batch_0.csv")
X_base = baseline_data[FEATURE_COLUMNS]
y_base = baseline_data[TARGET]

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_base, y_base, test_size=TEST_SIZE, random_state=SEED
)
model_base, y_pred_b, f1_base, auc_base = _fit_and_score(X_train_b, X_test_b, y_train_b, y_test_b)

print(f"✅ Baseline F1: {f1_base:.3f}, AUC: {auc_base:.3f}")
joblib.dump(model_base, 'models/baseline_model.pkl')

# 2. Retrain on "recent" batches 1-4 (adapts to drift)
recent_data = pd.concat([pd.read_csv(f"data/batches/batch_{i}.csv") for i in range(1, 5)], ignore_index=True)
X_r = recent_data[FEATURE_COLUMNS]
y_r = recent_data[TARGET]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_r, y_r, test_size=TEST_SIZE, random_state=SEED
)
model_r, y_pred_r, f1_r, auc_r = _fit_and_score(X_train_r, X_test_r, y_train_r, y_test_r)

# NOTE(review): the deltas below compare scores measured on different held-out
# sets (a split of batch 0 vs. a split of batches 1-4), so they are not a
# like-for-like comparison of the two models.
print(f"✅ Retrained F1: {f1_r:.3f} (delta: {f1_r - f1_base:+.3f}), AUC: {auc_r:.3f} (delta: {auc_r - auc_base:+.3f})")
joblib.dump(model_r, 'models/retrained_model.pkl')

# 3. Versioning (simple MLOps touch)
# One timestamp for both tags, so they cannot straddle a minute boundary.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
version_base = f"v1_baseline_{timestamp}"
version_re = f"v2_retrained_{timestamp}"
print(f"✅ Versions: {version_base} (F1 {f1_base:.3f}) | {version_re} (F1 {f1_r:.3f})")

✅ Baseline F1: 0.458, AUC: 0.607
✅ Retrained F1: 0.450 (delta: -0.009), AUC: 0.612 (delta: +0.005)
✅ Versions: v1_baseline_20260108_2004 (F1 0.458) | v2_retrained_20260108_2004 (F1 0.450)
