In [59]:
print(data.columns.tolist())


['customer_id', 'gender', 'age', 'under_30', 'senior_citizen', 'married', 'dependents', 'number_of_dependents', 'country', 'state', 'city', 'population', 'quarter', 'referred_a_friend', 'number_of_referrals', 'tenure_in_months', 'offer', 'phone_service', 'avg_monthly_long_distance_charges', 'multiple_lines', 'internet_service', 'internet_type', 'avg_monthly_gb_download', 'online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data', 'contract', 'paperless_billing', 'payment_method', 'monthly_charge', 'total_charges', 'total_refunds', 'total_extra_data_charges', 'total_long_distance_charges', 'total_revenue', 'satisfaction_score', 'customer_status', 'churn_label', 'churn_score', 'cltv', 'churn_category', 'churn_reason']


In [1]:
# ======================================================
# Step 1 — Imports
# ======================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
import joblib

sns.set(style="whitegrid")

# ======================================================
# Step 2 — Load Data
# ======================================================
df = pd.read_csv("telco.csv")

print("Rows, columns:", df.shape)
print(df.head(3))

# ======================================================
# Step 3 — Clean Column Names
# ======================================================
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(' ', '_', regex=True)
      .str.replace(r'[^\w]', '', regex=True)
)

print("Cleaned column names:", df.columns.tolist()[:10])

# ======================================================
# Step 4 — Create Target Variable
# ======================================================
df['churn_numeric'] = df['churn_label'].map({'Yes': 1, 'No': 0})
print(df['churn_numeric'].value_counts(normalize=True))

# ======================================================
# Step 5 — Drop Irrelevant Columns
# ======================================================
drop_cols = [
    'customer_id', 'zip_code', 'latitude', 'longitude',
    'churn_label', 'customer_status', 'churn_score',
    'churn_category', 'churn_reason'
]

df_model = df.drop(columns=drop_cols, errors='ignore')
print("Model dataset shape:", df_model.shape)

# ======================================================
# Step 6 — Prepare Features
# ======================================================
X = df_model.drop(columns=['churn_numeric'])
y = df_model['churn_numeric']

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ]
)

# ======================================================
# Step 7 — Train-Test Split
# ======================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# ======================================================
# Step 8 — Model Comparison
# ======================================================
results = {}

print("\n========== MODEL COMPARISON ==========\n")

# ---------------- Logistic Regression ----------------
print("✅ Training Logistic Regression")
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
lr_pipeline.fit(X_train, y_train)

lr_pred = lr_pipeline.predict(X_test)
lr_proba = lr_pipeline.predict_proba(X_test)[:, 1]

lr_acc = accuracy_score(y_test, lr_pred)
lr_auc = roc_auc_score(y_test, lr_proba)

print(f"Accuracy: {lr_acc:.4f}")
print(f"ROC AUC: {lr_auc:.4f}")
print("Report:\n", classification_report(y_test, lr_pred))

results['Logistic Regression'] = (lr_acc, lr_auc)

# ---------------- Random Forest ----------------
print("\n✅ Training Random Forest")
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=42))
])
rf_pipeline.fit(X_train, y_train)

rf_pred = rf_pipeline.predict(X_test)
rf_proba = rf_pipeline.predict_proba(X_test)[:, 1]

rf_acc = accuracy_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_proba)

print(f"Accuracy: {rf_acc:.4f}")
print(f"ROC AUC: {rf_auc:.4f}")
print("Report:\n", classification_report(y_test, rf_pred))

results['Random Forest'] = (rf_acc, rf_auc)

# ---------------- XGBoost ----------------
print("\n✅ Training XGBoost")
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss'
)

xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', xgb_clf)
])

xgb_pipeline.fit(X_train, y_train)

xgb_pred = xgb_pipeline.predict(X_test)
xgb_proba = xgb_pipeline.predict_proba(X_test)[:, 1]

xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_auc = roc_auc_score(y_test, xgb_proba)

print(f"Accuracy: {xgb_acc:.4f}")
print(f"ROC AUC: {xgb_auc:.4f}")
print("Report:\n", classification_report(y_test, xgb_pred))

results['XGBoost'] = (xgb_acc, xgb_auc)

# ======================================================
# Step 9 — Save Best Model (XGBoost)
# ======================================================
print("\n✅ Final XGBoost model saved as churn_xgb_pipeline.pkl")
joblib.dump(xgb_pipeline, "churn_xgb_pipeline.pkl")

# ======================================================
# Step 10 — Quick Sample Prediction
# ======================================================
sample = X_test.head(5)
pred = xgb_pipeline.predict(sample)
proba = xgb_pipeline.predict_proba(sample)[:, 1]

print("Sample Predictions:", pred)
print("Sample Probabilities:", np.round(proba, 3))

# ======================================================
# Notebook Complete
# ======================================================
print("\n✅ Notebook finished — ready for Streamlit!")


Rows, columns: (7043, 50)
  Customer ID  Gender  Age Under 30 Senior Citizen Married Dependents  \
0  8779-QRDMV    Male   78       No            Yes      No         No   
1  7495-OOKFY  Female   74       No            Yes     Yes        Yes   
2  1658-BYGOY    Male   71       No            Yes      No        Yes   

   Number of Dependents        Country       State  ...  \
0                     0  United States  California  ...   
1                     1  United States  California  ...   
2                     3  United States  California  ...   

  Total Extra Data Charges  Total Long Distance Charges  Total Revenue  \
0                       20                         0.00          59.65   
1                        0                       390.80        1024.10   
2                        0                       203.94        1910.88   

   Satisfaction Score  Customer Status Churn Label Churn Score  CLTV  \
0                   3          Churned         Yes          91  5433   
1  

In [6]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Load dataset
df = pd.read_csv("telco.csv")

# Preprocessing
df.columns = df.columns.str.lower().str.replace(" ", "_")
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')
df['total_charges'].fillna(df['total_charges'].median(), inplace=True)
df['churn_label'] = df['churn_label'].map({'Yes':1, 'No':0})

# Encode categorical columns
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Safely remove target if present
if "churn_label" in categorical_cols:
    categorical_cols.remove("churn_label")


for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Create X / y
X = df.drop("churn_label", axis=1)
y = df["churn_label"]

# Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)

# Extract importances
feat_imp = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Save only the feature importance table
pickle.dump(feat_imp, open("rf_feature_importance.pkl", "wb"))

print("✅ Feature importance file saved: rf_feature_importance.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_charges'].fillna(df['total_charges'].median(), inplace=True)


✅ Feature importance file saved: rf_feature_importance.pkl
