In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random.* draw that follows is
# reproducible after a kernel restart.
np.random.seed(42)

# Dataset size, defined once. `n_samples` is kept as an alias of `N` so code
# that reads either name keeps working and the two can never disagree.
N = 2000   # dataset size

n_samples = N



In [4]:
# Synthetic customer table with n_samples rows. Each feature column is sampled
# independently from NumPy's global RNG. The draws happen in the order the
# columns are listed, so that order must be kept to reproduce the same data
# under a fixed seed.
df = pd.DataFrame(
    data={
        # Zero-padded sequential identifier: C00000, C00001, ...
        "customer_id": [f"C{i:05d}" for i in range(n_samples)],
        # randint's upper bound is exclusive: ages fall in 18..69.
        "age": np.random.randint(low=18, high=70, size=n_samples),
        "gender": np.random.choice(["Male", "Female"], size=n_samples),
        # Exclusive upper bound again: tenure falls in 1..59 months.
        "tenure_months": np.random.randint(low=1, high=60, size=n_samples),
        "avg_monthly_spend": np.random.uniform(low=20, high=150, size=n_samples),
        "num_support_tickets": np.random.poisson(lam=2, size=n_samples),
        "contract_type": np.random.choice(
            ["Month-to-month", "One year", "Two year"],
            size=n_samples,
        ),
        "payment_method": np.random.choice(
            ["Credit card", "Debit card", "E-wallet", "Bank transfer"],
            size=n_samples,
        ),
    }
)

In [5]:
# Rule-based churn label: a customer is marked as churned (1) when at least
# two of the three risk conditions below hold, otherwise 0.
df["churned"] = (
    sum(
        risk_flag.astype(int)
        for risk_flag in (
            df["tenure_months"] < 12,                 # short tenure
            df["num_support_tickets"] > 3,            # many support tickets
            df["contract_type"] == "Month-to-month",  # no long-term contract
        )
    )
    >= 2
).astype(int)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# One-hot encode the categorical columns. drop_first=True drops the first
# level of each feature, which then acts as the implicit baseline category.
df_encoded = pd.get_dummies(
    data=df,
    columns=["gender", "contract_type", "payment_method"],
    drop_first=True,
)

# Target vector, and feature matrix without the identifier and the target.
y = df_encoded["churned"]
X = df_encoded.drop(columns=["customer_id", "churned"])

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shallow random forest; fit() returns the fitted estimator itself.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)

# NOTE(review): `churned` is a deterministic rule over tenure_months,
# num_support_tickets and contract_type, all of which are in X, so a
# near-perfect score here reflects the label construction.
test_predictions = model.predict(X_test)
print("Base Test Accuracy:", accuracy_score(y_test, test_predictions))


Base Test Accuracy: 1.0


In [7]:
def scenario_new_customers(df: pd.DataFrame, max_tenure_months: int = 6) -> pd.DataFrame:
    """Return the rows for recently acquired customers.

    Args:
        df: Customer table with a numeric ``tenure_months`` column.
        max_tenure_months: Inclusive upper bound on tenure. Defaults to 6,
            the cutoff this scenario has always used.

    Returns:
        The rows of ``df`` whose ``tenure_months`` is <= ``max_tenure_months``,
        with their original index labels preserved.
    """
    return df[df["tenure_months"] <= max_tenure_months]

def scenario_high_spenders(df: pd.DataFrame, spend_threshold: float = 120) -> pd.DataFrame:
    """Return the rows for customers with a high average monthly spend.

    Args:
        df: Customer table with a numeric ``avg_monthly_spend`` column.
        spend_threshold: Exclusive lower bound on spend. Defaults to 120,
            the cutoff this scenario has always used.

    Returns:
        The rows of ``df`` whose ``avg_monthly_spend`` is strictly greater
        than ``spend_threshold``, with their original index labels preserved.
    """
    return df[df["avg_monthly_spend"] > spend_threshold]

def scenario_high_support(df: pd.DataFrame, min_tickets: int = 4) -> pd.DataFrame:
    """Return the rows for customers who raised many support tickets.

    Args:
        df: Customer table with a numeric ``num_support_tickets`` column.
        min_tickets: Inclusive lower bound on the ticket count. Defaults to
            4, the cutoff this scenario has always used.

    Returns:
        The rows of ``df`` whose ``num_support_tickets`` is >= ``min_tickets``,
        with their original index labels preserved.
    """
    return df[df["num_support_tickets"] >= min_tickets]

def scenario_long_term(df: pd.DataFrame, tenure_threshold_months: int = 36) -> pd.DataFrame:
    """Return the rows for long-standing customers.

    Args:
        df: Customer table with a numeric ``tenure_months`` column.
        tenure_threshold_months: Exclusive lower bound on tenure. Defaults
            to 36, the cutoff this scenario has always used.

    Returns:
        The rows of ``df`` whose ``tenure_months`` is strictly greater than
        ``tenure_threshold_months``, with their original index labels
        preserved.
    """
    return df[df["tenure_months"] > tenure_threshold_months]

# Map each scenario name to the subset of `df` picked by its selector
# function. Insertion order follows the table below.
scenario_datasets = {
    label: select_rows(df)
    for label, select_rows in (
        ("new_customers", scenario_new_customers),
        ("high_spenders", scenario_high_spenders),
        ("high_support", scenario_high_support),
        ("long_term", scenario_long_term),
    )
}


In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Score the fitted model on every scenario subset and collect one metrics
# record per scenario.
#
# NOTE(review): the scenario subsets are cut from the full `df`, most of which
# was also used to fit `model`, so these scores are not held-out estimates.
# Restrict the subsets to X_test.index if unseen-data performance is wanted.
evaluation_results = []

for scenario_name, scenario_df in scenario_datasets.items():

    # Classification metrics are undefined on an empty slice, so skip it.
    if len(scenario_df) == 0:
        continue

    # Take the scenario's rows from the frame that was encoded for training
    # instead of re-running get_dummies on the subset. With drop_first=True a
    # subset that lacks one category level would drop a *different* level,
    # and zero-filling the missing columns afterwards would silently encode
    # those rows as the training baseline category. Slicing by index keeps
    # the encoding identical to what the model was fitted on.
    scenario_encoded = df_encoded.loc[scenario_df.index]

    X_scenario = scenario_encoded.drop(columns=["customer_id", "churned"])
    y_true = scenario_encoded["churned"]

    y_pred = model.predict(X_scenario)

    # zero_division=0 makes the fallback explicit: when a slice has no
    # positive labels or predictions the ill-defined metric is reported as 0
    # (the value the default setting also returns) without a warning.
    evaluation_results.append({
        "scenario": scenario_name,
        "sample_size": len(y_true),
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0)
    })

# One row per evaluated scenario; the bare expression renders it as a table.
results_df = pd.DataFrame(evaluation_results)
results_df


Unnamed: 0,scenario,sample_size,accuracy,precision,recall,f1_score
0,new_customers,204,1.0,1.0,1.0,1.0
1,high_spenders,445,1.0,1.0,1.0,1.0
2,high_support,280,1.0,1.0,1.0,1.0
3,long_term,813,1.0,1.0,1.0,1.0
