In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)


In [8]:
# Load the pre-cleaned train/test splits (paths are relative to the notebook's directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_clean.csv",
        "../data/processed/test_clean.csv",
    )
)

In [5]:
# Cut-offs used by the rule-based baseline: the median of each column over
# the whole training frame.
low_spend_threshold, usage_median, age_median = (
    train[column].median()
    for column in ("Total Spend", "Usage Frequency", "Age")
)

# Disabled: missing-value indicator for "Payment Delay" (1 when the value is NaN).
#train["PaymentDelayMissing"] = train["Payment Delay"].isna().astype(int)
#test["PaymentDelayMissing"] = test["Payment Delay"].isna().astype(int)


In [6]:
def baseline_churn_probability(
    row,
    spend_threshold=None,
    usage_threshold=None,
    age_threshold=None,
):
    """Score one customer with a hand-written, additive churn heuristic.

    Starts from a base probability of 0.25 and adds a fixed bump for every
    rule the customer triggers:

    * ``"Contract Length" == "Monthly"``               -> +0.25
    * ``"Total Spend"`` strictly below the spend cut-off  -> +0.15
    * ``"Support Calls"`` strictly greater than 2         -> +0.10
    * ``"Usage Frequency"`` strictly below the usage cut-off -> +0.10
    * ``"Age"`` strictly above the age cut-off            -> +0.05

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the five column names above, e.g. the
        ``pandas.Series`` handed over by ``DataFrame.apply(..., axis=1)``.
    spend_threshold, usage_threshold, age_threshold : number, optional
        Explicit cut-offs. When left as ``None`` the notebook-level
        ``low_spend_threshold``, ``usage_median`` and ``age_median`` are
        looked up at call time, so existing single-argument callers keep
        their behaviour.

    Returns
    -------
    float
        Heuristic churn probability, clamped to ``[0.01, 0.99]``.

    Raises
    ------
    NameError
        If a cut-off is not supplied and the corresponding notebook global
        has not been defined yet.
    """
    # Fall back to the notebook globals only for cut-offs the caller omitted.
    if spend_threshold is None:
        spend_threshold = low_spend_threshold
    if usage_threshold is None:
        usage_threshold = usage_median
    if age_threshold is None:
        age_threshold = age_median

    prob = 0.25  # base probability

    # Disabled rule: +0.40 when row["PaymentDelayMissing"] == 1
    # (needs the PaymentDelayMissing indicator column).

    # NOTE: the bumps are added in a fixed order so the floating-point
    # result stays identical from run to run.

    # monthly contracts churn more
    if row["Contract Length"] == "Monthly":
        prob += 0.25

    # low spenders churn more
    if row["Total Spend"] < spend_threshold:
        prob += 0.15

    # more support calls -> more churn
    if row["Support Calls"] > 2:
        prob += 0.10

    # low usage -> more churn
    if row["Usage Frequency"] < usage_threshold:
        prob += 0.10

    # older customers churn more
    if row["Age"] > age_threshold:
        prob += 0.05

    # Safety net: with the current weights the score already lies in
    # [0.25, 0.90], so this only matters if the weights are changed.
    return max(0.01, min(0.99, prob))


In [7]:
# Columns read by baseline_churn_probability; keep this list in sync with its rules.
baseline_features = [
    "Contract Length",
    "Total Spend",
    "Support Calls",
    "Usage Frequency",
    "Age",
    #"PaymentDelayMissing"
]

X_base = train[baseline_features]
y = train["Churn"]

# train_test_split and roc_auc_score come from the notebook's top import
# cell; they are deliberately not re-imported here.
# The heuristic has no fit step, so the training fold (X_train_b, y_train_b)
# is produced only as a by-product of the stratified split.
# NOTE(review): the median cut-offs used by the heuristic are computed on the
# full `train` frame, which includes the validation rows scored below.
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(
    X_base, y, test_size=0.2, random_state=1234, stratify=y
)

# Compute probabilities on validation set
val_probs_base = X_val_b.apply(baseline_churn_probability, axis=1)

# ROC AUC of the heuristic scores against the true validation labels.
auc_base = roc_auc_score(y_val_b, val_probs_base)
auc_base

0.78960169627965

In [None]:
# Score every test row with the rule-based baseline.
X_test_base = test.loc[:, baseline_features]
test_pred_base = X_test_base.apply(baseline_churn_probability, axis=1)

# Submission frame: customer identifier plus the heuristic churn probability.
submission = pd.DataFrame({"CustomerID": test["CustomerID"]})
submission["Churn"] = test_pred_base

# Written to the notebook's working directory, without the index column.
submission.to_csv("baseline.csv", index=False)
submission.head()


Unnamed: 0,CustomerID,Churn
0,146773,0.65
1,21394,0.4
2,411099,0.35
3,239666,0.5
4,35032,0.85
