In [17]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [18]:
# Read the train and test CSVs from ../data/processed into DataFrames.
train_csv = "../data/processed/train_clean.csv"
test_csv = "../data/processed/test_clean.csv"

train, test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [19]:
# Columns used by the baseline model, grouped by how they are handled:
# string-valued categorical columns and numeric columns.
categorical_cols = [
    "Contract Length",
    "Subscription Type",
    "Gender",
]

numeric_cols = [
    "Total Spend", "Support Calls", "Usage Frequency",
    "Age", "Last Interaction", "Tenure",
]

# Full feature list: categorical columns first, then numeric columns.
baseline_features = [*categorical_cols, *numeric_cols]

In [20]:
# Numeric features: coerce unparseable entries to NaN, then fill missing
# values in BOTH frames with the median computed on the training column only.
for col in numeric_cols:
    train_values = pd.to_numeric(train[col], errors="coerce")
    test_values = pd.to_numeric(test[col], errors="coerce")

    median_val = train_values.median()
    train[col] = train_values.fillna(median_val)
    test[col] = test_values.fillna(median_val)


# Categorical features: missing values become the explicit "Unknown" level.
for frame in (train, test):
    for col in categorical_cols:
        frame[col] = frame[col].fillna("Unknown")

In [58]:
# Preprocessing for the tree pipeline: one-hot encode the categorical columns
# (categories unseen at fit time are ignored, dense output), standardize the
# numeric columns, and drop every column that is not listed.
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numeric_scaler = StandardScaler()

tree_preprocessor = make_column_transformer(
    (categorical_encoder, categorical_cols),
    (numeric_scaler, numeric_cols),
    remainder="drop",
)

In [59]:
# Hyperparameters for the decision tree, collected in one place.
tree_params = {
    "max_depth": 10,          # prevents overfitting
    "min_samples_leaf": 230,
    # "class_weight": "balanced",
    "random_state": 1234,     # fixed seed for reproducible fits
}

tree_model = DecisionTreeClassifier(**tree_params)

In [60]:
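# NOTE: this sweep uses X_train / X_val / y_train / y_val, which are created by
# the train_test_split cell further down; run that cell before uncommenting.
# for depth in [3, 9, 10, 11, 12, 13, 14, 15]: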
#     model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=10, random_state=42)
#     pipe = make_pipeline(tree_preprocessor, model)
#     pipe.fit(X_train, y_train)
#     preds = pipe.predict_proba(X_val)[:, 1]
#     print(depth, roc_auc_score(y_val, preds))


In [61]:
# for leaf in [200, 210, 220, 230, 240, 250]:
#     model = DecisionTreeClassifier(
#         max_depth=10,
#         min_samples_leaf=leaf,
#         random_state=42
#     )
#     pipe = make_pipeline(tree_preprocessor, model)
#     pipe.fit(X_train, y_train)
#     preds = pipe.predict_proba(X_val)[:, 1]
#     print(leaf, roc_auc_score(y_val, preds))


In [62]:
# Chain preprocessing and the decision tree into a single estimator.
tree_pipeline = make_pipeline(tree_preprocessor, tree_model)

In [63]:
# Feature matrix (copied so later edits do not touch `train`) and target.
X = train[baseline_features].copy()
y = train["Churn"]

# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is reproducible.
split_options = dict(test_size=0.2, random_state=1234, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [64]:
# Fit the pipeline on the training split.
tree_fit = tree_pipeline.fit(X_train, y_train)

# Score the validation split: take column 1 of predict_proba as the score
# passed to ROC AUC.
val_proba = tree_fit.predict_proba(X_val)
val_pred_tree = val_proba[:, 1]
auc_tree = roc_auc_score(y_val, val_pred_tree)
auc_tree
# Recorded validation AUC from a previous run: 0.9277137204336369

0.9277137204336369

In [65]:
# Select the same feature columns from the test frame that the model was
# trained on, then keep column 1 of the predicted probabilities.
X_test = test[baseline_features]

test_proba = tree_fit.predict_proba(X_test)
test_pred_tree = test_proba[:, 1]

In [66]:

# Build the submission table: the CustomerID column from the test frame next
# to the predicted scores held in test_pred_tree.
submission_tree = pd.DataFrame({
    "CustomerID": test["CustomerID"],
    "Churn": test_pred_tree
})

# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout fails here after training is done.
submission_path = Path("../data/submissions/tree_baseline.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission_tree.to_csv(submission_path, index=False)