# Smiles Dataset


In [None]:
#%pip install -U pandas numpy scikit-learn matplotlib seaborn

## Introduction

In [None]:

# Notebook-wide configuration: dataset location and key column names.
CSV_PATH = "./datasets/spotify_churn_dataset.csv"  # file passed to pd.read_csv
TARGET_COL = "is_churned"  # label column
ID_COL = "user_id"  # identifier column


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
# Show up to 120 columns and use a 140-character display width.
pd.options.display.max_columns = 120
pd.options.display.width = 140


In [None]:

# Load the CSV and coerce every known column to an explicit dtype.
# Columns that are absent from the file are skipped instead of raising.

# Read raw (let pandas infer, then coerce precisely)
df_raw = pd.read_csv(CSV_PATH)

# Work on a copy so df_raw keeps the values exactly as read
df = df_raw.copy()

# --- Coerce types explicitly for this schema ---
# Categorical/text: nullable string dtype, surrounding whitespace removed
cat_cols_expected = ["gender", "country", "subscription_type", "device_type"]
for c in cat_cols_expected:
    if c in df.columns:
        df[c] = df[c].astype("string").str.strip()

# Numeric: entries that cannot be parsed become NaN (errors="coerce")
num_cols_expected = [
    "age",
    "listening_time",
    "songs_played_per_day",
    "skip_rate",
    "ads_listened_per_week",
]
for c in num_cols_expected:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# ID (keep as nullable integer for safety; exclude from modeling later)
if ID_COL in df.columns:
    df[ID_COL] = pd.to_numeric(df[ID_COL], errors="coerce").astype("Int64")
    print(f"[info] Found ID column: {ID_COL}")

# offline_listening comes as 0/1 -> nullable boolean
# (.map() turns anything other than 0/1 into a missing value)
if "offline_listening" in df.columns:
    df["offline_listening"] = (
        pd.to_numeric(df["offline_listening"], errors="coerce")
          .map({1: True, 0: False})
          .astype("boolean")
    )

# Target (is_churned) as 0/1 -> nullable int, NaN for anything else
if TARGET_COL in df.columns:
    y_tmp = pd.to_numeric(df[TARGET_COL], errors="coerce")
    # Enforce the 0/1 contract the warning below talks about: numeric values
    # such as 2 or 0.5 are not valid labels, and a non-integer float would
    # make the Int64 cast raise. Comparisons with NaN are False, so values
    # that failed to parse stay missing.
    y_tmp = y_tmp.where((y_tmp == 0) | (y_tmp == 1))
    bad = y_tmp.isna().sum()
    if bad > 0:
        print(f"[warn] {bad} rows in '{TARGET_COL}' could not be parsed as 0/1 (will be NaN).")
    df[TARGET_COL] = y_tmp.astype("Int64")
else:
    print(f"[warn] Target column '{TARGET_COL}' not found.")

print(f"[loaded] {df.shape[0]} rows × {df.shape[1]} columns")
display(df.head(5))


In [None]:
# --- Step 3: dataset overview (also works when nothing is missing) ---
import matplotlib.pyplot as plt

pd.options.display.max_columns = 100
pd.options.display.width = 120

print(f"Rows: {df.shape[0]} | Columns: {df.shape[1]}")
df.info()  # prints by itself and returns None, so it is not passed to display()

# Summary statistics of the numeric columns, one row per column
print("\nNumeric summary:")
numeric_part = df.select_dtypes(include="number")
display(numeric_part.describe().T)

# Ten most frequent values (missing counted too) for up to five text-like columns
cat_cols = list(df.select_dtypes(include=["object", "string", "category"]).columns)
if len(cat_cols) > 0:
    print("\nCategorical top values (first 5 cols):")
    for c in cat_cols[:5]:
        print(f"\n[{c}]")
        top_values = df[c].value_counts(dropna=False)
        display(top_values.head(10))

# Fraction of missing entries per column, largest first
missing = df.isna().mean().sort_values(ascending=False)
has_missing = (missing > 0).any()
if not has_missing:
    print("\n[OK] No missing values detected. Skipping missing-values plot.")
else:
    print("\nMissing values found:")
    display(missing[missing > 0])

    # Horizontal bars, restricted to columns that actually have gaps
    plt.figure(figsize=(7,4))
    affected = missing[missing > 0].sort_values()
    affected.plot.barh()
    plt.title("Missing values per column")
    plt.xlabel("Fraction of missing")
    plt.tight_layout()
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Exploratory plots: class balance of the target, then every numeric and
# low-cardinality categorical feature broken down by the target.

# --- 4.1  Check churn balance ---
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found.")

# Mean of the target column, reported as the churn rate
churn_rate = df[TARGET_COL].mean()
print(f"[INFO] Churn rate: {churn_rate:.2%}")

plt.figure(figsize=(4,3))
sns.countplot(x=TARGET_COL, data=df)
plt.title(f"Churn distribution (rate ≈ {churn_rate:.1%})")
plt.tight_layout()
plt.show()


# --- 4.2  Numeric features vs churn ---
# Exclude the target and the ID column; ID_COL is used instead of a literal
# so this cell follows the notebook configuration like the others.
num_cols = df.select_dtypes(include=["number"]).columns.drop([TARGET_COL, ID_COL], errors="ignore")

print(f"Numeric columns: {list(num_cols)}")

for col in num_cols:
    plt.figure(figsize=(5,3))
    sns.histplot(data=df, x=col, hue=TARGET_COL, kde=True, bins=30, element="step")
    plt.title(f"{col} by churn status")
    plt.tight_layout()
    plt.show()


# --- 4.3  Categorical features vs churn (bar charts) ---
cat_cols = df.select_dtypes(include=["object", "string", "category"]).columns.tolist() + \
           df.select_dtypes(include=["bool"]).columns.tolist()

for col in cat_cols:
    if df[col].nunique(dropna=True) <= 15:  # avoid overcrowding
        plt.figure(figsize=(6,3))
        sns.barplot(
            data=df,
            x=col, y=TARGET_COL,
            estimator=np.mean,
            # bars ordered by how often each category occurs
            order=df[col].value_counts().index
        )
        plt.title(f"Mean churn rate by {col}")
        plt.xticks(rotation=30, ha="right")
        plt.tight_layout()
        plt.show()


## Preprocessing


In [None]:
from sklearn.model_selection import train_test_split

# Build the modeling matrices: drop unlabeled rows, split with stratification,
# and classify the feature columns for the preprocessing step.
# TARGET_COL and ID_COL are the values set in the configuration cell.

# --- 5.1 Separate target and drop ID ---
# The loading step keeps unparseable targets as missing; such rows cannot be
# stratified on or trained with, so they are removed here.
df_labeled = df.dropna(subset=[TARGET_COL])
n_unlabeled = len(df) - len(df_labeled)
if n_unlabeled > 0:
    print(f"[warn] Dropping {n_unlabeled} rows with missing '{TARGET_COL}'.")

X = df_labeled.drop(columns=[TARGET_COL, ID_COL], errors="ignore")
# Plain NumPy-backed ints instead of the nullable Int64 extension dtype
y = df_labeled[TARGET_COL].astype(int)

# --- 5.2 Split train/test (keep churn proportion consistent) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
# Own the split frames so the column assignments below never write into a view
X_train = X_train.copy()
X_test = X_test.copy()

print(f"Train: {X_train.shape} | Test: {X_test.shape}")
print(f"Churn rate train = {y_train.mean():.2%} | test = {y_test.mean():.2%}")

# --- 5.3 Identify column types (for preprocessing) ---
bool_cols = X_train.select_dtypes(include=["bool"]).columns.tolist()

# bool → float (0.0/1.0) BEFORE listing the numeric columns. Converting
# afterwards leaves these columns out of both num_cols and cat_cols, and a
# transformer built from those lists with remainder="drop" then discards them.
# float rather than int so a missing flag becomes NaN instead of raising.
for c in bool_cols:
    X_train[c] = X_train[c].astype("float64")
    X_test[c] = X_test[c].astype("float64")

num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "string", "category"]).columns.tolist()

# Note: the converted boolean columns now also appear in the numeric list
print("\nNumeric cols:", num_cols)
print("Categorical cols:", cat_cols)
print("Boolean cols:", bool_cols)


## Train


In [None]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import numpy as np

# Fit four classifiers behind one shared preprocessing recipe and compare
# them on the held-out split by accuracy and ROC-AUC.

# --- define preprocessing ---
# Numeric: median imputation, then standardization
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical: most-frequent imputation, then one-hot encoding; categories
# unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# NOTE: remainder="drop" discards every column that is in neither
# num_cols nor cat_cols.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ],
    remainder="drop"
)

# --- define models ---
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42, max_depth=None),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "ANN": MLPClassifier(hidden_layer_sizes=(64, 32), activation="relu", max_iter=300, random_state=42)
}

# --- train & evaluate each ---
results = []
for name, model in models.items():
    print(f"\n🧠 Training {name} ...")
    # clone() gives every pipeline its own unfitted preprocessor, so fitting
    # one model never refits an object another pipeline holds.
    pipe = Pipeline(steps=[("preprocess", clone(preprocessor)),
                           ("model", model)])
    pipe.fit(X_train, y_train)

    # predictions (use the PIPELINE, not the bare model)
    y_pred = pipe.predict(X_test)

    # probabilities / scores (again from the PIPELINE)
    try:
        y_prob = pipe.predict_proba(X_test)[:, 1]
    except AttributeError:
        # final estimator has no predict_proba
        if hasattr(pipe, "decision_function"):
            scores = pipe.decision_function(X_test)
            # min-max normalize to [0,1]; the epsilon guards a zero range
            y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-9)
        else:
            # fallback: use the hard predictions as scores
            y_prob = y_pred.astype(float)

    acc = accuracy_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_prob)
    except ValueError:
        # ROC-AUC could not be computed (e.g. only one class in y_test):
        # record NaN so the model sorts last in the summary table
        auc = np.nan

    # Same three-decimal formatting for both metrics
    auc_text = "NA" if np.isnan(auc) else f"{auc:.3f}"
    print(f"Accuracy: {acc:.3f} | ROC-AUC: {auc_text}")
    print(classification_report(y_test, y_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    results.append({"Model": name, "Accuracy": acc, "ROC-AUC": auc})

# Best ROC-AUC first; models without a score go to the bottom
results_df = pd.DataFrame(results).sort_values("ROC-AUC", ascending=False, na_position="last")
display(results_df)

