In [16]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    accuracy_score,
)
import joblib


from sklearn.datasets import make_classification

# Notebook-wide default figure size for every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (6, 4)


In [17]:
def resolve_dataset_path(filename="churn_data.csv", env_var="CHURN_DATA_PATH", max_parent_levels=6, max_walk_depth=3):
    """Locate the dataset file and return its absolute path.

    Search order:
      1. The path stored in the ``env_var`` environment variable, if it points
         to an existing file.
      2. A fixed list of conventional locations under the current working
         directory and each of its parents, plus a few sandbox mount points.
      3. A depth-limited directory walk below the working directory and each
         of its parents.

    Parameters
    ----------
    filename : str
        Base name of the dataset file to look for.
    env_var : str
        Name of the environment variable that may hold an explicit path.
    max_parent_levels : int
        How many parent directories above the working directory to consider.
    max_walk_depth : int or None
        Maximum number of directory levels the fallback walk descends below
        each base directory. ``0`` inspects only the base directory itself;
        ``None`` removes the limit.

    Returns
    -------
    str
        Absolute path of the first match.

    Raises
    ------
    FileNotFoundError
        If no location contains ``filename``.
    """
    # 1) environment variable override
    env_path = os.getenv(env_var)
    if env_path:
        env_path = os.path.expanduser(env_path)
        if os.path.isfile(env_path):
            print(f"Using dataset from environment variable {env_var}: {env_path}")
            return os.path.abspath(env_path)
        print(f"{env_var} is set but file not found at: {env_path}")

    # 2) try common relative paths from current working directory and parents
    cwd = os.getcwd()

    # cwd, its parent, grandparent, ... ; stop early at the filesystem root,
    # where dirname() keeps returning the same path.
    bases = []
    base = cwd
    for _ in range(max_parent_levels + 1):
        if base in bases:
            break
        bases.append(base)
        base = os.path.dirname(base)

    relative_locations = [
        ("dataset",),
        ("data",),
        (),
        ("notebooks",),
        ("notebooks", "Sanjaya_FC211023"),
        ("notebooks", "Sanjaya_FC211023", "dataset"),
    ]
    candidates = [
        os.path.join(base, *parts, filename)
        for base in bases
        for parts in relative_locations
    ]

    # also check some common mount points used in sandboxes
    candidates.extend([
        os.path.join("/workspace", filename),
        os.path.join("/workspace", "dataset", filename),
        os.path.join("/mnt", "data", filename),
        os.path.join("/mnt", "data", "dataset", filename),
    ])
    user = os.getenv("USER")
    if user:
        # Without USER this would degenerate to "/home/<filename>".
        candidates.append(os.path.join("/home", user, filename))

    candidates = list(dict.fromkeys(candidates))  # unique, order preserved

    for p in candidates:
        if os.path.isfile(p):
            print("Resolved dataset at:", p)
            return os.path.abspath(p)

    # 3) depth-limited walk below each base directory
    already_walked = set()
    for base in bases:
        for root, dirs, files in os.walk(base):
            if filename in files:
                found = os.path.join(root, filename)
                print("Resolved dataset via search at:", found)
                return os.path.abspath(found)

            rel = os.path.relpath(root, base)
            depth = 0 if rel == os.curdir else rel.count(os.sep) + 1
            if max_walk_depth is not None and depth >= max_walk_depth:
                # Editing ``dirs`` in place stops os.walk from descending further.
                dirs[:] = []
            else:
                # The subtree of a previously walked base was already searched
                # at least as deeply, so skip it.
                dirs[:] = [d for d in dirs if os.path.join(root, d) not in already_walked]
        already_walked.add(base)

    # Not found
    print("Tried these candidate paths:")
    for p in candidates:
        print("  -", p)
    raise FileNotFoundError(
        f"Could not find '{filename}'. Set {env_var} to the absolute path or place the file in a 'dataset/' folder near your project root.\n"
        f"Current working directory: {cwd}"
    )


In [None]:
# Locate the churn CSV. If it cannot be found, build a synthetic stand-in so
# the remaining cells still run end-to-end.
data_file = None
try:
    data_file = resolve_dataset_path("churn_data.csv")
except FileNotFoundError as missing_err:
    print("\n  Dataset not found:")
    print(missing_err)

    # CHURN_FORCE_FAIL=1 disables the synthetic fallback and re-raises.
    if os.getenv("CHURN_FORCE_FAIL") == "1":
        raise

    print("\nCreating a small synthetic dataset as a fallback so you can run the notebook end-to-end.")
    print("You should replace it with your real churn_data.csv for real results.")

    n_samples = 1000
    X_syn, y_syn = make_classification(
        n_samples=n_samples,
        n_features=8,
        n_informative=4,
        n_redundant=1,
        n_clusters_per_class=1,
        weights=[0.75, 0.25],
        flip_y=0.01,
        random_state=42,
    )

    numeric_names = [f"num_feat_{i}" for i in range(X_syn.shape[1])]
    df_syn = pd.DataFrame(X_syn, columns=numeric_names).assign(
        # categorical-like feature: tercile bins of the first numeric column
        contract_type=lambda frame: pd.qcut(
            frame["num_feat_0"], q=3, labels=["Month-to-month", "One year", "Two year"]
        ),
        # identifier column; a later cell drops ID-like columns
        customerID=[f"C{10000 + i}" for i in range(n_samples)],
        Churn=np.where(y_syn == 1, "Yes", "No"),
    )

    # Write the synthetic frame to the working directory; it is loaded back
    # through the same read_csv call as a real dataset would be.
    fallback_path = os.path.abspath("_synthetic_churn.csv")
    df_syn.to_csv(fallback_path, index=False)
    print(f"Synthetic dataset written to: {fallback_path}")
    data_file = fallback_path

# data_file now refers to either the resolved CSV or the synthetic fallback.
print("\nLoading dataset from:", data_file)
df = pd.read_csv(data_file)
print("Dataset shape:", df.shape)
print(df.head(3))

# Quick sanity check: refuse to continue with zero rows or zero columns.
if 0 in df.shape:
    raise ValueError("Loaded dataset appears empty. Please provide a valid churn_data.csv file.")


In [None]:
PREFERRED_TARGET = "Churn"
possible_targets = [PREFERRED_TARGET, "churn", "Exited", "is_churn", "target", "label"]

# Case-insensitive lookup: lowercased column name -> original column name.
target_col = None
cols_lower = {str(c).lower(): c for c in df.columns}
for cand in possible_targets:
    if cand.lower() in cols_lower:
        target_col = cols_lower[cand.lower()]
        print(f"Using target column: {target_col}")
        break

if target_col is None:
    # fall back to last column but warn
    target_col = df.columns[-1]
    print(f" Could not detect 'Churn' column. Falling back to last column: {target_col}")


def _is_id_like(name, values):
    """Return True when a column looks like a row identifier.

    A plain substring test for "id" would also match ordinary features such as
    "Paid" or "Resident", so the name must contain "id" as a separate token
    ("id", "id_*", "*_id", camelCase "*ID"/"*Id"). A name that merely ends in
    "id" (e.g. "customerid") only counts when the column is non-float and
    holds a distinct value in every row.
    """
    text = str(name)
    lowered = text.lower()
    if lowered == "id" or lowered.startswith("id_") or lowered.endswith("_id"):
        return True
    if text.endswith(("ID", "Id")):
        return True
    return (
        lowered.endswith("id")
        and len(values) > 1
        and not pd.api.types.is_float_dtype(values)
        and values.is_unique
    )


# Drop obvious ID columns to avoid leaking; the target itself is never dropped.
id_like = [c for c in df.columns if c != target_col and _is_id_like(c, df[c])]
if id_like:
    print("Dropping ID-like columns:", id_like)
    df = df.drop(columns=id_like)

X = df.drop(columns=[target_col])
y_raw = df[target_col]

# Convert to binary 0/1. Anything that cannot be mapped with certainty raises
# instead of being silently assigned to a class.
POSITIVE_LABELS = {"yes", "y", "true", "1", "1.0", "churn", "churned", "exited"}
NEGATIVE_LABELS = {"no", "n", "false", "0", "0.0"}

n_missing_target = int(y_raw.isna().sum())
if n_missing_target:
    raise ValueError(
        f"Target column '{target_col}' has {n_missing_target} missing value(s). "
        "Drop or fill those rows before training."
    )

if pd.api.types.is_bool_dtype(y_raw):
    y = y_raw.astype(int)
elif pd.api.types.is_numeric_dtype(y_raw):
    # Check before casting: astype(int) would truncate e.g. 0.7 to 0.
    if not y_raw.isin([0, 1]).all():
        raise ValueError(
            f"Target values are unexpected: {sorted(y_raw.unique())}. "
            "Please make sure target is binary (Yes/No or 0/1)."
        )
    y = y_raw.astype(int)
else:
    # object / string / category labels
    labels = y_raw.astype(str).str.strip().str.lower()
    unknown_labels = sorted(set(labels.unique()) - POSITIVE_LABELS - NEGATIVE_LABELS)
    if unknown_labels:
        raise ValueError(
            f"Target values are unexpected: {unknown_labels}. "
            "Please make sure target is binary (Yes/No or 0/1)."
        )
    y = labels.isin(POSITIVE_LABELS).astype(int)

print("\nTarget distribution:")
print(y.value_counts(dropna=False))

# Basic validation (safety net for the conversions above)
unique_vals = sorted(y.unique())
if not set(unique_vals).issubset({0, 1}):
    raise ValueError(f"Target values are unexpected: {unique_vals}. Please make sure target is binary (Yes/No or 0/1).")


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)

# A frame without feature columns cannot be modelled.
n_feature_columns = X_train.shape[1]
if not n_feature_columns:
    raise ValueError("No feature columns found. Check your dataset columns.")


In [None]:
# Partition the feature columns by dtype: numeric columns are imputed and
# scaled, everything else is imputed and one-hot encoded.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Numeric features ( {len(numeric_features)} ): {numeric_features}")
print(f"Categorical features ( {len(categorical_features)} ): {categorical_features}")


# The dense-output keyword is tried as `sparse_output` first; if the
# installed OneHotEncoder rejects it with a TypeError, `sparse` is used.
ohe_common = {"handle_unknown": "ignore"}
try:
    ohe = OneHotEncoder(sparse_output=False, **ohe_common)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **ohe_common)

numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe),
]
categorical_transformer = Pipeline(categorical_steps)

# Columns not listed in either group are dropped.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")


In [None]:
# RBF-kernel SVM with probability estimates enabled (needed for predict_proba
# in the evaluation cells) and balanced class weights.
svm_settings = dict(
    kernel="rbf",
    probability=True,
    class_weight="balanced",
    random_state=42,
)
svm = SVC(**svm_settings)

# Preprocessing and classifier are fitted together as one estimator.
pipeline_steps = [("preprocessor", preprocessor), ("clf", svm)]
pipeline = Pipeline(pipeline_steps)

print("\nTraining SVM (this may take a bit)...")
pipeline.fit(X_train, y_train)


In [None]:
# Evaluate the fitted pipeline on the held-out test set.
y_pred = pipeline.predict(X_test)

# Probability of the positive class (second column of predict_proba).
# Availability is tracked with an explicit flag rather than inferred from the
# values, and only "method not supported" errors are tolerated; any other
# failure is a real problem and propagates.
has_probabilities = True
try:
    y_prob = pipeline.predict_proba(X_test)[:, 1]
except (AttributeError, NotImplementedError) as proba_err:
    has_probabilities = False
    print(f"predict_proba is not available ({proba_err}); ROC-AUC will be skipped.")
    # Placeholder so later cells can still slice y_prob.
    y_prob = np.zeros_like(y_pred, dtype=float)

# ROC-AUC needs real probabilities and both classes present in y_test.
if not has_probabilities:
    auc_skip_reason = "we could not compute probabilities"
elif len(np.unique(y_test)) < 2:
    auc_skip_reason = "the test set contains a single class"
else:
    auc_skip_reason = None

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob) if auc_skip_reason is None else float("nan")
rep = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\nModel performance summary:")
print("Accuracy:", acc)
print("ROC-AUC:", auc)
print(rep)

# plots: title the axes owned by each display object instead of relying on
# pyplot's "current axes" state.
cm_display = ConfusionMatrixDisplay(cm).plot()
cm_display.ax_.set_title("Confusion Matrix")
plt.show()

if auc_skip_reason is None:
    roc_display = RocCurveDisplay.from_predictions(y_test, y_prob)
    roc_display.ax_.set_title("ROC Curve")
    plt.show()
else:
    print(f"Skipping ROC plot because {auc_skip_reason}.")


In [None]:
# The project root is taken to be the parent of the directory that holds the
# dataset file; metrics and the fitted model are written beneath it.
data_dir = os.path.dirname(os.path.abspath(data_file))
project_root = os.path.abspath(os.path.join(data_dir, os.pardir))
outputs_dir = os.path.join(project_root, "outputs")
models_dir = os.path.join(project_root, "models")
for artifact_dir in (outputs_dir, models_dir):
    os.makedirs(artifact_dir, exist_ok=True)

# NaN is stored as null in metrics.json.
roc_auc_value = None if np.isnan(auc) else float(auc)
metrics = {"accuracy": float(acc), "roc_auc": roc_auc_value}

metrics_path = os.path.join(outputs_dir, "metrics.json")
with open(metrics_path, "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

report_path = os.path.join(outputs_dir, "classification_report.txt")
with open(report_path, "w") as report_file:
    report_file.write(rep)

# Persist the whole pipeline (preprocessing + classifier) as a single file.
model_path = os.path.join(models_dir, "svm_churn_pipeline.joblib")
joblib.dump(pipeline, model_path)
print(f"\nModel saved to: {model_path}")
print(f"Metrics and report saved to: {outputs_dir}")


In [None]:
# A: score() returns a float in [0, 1]
score_val = pipeline.score(X_test, y_test)
print("Score() on test set:", score_val)
score_in_range = isinstance(score_val, float) and 0.0 <= score_val <= 1.0
assert score_in_range, "pipeline.score must be a float between 0 and 1"

# B: predict_proba yields one column per class
proba = pipeline.predict_proba(X_test)
proba_shape_ok = proba.ndim == 2 and proba.shape[1] == 2
assert proba_shape_ok, "predict_proba should return shape (n_samples, 2)"

# C: the serialized model is on disk
assert os.path.isfile(model_path), f"Saved model not found at {model_path}"

# D: metrics.json can be read back and carries the accuracy
metrics_json_path = os.path.join(outputs_dir, "metrics.json")
with open(metrics_json_path, "r") as metrics_in:
    metrics_loaded = json.load(metrics_in)
assert "accuracy" in metrics_loaded, "metrics.json must contain accuracy"

# E: first five test rows next to their true label, predicted label and
# predicted churn probability
preview = X_test.head(5).reset_index(drop=True).assign(
    y_true=y_test.head(5).reset_index(drop=True),
    y_pred=y_pred[:5],
    p_churn=y_prob[:5],
)
print("\nPreview (first 5):")
print(preview)
