In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib
import warnings
# Install a catch-all "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this also hides convergence/deprecation warnings from the
# modelling libraries — confirm a blanket ignore is really intended.
warnings.simplefilter("ignore")

# None removes the cap on how many columns pandas renders for a frame.
pd.options.display.max_columns = None

In [None]:
# Read the churn CSV into `df`. The path is relative, so the file must sit in
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")

# Preview the first five rows as the cell's output.
df.head(n=5)

In [None]:
# Clean the loaded frame and rebind `df` to the result.
#
# Written to be safe to re-run: the label mapping is skipped once Churn is
# already numeric, and the customerID drop tolerates the column being gone.
df = (
    df
    # Entries of TotalCharges that cannot be parsed as numbers become NaN ...
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"))
    # ... and the rows where TotalCharges is NaN are discarded.
    .dropna(subset=["TotalCharges"])
    # customerID is removed from the frame; errors="ignore" makes a repeat
    # run a no-op instead of a KeyError.
    .drop(columns=["customerID"], errors="ignore")
)

# Encode the target as 1 ("Yes") / 0 ("No"), but only while it is still text.
# Mapping an already-encoded column again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df = df.assign(Churn=df["Churn"].map({"Yes": 1, "No": 0}))

df.head()

In [None]:
# Columns routed to the numeric branch of the preprocessing step.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Every other feature column (everything except the Churn target) is treated
# as categorical. Selecting by `dtypes == "object"` would miss text columns
# stored with pandas' dedicated string dtype as well as integer-coded flags,
# and a ColumnTransformer drops any column that appears in neither list.
# NOTE(review): in the public Telco dataset this picks up the 0/1
# SeniorCitizen flag — confirm against df.dtypes that no other numeric
# column ends up here unintentionally.
cat_cols = [col for col in df.columns if col != "Churn" and col not in num_cols]

cat_cols

In [None]:
# Feature preprocessing: one-hot encode the columns in cat_cols (categories
# not seen during fit are ignored at predict time) and standardise the
# columns in num_cols.
preprocess = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('scaler', StandardScaler(), num_cols)
])

# Full model: preprocessing -> SMOTE oversampling -> random forest.
# The imblearn pipeline is used because a plain sklearn Pipeline does not
# accept resampling steps such as SMOTE.
# Both stochastic steps are seeded (same seed as the train/test split) so that
# metrics and the saved model are reproducible across Restart & Run All.
model = ImbPipeline([
    ('preprocess', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])

In [None]:
# Separate the feature columns from the Churn target.
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the churn rate
# the same in the train and test splits, so the reported metrics are not
# skewed by an unlucky draw on the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model.fit(X_train, y_train)

# Hard class predictions feed the classification report; the second
# predict_proba column (the probability of the second class in
# model.classes_) feeds ROC-AUC.
preds = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, preds))
print("ROC-AUC:", roc_auc_score(y_test, probs))

In [None]:
# Serialise the `model` pipeline to model.pkl (relative path, so it lands in
# the working directory). The cell output is joblib.dump's return value.
joblib.dump(value=model, filename="model.pkl")