# Day 3


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw Telco customer-churn CSV; the bare `df` on the last line renders
# the frame as this cell's output.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative data directory so the notebook runs on other machines.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Hrishikesh/churn-mlops-platform/data/raw/Telco-Customer-Churn.csv"
)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [3]:
# Parse TotalCharges as a number; values that cannot be parsed become NaN
# (errors="coerce"), and every row containing any NaN is then discarded.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.assign(TotalCharges=total_charges_numeric).dropna()

# Define Target and Features 

In [4]:
# Name of the label column that the models learn to predict.
Target: str = "Churn"

In [5]:
# Label vector: encode the Yes/No target as 1/0 (any other value maps to NaN).
label_encoding = {"Yes": 1, "No": 0}
y = df[Target].map(label_encoding)

# Feature matrix: everything except the label and the customerID identifier.
non_feature_cols = [Target, "customerID"]
X = df.drop(columns=non_feature_cols)

# Splitting the Columns by type

In [6]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude=["object"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

In [7]:
# Show both column groups (categorical first, then numerical), one per line.
print(categorical_cols, numerical_cols, sep="\n")

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')


# Building the Pre-processing Pipeline

In [8]:
# Column-wise preprocessing:
#   "num" standardises the numerical columns;
#   "Cat" one-hot encodes the categorical columns, and handle_unknown="ignore"
#   makes categories unseen during fit encode as all zeros instead of raising.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("Cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Train/Test Split

In [9]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the churn
# rate the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Importing the Additional Modules

In [10]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score

# Define the models to Compare

In [11]:
# Candidate classifiers to compare, keyed by a short model name.
# n_jobs=-1 tells the random forest to use all available CPU cores for its
# parallel work (fitting and predicting).
models = dict(
    random_forest=RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        n_jobs=-1,
        random_state=42,
    ),
    gradient_boosting=GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
    ),
)

# Loop Through Models + Log to MLflow

In [14]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-3.9.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.9.0 (from mlflow)
  Using cached mlflow_skinny-3.9.0-py3-none-any.whl.metadata (32 kB)
Collecting mlflow-tracing==3.9.0 (from mlflow)
  Using cached mlflow_tracing-3.9.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Using cached flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.18.3-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting huey<3,>=2.5.4 (from mlflow)
  Using cached huey-2.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting skops<1 (from mlflow)
  Using cached skops-0.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.2-p

# Train + Log

In [12]:
import os
import mlflow
import mlflow.sklearn

In [13]:
# --- Tracking configuration -------------------------------------------------
# Default to the local file store, but let the MLFLOW_TRACKING_URI environment
# variable override it (e.g. http://127.0.0.1:5000 for a running tracking
# server) so the cell does not have to be edited per machine.
MLFLOW_PATH = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or "file:///C:/Users/Hrishikesh/churn-mlops-platform/mlruns"
)

mlflow.set_tracking_uri(MLFLOW_PATH)

experiment_name = "churn_model_comparison"
mlflow.set_experiment(experiment_name)

print("MLflow tracking at:",mlflow.get_tracking_uri())
print("Experiment:", experiment_name)

# --- Train, evaluate and log one MLflow run per candidate model ---------------
for name, model in models.items():
    # Preprocessing and estimator are bundled so the logged artifact accepts
    # raw feature rows at inference time.
    pipe = Pipeline([("prep", preprocessor),
                     ("model", model)])

    with mlflow.start_run(run_name=name):
        # Log the configuration before fitting so a run that fails during
        # training still records which hyperparameters were attempted.
        mlflow.log_param("model", name)
        mlflow.log_params(model.get_params())

        pipe.fit(X_train, y_train)

        preds = pipe.predict(X_test)
        # Column 1 of predict_proba is the probability of class 1 (churn = "Yes").
        proba = pipe.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, proba)
        f1 = f1_score(y_test, preds)
        mlflow.log_metrics({"roc_auc": auc, "f1": f1})

        # MLflow 3.x: `name` replaces the deprecated `artifact_path` argument.
        mlflow.sklearn.log_model(pipe, name="model")

        print(f"{name} = ROC_AUC:{auc:.4f}, F1:{f1:.4f}")

  return FileStore(store_uri, store_uri)


MLflow tracking at: file:///C:/Users/Hrishikesh/churn-mlops-platform/mlruns
Experiment: churn_model_comparison


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


random_forest = ROC_AUC:0.8302, F1:0.5676


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


gradient_boosting = ROC_AUC:0.8392, F1:0.5764


In [None]:
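# For churn prediction:
# ROC_AUC = ranking quality (how well churners are scored above non-churners)
# F1 = balance between catching churners and raising false alarms
# Pick the model with the higher metrics; in the run above, gradient boosting
# leads on both (ROC_AUC 0.8392 vs 0.8302, F1 0.5764 vs 0.5676).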

# Gradient Boosting : Faster inference, Low Memory 
# Random Forest : Easier Tuning, More Stable

In [None]:
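# To browse the logged runs, start the MLflow UI from a terminal:
#   mlflow ui --backend-store-uri file:///C:/Users/Hrishikesh/churn-mlops-platform/mlruns
# then open the URL it prints (http://127.0.0.1:5000 by default).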

