In [None]:
# 1. Load the necessary libraries

import numpy as np
import pandas as pd

# Import libraries for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Import libraries for experiment tracking
import mlflow
import mlflow.sklearn 
from sklearn.metrics import accuracy_score, precision_score, f1_score

import os
from pathlib import Path

# Keep MLflow's local file store in an "mlruns" folder under the project root.
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started one level below the project
# root (e.g. in a notebooks/ folder) — confirm.
PROJECT_ROOT = os.path.abspath("..")
MLRUNS_PATH = os.path.join(PROJECT_ROOT, "mlruns")

# Build the tracking URI with Path.as_uri() rather than concatenating "file://"
# with the path: as_uri() always emits a well-formed "file:///..." URI, handles
# Windows drive letters and backslashes, and percent-encodes characters such as
# spaces. MLRUNS_PATH is absolute, which as_uri() requires.
mlflow.set_tracking_uri(Path(MLRUNS_PATH).as_uri())

In [2]:
# 2. Load the reference dataset
# The path is relative to the notebook's working directory.
reference_csv = '../data/processed/reference_data.csv'
reference_df = pd.read_csv(reference_csv)

# Show (rows, columns) as the cell output.
reference_df.shape

(4922, 21)

In [3]:
# 3. Encode target variable
# Map the "Yes"/"No" labels in Churn to 1/0.
CHURN_LABELS = {"Yes": 1, "No": 0}

# Only encode while the column still holds the raw labels. Re-running the
# mapping on an already-encoded column would look up 1/0 in CHURN_LABELS, find
# nothing, and silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(reference_df["Churn"]):
    churn_encoded = reference_df["Churn"].map(CHURN_LABELS)

    # Any label outside CHURN_LABELS maps to NaN; fail loudly instead of
    # letting NaN targets flow into the split and the model.
    unmapped = churn_encoded.isna()
    if unmapped.any():
        unexpected = reference_df.loc[unmapped, "Churn"].unique().tolist()
        raise ValueError(f"Unexpected Churn labels: {unexpected}")

    reference_df["Churn"] = churn_encoded.astype("int64")

# Class balance of the encoded target, shown as the cell output.
reference_df["Churn"].value_counts()


Churn
0    3625
1    1297
Name: count, dtype: int64

In [4]:
# 4. Train / test split
# The target is the Churn column; the features are every other column.
y = reference_df["Churn"]
X = reference_df.drop(columns=["Churn"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# 5. Preprocessing pipeline
# Three columns are scaled as numeric features; every remaining feature column
# is treated as categorical and one-hot encoded.
Numerical_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# NOTE(review): this picks up *every* non-numeric column in X. If X contains an
# identifier column (e.g. a customer id), it would be one-hot encoded too —
# confirm against the dataset schema.
numeric_lookup = set(Numerical_cols)
Categorical_cols = [name for name in X.columns if name not in numeric_lookup]

# Categories not seen during fit are ignored at transform time instead of
# raising an error.
numeric_step = ("num", StandardScaler(), Numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), Categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [6]:
# 6. Baseline model
# Chain the preprocessor with a logistic-regression classifier so that fitting
# and prediction both pass through the same transformations.
baseline_classifier = LogisticRegression(solver="liblinear", random_state=42, max_iter=1000)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", baseline_classifier),
    ]
)

In [7]:
# 7. Track the baseline experiment with MLflow

mlflow.set_experiment("Churn_baseline_model")

with mlflow.start_run():
    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions against the held-out labels.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    test_scores = {"accuracy": accuracy, "precision": precision, "f1_score": f1}

    # Record the model type as a run parameter and each score as a run metric.
    mlflow.log_param("model_type", "LogisticRegression")
    for metric_name, metric_value in test_scores.items():
        mlflow.log_metric(metric_name, metric_value)

    # Save the fitted pipeline with the run under the artifact path "model".
    mlflow.sklearn.log_model(model, artifact_path="model")


  return FileStore(store_uri, store_uri)
2025/12/15 19:56:45 INFO mlflow.tracking.fluent: Experiment with name 'Churn_baseline_model' does not exist. Creating a new experiment.
