In [24]:
import os

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [15]:
# Locate the data folder relative to the kernel's working directory:
# project_root is the parent of the cwd, and the CSVs live under
# <project_root>/notebooks/data.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "notebooks", "data")

# Resolve both input paths first, then load each preprocessed split.
train_csv_path = os.path.join(processed_dir, "train_preprocessed.csv")
test_csv_path = os.path.join(processed_dir, "test_preprocessed.csv")

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [19]:
# Stack the train and test frames row-wise into one frame.
# ignore_index=True replaces both original indexes with a fresh 0..n-1 range.
full_df = pd.concat(
    objs=[df_train, df_test],
    axis=0,
    ignore_index=True,
)
# Preview the first rows as the cell output.
full_df.head()

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,27.0,1,14.0,28.0,3.0,16.0,2,1,862.0,9.0,1.0
1,40.0,1,19.0,2.0,8.0,28.0,2,1,620.81,21.0,1.0
2,27.0,0,57.0,3.0,0.0,24.0,2,0,915.0,26.0,1.0
3,40.0,1,21.0,14.0,0.0,11.0,0,0,592.83,9.0,0.0
4,39.0,1,58.0,4.0,2.0,8.0,2,1,694.0,15.0,1.0


In [20]:
# (rows, columns) of the combined train + test frame.
full_df.shape

(505206, 11)

In [21]:
# Class balance: number of rows for each distinct value of the "Churn" label.
full_df["Churn"].value_counts()

Churn
1.0    280492
0.0    224714
Name: count, dtype: int64

In [22]:
# Shuffle the combined rows with a fixed seed and renumber the index 0..n-1.
#
# The frame is rebuilt from `df_train` / `df_test` here instead of reshuffling
# `full_df` in place: `full_df = shuffle(full_df, ...)` would permute an
# already-permuted frame every time the cell is re-run, silently changing the
# row order (and therefore every seeded split derived from it). Rebuilding
# makes the cell idempotent while producing the same result on a first run.
#
# `shuffle` is imported with the other dependencies in the import cell.
full_df = shuffle(
    pd.concat([df_train, df_test], ignore_index=True),
    random_state=42,
).reset_index(drop=True)
full_df.head()

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,22.0,0,24.0,21.0,1.0,12.0,1,0,749.4,1.0,0.0
1,50.0,1,34.0,28.0,10.0,27.0,1,0,209.0,22.0,1.0
2,44.0,1,51.0,13.0,9.0,2.0,0,2,301.38,6.0,1.0
3,27.0,0,39.0,17.0,3.0,2.0,1,0,267.0,5.0,1.0
4,28.0,1,36.0,3.0,2.0,12.0,0,2,914.8,10.0,0.0


In [31]:
# ---------------------------------------------------------------------------
# 1) Split: 60% train / 20% validation / 20% test, stratified on the label.
# ---------------------------------------------------------------------------
X = full_df.drop(columns=["Churn"])
y = full_df["Churn"]

# Hold out 20% of all rows as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# 25% of the remaining 80% -> 20% of all rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full,
    test_size=0.25,
    random_state=42,
    stratify=y_train_full,
)
# Every row must land in exactly one of the three splits.
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

# ---------------------------------------------------------------------------
# 2) Scale: the scaler is fitted on the training split only, then the same
#    fitted transform is applied to validation and test.
# ---------------------------------------------------------------------------
scaler = StandardScaler()
scaler.fit(X_train)


def _scaled_frame(features):
    """Return `features` transformed by the fitted `scaler` as a DataFrame.

    Column names and the row index of `features` are kept. Keeping the index
    (rather than letting pandas assign a fresh 0..n-1 range) means the scaled
    frame stays label-aligned with the matching `y_*` Series, which still
    carries the row labels of `full_df`.
    """
    return pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )


X_train_scaled = _scaled_frame(X_train)
X_valid_scaled = _scaled_frame(X_valid)
X_test_scaled = _scaled_frame(X_test)
assert X_train_scaled.index.equals(y_train.index)

# ---------------------------------------------------------------------------
# 3) Persist: write features and labels next to the input CSVs.
#    `processed_dir` is the same <project_root>/notebooks/data folder the
#    inputs were read from. index=False drops the row labels, so each X file
#    and its y file line up by row position.
# ---------------------------------------------------------------------------
split_outputs = [
    (X_train_scaled, "X_train_scaled_merged.csv"),
    (y_train, "y_train_merged.csv"),
    (X_valid_scaled, "X_valid_scaled_merged.csv"),
    (y_valid, "y_valid_merged.csv"),
    (X_test_scaled, "X_test_scaled_merged.csv"),
    (y_test, "y_test_merged.csv"),
]
for split_data, csv_name in split_outputs:
    split_data.to_csv(os.path.join(processed_dir, csv_name), index=False)



In [26]:
# Report the (rows, columns) of each scaled split in a single print call.
summary_lines = [
    "\nSplitting and scaling complete!",
    f"Final Training set shape: {X_train_scaled.shape}",
    f"Validation set shape: {X_valid_scaled.shape}",
    f"Test set shape: {X_test_scaled.shape}",
]
print("\n".join(summary_lines))



Splitting and scaling complete!
Final Training set shape: (303123, 10)
Validation set shape: (101041, 10)
Test set shape: (101042, 10)
