In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [None]:
# Load and prepare data
print("Loading and preparing data...")
df = pd.read_csv("../data/milestone2_result_df.csv")

Loading and preparing data...


In [20]:
# Data Cleaning
print("Cleaning data...")
# Drop irrelevant columns
df = df.drop(['Churn Reason', 'Days Since Churn'], axis=1, errors='ignore')

# Handle missing values
df["Income"] = df["Income"].fillna(df["Income"].mean())
df["Credit Score"] = df["Credit Score"].fillna(df["Credit Score"].median())
df = df.dropna()

# Remove duplicates
df = df.drop_duplicates()

Cleaning data...


In [21]:
# Encode categorical variables
print("Encoding categorical variables...")
categorical_cols = ['Gender', 'Marital Status', 'Occupation', 
                   'Education Level', 'Customer Segment', 
                   'Preferred Communication Channel']
df = pd.get_dummies(df, columns=categorical_cols)

Encoding categorical variables...


In [22]:
# Feature scaling
print("Scaling numeric features...")
scaler = StandardScaler()
numeric_cols = ['Income', 'Customer Tenure', 'Balance', 'Age', 'Credit Score']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

Scaling numeric features...


In [23]:
# Split data into features and target
X = df.drop('Churn Flag', axis=1)
y = df['Churn Flag']

# Check class distribution
print("Class distribution:")
print(y.value_counts())
print(f"Class imbalance ratio: {y.value_counts()[0] / y.value_counts()[1]:.2f}")

Class distribution:
Churn Flag
0    101546
1     14094
Name: count, dtype: int64
Class imbalance ratio: 7.20


In [24]:
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

Splitting data into train and test sets...


In [25]:
print("Applying SMOTE to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

Applying SMOTE to handle class imbalance...


In [26]:
print(f"Original training set shape: {X_train.shape}")
print(f"Resampled training set shape: {X_train_resampled.shape}")
print("Resampled class distribution:")
print(pd.Series(y_train_resampled).value_counts())

Original training set shape: (92512, 666)
Resampled training set shape: (162474, 666)
Resampled class distribution:
Churn Flag
0    81237
1    81237
Name: count, dtype: int64


In [27]:
# MODEL SELECTION AND TRAINING
from sklearn.svm import LinearSVC

print("\n=== MODEL SELECTION AND TRAINING ===")

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}



=== MODEL SELECTION AND TRAINING ===


In [28]:
# Train models
trained_models = {}
training_time = {}

In [29]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    import time
    start_time = time.time()
    
    # Train on resampled data
    model.fit(X_train_resampled, y_train_resampled)
    
    end_time = time.time()
    training_time[name] = end_time - start_time
    
    print(f"Training completed in {training_time[name]:.2f} seconds")
    trained_models[name] = model


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training completed in 53.30 seconds

Training Random Forest...
Training completed in 80.99 seconds

Training Gradient Boosting...
Training completed in 202.97 seconds


In [30]:
# Save trained models and data splits for subsequent stages
import pickle

In [31]:
print("\nSaving models and data for evaluation stage...")
with open('churn_models_and_data.pkl', 'wb') as f:
    pickle.dump({
        'models': trained_models,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_names': X.columns.tolist()
    }, f)


Saving models and data for evaluation stage...


In [32]:
print("Model selection and training completed.")
print("Summary of models trained:")
for name, model in trained_models.items():
    print(f"- {name}: Training time = {training_time[name]:.2f} seconds")

Model selection and training completed.
Summary of models trained:
- Logistic Regression: Training time = 53.30 seconds
- Random Forest: Training time = 80.99 seconds
- Gradient Boosting: Training time = 202.97 seconds
