# Telco Customer Churn: Model Development

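This notebook walks through the model development process: training a baseline model and ensemble models, tuning hyperparameters, comparing the models on the held-out test set, cross-validating the best candidate, and inspecting its feature importances.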

## 1. Setup and Data Preparation

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Add the project root (the parent of the notebook's working directory) to the
# path so that the `src` package below can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Data and Preprocessing
from src.data_loader import TelcoDataLoader
from src.preprocessor import DataPreprocessor

# Models
from src.base_model import LogisticRegressionModel
from src.ensemble_models import RandomForestChurnModel, XGBoostChurnModel
from src.advanced_ensemble import StackingChurnModel

# Evaluation and Tuning
from src.model_evaluator import ModelEvaluator
from src.hyperparameter_tuner import HyperparameterTuner
from src.cross_validation import CrossValidator
from src.visualization import plot_feature_importance

# Load config (path is relative to the notebook's working directory).
# An explicit encoding keeps the read independent of the platform default.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Load and preprocess data; the raw frame is copied so `df` stays untouched
loader = TelcoDataLoader()
df = loader.load_raw_data()
df_processed = DataPreprocessor.preprocess_data(df.copy())

# Create features and target: 'Yes' -> 1, anything else -> 0
target_col = config['target']
X = df_processed.drop(columns=target_col)
y = df_processed[target_col].eq('Yes').astype('int64')

# Guard against a silently degenerate target (e.g. if the column no longer
# holds 'Yes'/'No' strings after preprocessing, every label would become 0).
assert y.nunique() == 2, (
    f"Expected both churn classes in '{target_col}' after encoding, "
    f"got values {sorted(y.unique())}"
)

# Create preprocessing pipeline
preprocessor = DataPreprocessor().create_preprocessing_pipeline()

# Split data (stratified so both splits keep the same churn rate)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=config['training']['random_state'],
    stratify=y,
)

# Apply preprocessing: fit on the training split only, then reuse on the test split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Get feature names for later use
feature_names = preprocessor.get_feature_names_out()


def _as_feature_frame(matrix, index):
    """Wrap a transformed matrix in a DataFrame labelled with `feature_names`.

    The source rows' index is kept so the frame stays aligned with the
    matching target Series (`y_train` / `y_test`), which still carries the
    original, shuffled index after `train_test_split`.
    """
    # In case the pipeline emits a scipy sparse matrix, densify it first.
    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()
    return pd.DataFrame(matrix, columns=feature_names, index=index)


X_train_transformed = _as_feature_frame(X_train_transformed, X_train.index)
X_test_transformed = _as_feature_frame(X_test_transformed, X_test.index)

## 2. Baseline Model (Logistic Regression)

In [None]:
# Baseline: logistic regression seeded from the shared config.
lr_model = LogisticRegressionModel(
    random_state=config['training']['random_state'],
)
lr_model.train(X_train_transformed, y_train)

# Score the baseline on the held-out test split and report the metrics.
lr_metrics = lr_model.evaluate(X_test_transformed, y_test)
print('Logistic Regression Metrics:', lr_metrics)

## 3. Ensemble Models

In [None]:
def _fit_and_score(churn_model):
    """Train `churn_model` on the training split and return its test-split metrics."""
    churn_model.train(X_train_transformed, y_train)
    return churn_model.evaluate(X_test_transformed, y_test)


# Random Forest
rf_model = RandomForestChurnModel(
    random_state=config['training']['random_state'],
)
rf_metrics = _fit_and_score(rf_model)
print('Random Forest Metrics:', rf_metrics)

# XGBoost
xgb_model = XGBoostChurnModel(
    random_state=config['training']['random_state'],
)
xgb_metrics = _fit_and_score(xgb_model)
print('XGBoost Metrics:', xgb_metrics)

## 4. Hyperparameter Tuning (Example with RandomForest)

In [None]:
# Candidate Random Forest settings explored by the grid search
# (2 x 2 x 2 = 8 combinations).
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_leaf': [1, 2]
}

rf_tuner = HyperparameterTuner(
    RandomForestClassifier(random_state=config['training']['random_state']),
    param_grid_rf,
)
best_rf_model, best_params = rf_tuner.grid_search_cv(X_train_transformed, y_train)

# Surface the winning configuration so the result of the search is visible
print('Best Random Forest params:', best_params)

# Wrap the tuned model in our custom class, seeded like the other models.
# NOTE(review): the estimator is swapped in by direct attribute assignment;
# if the wrapper tracks extra fitted state during `train`, it is not set here - confirm.
tuned_rf = RandomForestChurnModel(random_state=config['training']['random_state'])
tuned_rf.model = best_rf_model

## 5. Model Comparison

In [None]:
# Wrapper objects trained in the earlier sections, keyed by display label.
fitted_wrappers = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'Tuned Random Forest': tuned_rf,
}

# The evaluator is given the underlying estimators, not the wrappers.
models_to_compare = {
    label: wrapper.model for label, wrapper in fitted_wrappers.items()
}

evaluator = ModelEvaluator(models_to_compare, X_test_transformed, y_test)
comparison_df = evaluator.compare_multiple_models()
print(comparison_df)

# Create plots (ROC, Confusion Matrix)
evaluator.create_evaluation_plots()

## 6. Cross-Validation of the Best Model

In [None]:
# The tuned Random Forest is assumed to be the best model here; its stability
# is checked on the training split only.
# NOTE(review): X_train_transformed comes from a preprocessor already fitted on
# the whole training split, so the folds share that fitted preprocessing - confirm
# this is acceptable for the stability estimate.
best_estimator = tuned_rf.model
cv_validator = CrossValidator(best_estimator, X_train_transformed, y_train)
cv_validator.validate_model_stability()

## 7. Feature Importance Analysis

In [None]:
# Ask the tuned Random Forest wrapper for its feature importances and plot them.
feature_importance = tuned_rf.get_feature_importance()
if feature_importance is not None:
    plot_feature_importance(feature_importance)
else:
    # Say so explicitly instead of leaving this section silently empty.
    print('No feature importances available for the tuned Random Forest.')