In [None]:
# Import required libraries
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

import pandas as pd
import numpy as np
from preprocessing import DataPreprocessor, load_data
from model_training import ChurnModelTrainer, split_data
from evaluation import ModelEvaluator
from utils import create_sample_dataset

## 1. Generate Sample Data (Optional)

In [None]:
# Synthesize a 1,000-row demo dataset. The helper is also handed a CSV path
# under ../data (presumably where it writes the file -- the next section
# loads from that same path).
df = create_sample_dataset(
    n_samples=1000,
    output_path='../data/sample_customer_churn.csv',
)
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Load Data

In [None]:
# Read the churn CSV into `df`; point `data_path` at your own file to use
# different data.
data_path = '../data/sample_customer_churn.csv'
df = load_data(data_path)
print(f"Loaded {len(df)} records with {len(df.columns)} columns")

## 3. Preprocess Data

In [None]:
# Build the preprocessor, configured for standard scaling
preprocessor = DataPreprocessor(scaling_method='standard')

# Pipeline settings gathered in one place: the target column, one-hot
# encoding, and imbalance handling switched off
pipeline_options = dict(
    target_column='Churn',
    encoding_method='onehot',
    handle_imbalance=False,
)

# NOTE(review): this runs on the full dataset, before the train/val/test
# split in the next section. If the pipeline fits scalers/encoders here,
# held-out statistics may leak into training -- confirm in DataPreprocessor.
X, y = preprocessor.preprocess_pipeline(df, **pipeline_options)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Churn rate: {y.mean()*100:.2f}%")

## 4. Split Data

In [None]:
# Carve features and target into train / validation / test partitions.
# The size fractions and the seed are passed straight through to split_data,
# which hands back six pieces in the order unpacked below.
partitions = split_data(X, y, test_size=0.2, val_size=0.1, random_state=42)
X_train, X_val, X_test, y_train, y_val, y_test = partitions

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

## 5. Train Models

In [None]:
# Create the trainer, passing a fixed random_state for repeatable runs
trainer = ChurnModelTrainer(random_state=42)

# Ask the trainer for its candidate models and list them by name
models = trainer.initialize_models()
model_names = list(models.keys())
print(f"Initialized {len(models)} models:")
for name in model_names:
    print(f"  - {name}")

In [None]:
# Train the baseline models on the training split, requesting 5 CV folds
cv_results = trainer.train_baseline_models(X_train, y_train, cv_folds=5)

# One line per model: its mean score and the spread across folds
for name in cv_results:
    results = cv_results[name]
    print(f"{name}: {results['mean_score']:.4f} (+/- {results['std_score']:.4f})")

In [None]:
# Rank the baseline results by mean score (highest first) and keep at most
# the three best entries
ranked = sorted(
    cv_results.items(),
    key=lambda entry: entry[1]['mean_score'],
    reverse=True,
)
top_models = ranked[:3]
top_model_names = [entry[0] for entry in top_models]

print(f"Tuning hyperparameters for: {top_model_names}")

# Tune only those models on the training split, using the 'grid' search
# method with 3 folds
tuned_models = trainer.tune_hyperparameters(
    X_train,
    y_train,
    model_names=top_model_names,
    search_method='grid',
    cv_folds=3,
)

In [None]:
# Let the trainer pick a winner using the validation split; it hands back
# the winner's name, the model object, and its score, in that order
selection = trainer.select_best_model(X_val, y_val)
best_name, best_model, best_score = selection

print(f"\nBest Model: {best_name}")
print(f"Validation Score: {best_score:.4f}")

## 6. Evaluate Model

In [None]:
# Create the evaluator
evaluator = ModelEvaluator()

# Labels for the feature columns: the real column names when X carries them,
# otherwise positional placeholders
if hasattr(X, 'columns'):
    feature_names = list(X.columns)
else:
    feature_names = [f"Feature_{i}" for i in range(X.shape[1])]

# Score the selected model on the test split
metrics = evaluator.evaluate_model(
    best_model, X_test, y_test,
    feature_names=feature_names,
    model_name=best_name,
)

# Each metric is printed with four decimals, so values are expected to be numeric
print("\nTest Set Performance:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Side-by-side comparison of every model the trainer holds, all evaluated on
# the test split; the resulting table is shown as the cell output
comparison_df = evaluator.compare_models(
    trainer.models,
    X_test,
    y_test,
)
comparison_df

## 7. Feature Importance

In [None]:
# Plot feature importance (for tree-based models).
# Only estimators that expose `feature_importances_` can be plotted, so the
# attribute is checked first.
if hasattr(best_model, 'feature_importances_'):
    importance_df = evaluator.plot_feature_importance(
        best_model, 
        feature_names, 
        top_n=15
    )
    display(importance_df)
else:
    # Say so explicitly instead of leaving this section silently empty when
    # the selected model has no importances to show.
    print(
        f"{type(best_model).__name__} does not expose feature_importances_; "
        "skipping the feature importance plot."
    )

## 8. Make Predictions on New Data

In [None]:
# Create a sample new customer (raw, un-encoded columns; no 'Churn' label)
new_customer = pd.DataFrame([{
    'gender': 'Male',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 12,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'No',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 85.0,
    'TotalCharges': 1020.0
}])

# Preprocess new customer (using fitted preprocessor).
# The encoding and imbalance settings are spelled out to mirror the training
# call, so inference does not depend on whatever the pipeline's defaults are.
X_new, _ = preprocessor.preprocess_pipeline(
    new_customer,
    target_column='Churn',  # not present in this frame; presumably ignored when absent -- confirm in DataPreprocessor
    encoding_method='onehot',
    handle_imbalance=False,
    fit=False  # Use already fitted transformers
)

# Fail early with a readable message if the feature layout differs from the
# matrix the models were trained on.
assert X_new.shape[1] == X.shape[1], (
    f"Expected {X.shape[1]} features after preprocessing, got {X_new.shape[1]}"
)

# Make prediction
prediction = best_model.predict(X_new)
probability = best_model.predict_proba(X_new)

# Column 1 of predict_proba is read as the churn class, column 0 as retention
print(f"\nChurn Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Churn Probability: {probability[0][1]*100:.2f}%")
print(f"Retention Probability: {probability[0][0]*100:.2f}%")

## 9. Save Model

In [None]:
# Save the best model
import joblib

# Make sure the output directory exists before anything is written:
# joblib.dump opens the target file directly and raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

model_path = trainer.save_model(filepath='../models/best_churn_model.pkl')
print(f"Model saved to: {model_path}")

# Save the fitted preprocessor alongside the model so new data can be
# transformed the same way at prediction time
preprocessor_path = '../models/preprocessor.pkl'
joblib.dump(preprocessor, preprocessor_path)
print(f"Preprocessor saved to: {preprocessor_path}")

## 10. Load and Use Saved Model

In [None]:
# Restore both artifacts from the models directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts you produced yourself or otherwise trust.
# NOTE(review): assumes trainer.save_model wrote a joblib-readable file
# containing the estimator itself -- confirm against ChurnModelTrainer.
model_file = '../models/best_churn_model.pkl'
preprocessor_file = '../models/preprocessor.pkl'
loaded_model = joblib.load(model_file)
loaded_preprocessor = joblib.load(preprocessor_file)

print("Model and preprocessor loaded successfully!")

# Use loaded model for prediction
# (preprocessing and prediction code same as above)

## Summary

This notebook demonstrated:
1. ✅ Data generation/loading
2. ✅ Data preprocessing
3. ✅ Train/validation/test split
4. ✅ Model training (multiple algorithms)
5. ✅ Hyperparameter tuning
6. ✅ Model evaluation
7. ✅ Feature importance
8. ✅ Making predictions
9. ✅ Saving/loading models

You can now:
- Customize the preprocessing pipeline
- Train with your own data
- Experiment with different models
- Make predictions on new customers
- Deploy models in production