In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def load_and_explore_data(filepath='Churn_Modelling.csv'):
    """Load the churn dataset from a CSV file and print a short summary.

    The summary covers the frame's shape, row and column counts, the total
    number of missing values, and the churn rate (mean of the 'Exited'
    column).

    Args:
        filepath: Path of the CSV file to read. Defaults to
            'Churn_Modelling.csv' in the current working directory.

    Returns:
        The loaded DataFrame, or None if the file does not exist, the
        'Exited' target column is absent, or any other error occurs while
        reading or summarising the data. An error message is printed in
        every failure case; no exception propagates to the caller.
    """
    print("🔍 Loading and exploring dataset...")

    try:
        df = pd.read_csv(filepath)
        print(f"✅ Dataset loaded successfully: {df.shape}")

        # Basic info
        print("\n📊 Dataset Info:")
        print(f"- Rows: {len(df):,}")
        print(f"- Columns: {len(df.columns)}")
        print(f"- Missing values: {df.isnull().sum().sum()}")

        # Fail with a readable message instead of a bare KeyError repr.
        if 'Exited' not in df.columns:
            print("❌ Error loading dataset: required target column 'Exited' is missing")
            return None

        # Target distribution
        churn_rate = df['Exited'].mean()
        print(f"- Churn rate: {churn_rate:.1%}")

        return df

    except FileNotFoundError:
        # Report the path that was actually requested, not the default name.
        print(f"❌ Error: '{filepath}' not found!")
        print("Please ensure the dataset file is in the same directory as this script.")
        return None
    except Exception as e:
        # Best-effort loader: any other failure is reported, not raised.
        print(f"❌ Error loading dataset: {e}")
        return None

def preprocess_data(df):
    """Turn the raw churn frame into a feature matrix and a target vector.

    Works on a copy, so the caller's frame is left untouched. The
    identifier columns 'RowNumber', 'CustomerId' and 'Surname' are removed,
    'Geography' and 'Gender' are one-hot encoded with the first level of
    each dropped, and the 'Exited' column is split off as the target.

    Args:
        df: Raw DataFrame containing the columns named above.

    Returns:
        A tuple (X, y): X is the encoded feature frame without 'Exited',
        y is the 'Exited' column.
    """
    print("\n🔧 Preprocessing data...")

    columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
    categorical = ['Geography', 'Gender']

    # Copy first so nothing below can touch the caller's frame.
    working = df.copy().drop(columns=columns_to_drop)
    print(f"✅ Dropped columns: {columns_to_drop}")

    print("✅ Encoding categorical variables...")
    encoded = pd.get_dummies(working, columns=categorical, drop_first=True)

    # Split the label off from the predictors.
    y = encoded['Exited']
    X = encoded.drop(columns='Exited')

    for message in (
        f"✅ Features shape: {X.shape}",
        f"✅ Target shape: {y.shape}",
        f"✅ Feature columns: {list(X.columns)}",
    ):
        print(message)

    return X, y

def train_and_evaluate_model(X, y):
    """Grid-search a random forest and report its hold-out performance.

    The data is split 80/20 (stratified on y, seed 42). A
    RandomForestClassifier is tuned with 5-fold cross-validated grid search
    scored by ROC AUC, and the refitted best estimator is evaluated on the
    held-out 20%. Accuracy, ROC AUC, a classification report and the five
    largest feature importances are printed.

    Args:
        X: Feature frame; its column names label the feature importances.
        y: Target labels aligned with X.

    Returns:
        A tuple (best_model, feature_importance): the best estimator found
        by the search, and a DataFrame with columns 'feature' and
        'importance' sorted by descending importance.
    """
    print("\n🚀 Training model...")

    # Stratified hold-out split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"✅ Train set: {len(X_train)} samples")
    print(f"✅ Test set: {len(X_test)} samples")

    # Candidate hyperparameter values explored by the search.
    search_space = dict(
        n_estimators=[100, 200],
        max_depth=[10, 15, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
        class_weight=['balanced', None],
    )

    print("🔍 Performing hyperparameter tuning...")
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    print(f"✅ Best parameters: {search.best_params_}")
    print(f"✅ Best CV score: {search.best_score_:.4f}")

    # Hard labels for accuracy/report, positive-class scores for ROC AUC.
    predicted_labels = winner.predict(X_test)
    positive_scores = winner.predict_proba(X_test)[:, 1]

    print(f"\n📈 Model Performance:")
    print(f"✅ Accuracy: {accuracy_score(y_test, predicted_labels):.4f}")
    print(f"✅ ROC AUC: {roc_auc_score(y_test, positive_scores):.4f}")

    print(f"\n📊 Classification Report:")
    print(classification_report(y_test, predicted_labels))

    # Rank features from most to least important.
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': winner.feature_importances_,
    })
    importance_table = importance_table.sort_values('importance', ascending=False)

    print(f"\n🎯 Top 5 Most Important Features:")
    leaders = importance_table.head()
    for name, weight in zip(leaders['feature'], leaders['importance']):
        print(f"  {name}: {weight:.4f}")

    return winner, importance_table

def save_model_and_artifacts(model, feature_importance):
    """Persist the trained model and its companion files to the working directory.

    Three files are written, in this order:
      * 'churn_model.pkl'        - the model, serialised with joblib
      * 'feature_importance.csv' - the importance table, without its index
      * 'model_info.txt'         - a readable summary: model type, feature
                                   list and the model's parameters

    Any exception is caught and reported with a printed message rather than
    raised, so a failure part-way through can leave earlier files written
    and later ones missing.

    Args:
        model: Fitted estimator exposing get_params().
        feature_importance: DataFrame with a 'feature' column, one row per
            feature.

    Returns:
        None.
    """
    print("\n💾 Saving model and artifacts...")

    try:
        # Save the model
        joblib.dump(model, 'churn_model.pkl')
        print("✅ Model saved as 'churn_model.pkl'")

        # Save feature importance
        feature_importance.to_csv('feature_importance.csv', index=False)
        print("✅ Feature importance saved as 'feature_importance.csv'")

        # Save model metadata
        model_info = {
            'model_type': 'RandomForestClassifier',
            'features': list(feature_importance['feature']),
            'n_features': len(feature_importance),
            'model_params': model.get_params()
        }

        # The header line contains an emoji, so the encoding must be pinned:
        # with the platform default (e.g. cp1252 on Windows) the write can
        # raise UnicodeEncodeError.
        with open('model_info.txt', 'w', encoding='utf-8') as f:
            f.write("🏦 ICICI Bank Churn Prediction Model\n")
            f.write("=" * 50 + "\n\n")
            f.write(f"Model Type: {model_info['model_type']}\n")
            f.write(f"Number of Features: {model_info['n_features']}\n\n")
            f.write("Features:\n")
            for i, feature in enumerate(model_info['features'], 1):
                f.write(f"{i:2d}. {feature}\n")
            f.write("\nModel Parameters:\n")
            for param, value in model_info['model_params'].items():
                f.write(f"  {param}: {value}\n")

        print("✅ Model info saved as 'model_info.txt'")

    except Exception as e:
        # Best-effort save: report the failure instead of aborting the run.
        print(f"❌ Error saving model: {e}")

def validate_model_integration():
    """Smoke-test the saved model with a single hand-built customer row.

    Loads 'churn_model.pkl' from the working directory, builds a one-row
    frame using the eleven encoded feature columns, and requests both a
    class prediction and a class-1 probability for it.

    Returns:
        True if loading and both predictions succeed; False if any
        exception is raised along the way (the error is printed).
    """
    print("\n🧪 Validating model integration...")

    try:
        restored = joblib.load('churn_model.pkl')

        # One sample customer, described with already-encoded columns.
        customer = {
            'CreditScore': 650,
            'Age': 35,
            'Tenure': 5,
            'Balance': 80000.0,
            'NumOfProducts': 2,
            'HasCrCard': 1,
            'IsActiveMember': 0,
            'EstimatedSalary': 75000.0,
            'Geography_Germany': 1,
            'Geography_Spain': 0,
            'Gender_Male': 0,
        }
        # Wrap each value in a list so the frame has exactly one row.
        sample_frame = pd.DataFrame(
            {column: [value] for column, value in customer.items()}
        )

        label = restored.predict(sample_frame)[0]
        churn_probability = restored.predict_proba(sample_frame)[0, 1]

        print(f"✅ Model integration test successful!")
        print(f"   Sample prediction: {label}")
        print(f"   Sample probability: {churn_probability:.4f}")

        return True

    except Exception as e:
        print(f"❌ Model integration test failed: {e}")
        return False

def main():
    """Run the training pipeline: load, preprocess, train, save, validate.

    Stops right after the banner and load attempt if the dataset could not
    be loaded. Otherwise runs every stage and prints a closing message that
    depends on whether the integration check passed.
    """
    print("🏦 ICICI Bank - Customer Churn Prediction Model Training")
    print("=" * 60)

    dataset = load_and_explore_data()
    if dataset is None:
        # Nothing to train on; the loader has already reported why.
        return

    features, target = preprocess_data(dataset)
    trained_model, importances = train_and_evaluate_model(features, target)
    save_model_and_artifacts(trained_model, importances)

    # Report the failure case first so the success path reads straight through.
    if not validate_model_integration():
        print("\n⚠️ Model training completed but integration test failed.")
        print("   Please check the model format and try again.")
        return

    print("\n🎉 Model training completed successfully!")
    print("   The model is ready to be used with the Flask application.")
    print("   Run 'python app.py' to start the prediction service.")

# Run the training pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🏦 ICICI Bank - Customer Churn Prediction Model Training
🔍 Loading and exploring dataset...
✅ Dataset loaded successfully: (10000, 14)

📊 Dataset Info:
- Rows: 10,000
- Columns: 14
- Missing values: 0
- Churn rate: 20.4%

🔧 Preprocessing data...
✅ Dropped columns: ['RowNumber', 'CustomerId', 'Surname']
✅ Encoding categorical variables...
✅ Features shape: (10000, 11)
✅ Target shape: (10000,)
✅ Feature columns: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']

🚀 Training model...
✅ Train set: 8000 samples
✅ Test set: 2000 samples
🔍 Performing hyperparameter tuning...
✅ Best parameters: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
✅ Best CV score: 0.8637

📈 Model Performance:
✅ Accuracy: 0.8340
✅ ROC AUC: 0.8650

📊 Classification Report:
              precision    recall  f1-score   support

           0  