# Week 3, Day 6: Machine Learning Hackathon Challenge

## Challenge Overview
Build a complete machine learning solution for a real-world problem. You'll apply the concepts learned throughout Week 3:
- Data preprocessing
- Model selection and training
- Hyperparameter tuning
- Model evaluation

## Problem: Customer Churn Prediction
Predict whether a customer will churn (leave) based on various features.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

## Part 1: Data Generation and Preparation

In [None]:
def generate_customer_data(n_samples=1000, random_state=42):
    """Generate a reproducible synthetic customer-churn dataset.

    Args:
        n_samples: Number of customer rows to generate.
        random_state: Seed for the private random number generator. The
            default (42) reproduces the data this function has always
            produced.

    Returns:
        A pandas DataFrame with ``n_samples`` rows and 12 columns: seven
        numeric features, three two-level categorical features,
        ``total_charges`` and the binary target ``churn`` (0 or 1).
    """
    # A private RandomState seeded with 42 yields exactly the same stream as
    # np.random.seed(42) followed by np.random.* calls, but it does not
    # reseed the process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(random_state)

    # Generate features. The order of the draws matters for reproducibility.
    # randint's upper bound is exclusive (tenure is 1..71, scores are 1..5).
    # NOTE: the normal draws are unbounded, so monthly_charges and age can
    # occasionally come out implausibly low or negative.
    data = {
        'tenure': rng.randint(1, 72, n_samples),  # months
        'monthly_charges': rng.normal(70, 30, n_samples),
        'total_services': rng.randint(1, 6, n_samples),
        'age': rng.normal(45, 15, n_samples),
        'satisfaction_score': rng.randint(1, 6, n_samples),
        'support_calls': rng.poisson(3, n_samples),
        'payment_delay': rng.randint(0, 15, n_samples),
        'contract_type': rng.choice(['Monthly', 'Yearly'], n_samples),
        'online_security': rng.choice(['Yes', 'No'], n_samples),
        'tech_support': rng.choice(['Yes', 'No'], n_samples),
    }

    # Total charges: tenure * monthly charges plus Gaussian noise
    billing_noise = rng.normal(0, 100, n_samples)
    data['total_charges'] = data['tenure'] * data['monthly_charges'] + billing_noise

    # Create DataFrame
    df = pd.DataFrame(data)

    # Churn probability is a logistic function of a linear score: higher
    # charges, more support calls and longer payment delays push it up;
    # higher satisfaction and longer tenure pull it down.
    churn_score = (0.02 * df['monthly_charges']
                   - 0.3 * df['satisfaction_score']
                   + 0.1 * df['support_calls']
                   + 0.05 * df['payment_delay']
                   - 0.01 * df['tenure'])
    churn_prob = 1 / (1 + np.exp(-churn_score))

    # Bernoulli draw per customer
    df['churn'] = (rng.random_sample(n_samples) < churn_prob).astype(int)

    return df

# Generate the dataset with the default sample count, then preview its
# shape and first rows
df = generate_customer_data()
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

## Challenge Tasks

### Task 1: Exploratory Data Analysis

In [None]:
def perform_eda(df):
    """Perform exploratory data analysis (hackathon exercise stub).

    Not implemented yet: the body is a placeholder, so calling this
    currently does nothing and returns None.

    Args:
        df: Customer DataFrame to explore.
    """
    # Your code here:
    # 1. Analyze feature distributions
    # 2. Check correlations
    # 3. Identify patterns
    # 4. Visualize relationships
    pass

# Example solution structure:
def example_eda(df):
    """Print summary statistics and draw the reference EDA figures.

    Output, in order: the ``df.describe()`` table, a correlation heatmap
    over the numeric columns, and one figure with three side-by-side panels
    (churn counts, monthly charges by churn, satisfaction score by churn).

    Args:
        df: Customer DataFrame containing at least the ``churn``,
            ``monthly_charges`` and ``satisfaction_score`` columns.
    """
    # Summary statistics
    print("Basic Statistics:")
    print(df.describe())

    # Heatmap of pairwise correlations, restricted to numeric columns
    plt.figure(figsize=(12, 8))
    numeric_view = df[df.select_dtypes(include=[np.number]).columns]
    sns.heatmap(numeric_view.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

    # One row of three panels, each keyed on the churn label:
    # (subplot position, seaborn plotter, extra keyword arguments, title)
    panels = (
        (131, sns.countplot, {}, 'Churn Distribution'),
        (132, sns.boxplot, {'y': 'monthly_charges'}, 'Monthly Charges by Churn'),
        (133, sns.boxplot, {'y': 'satisfaction_score'}, 'Satisfaction Score by Churn'),
    )
    plt.figure(figsize=(15, 5))
    for position, draw, extra_kwargs, panel_title in panels:
        plt.subplot(position)
        draw(data=df, x='churn', **extra_kwargs)
        plt.title(panel_title)

    plt.tight_layout()
    plt.show()

# Run the example EDA on the generated dataset
example_eda(df)

### Task 2: Data Preprocessing

In [None]:
def preprocess_data(df):
    """Preprocess the dataset (hackathon exercise stub).

    Not implemented yet: the body is a placeholder, so calling this
    currently does nothing and returns None.

    Args:
        df: Raw customer DataFrame to preprocess.
    """
    # Your code here:
    # 1. Handle missing values
    # 2. Encode categorical variables
    # 3. Scale numerical features
    # 4. Feature engineering
    pass

# Example solution structure:
def example_preprocessing(df):
    """Encode, engineer, split and scale the churn dataset.

    Steps:
        1. One-hot encode ``contract_type``, ``online_security`` and
           ``tech_support``.
        2. Add the per-month ratio features ``avg_monthly_charges`` and
           ``calls_per_month``.
        3. Hold out 20% of the rows as a test set (``random_state=42``).
        4. Standardize the features with a scaler fitted on the training
           split only, so no test-set statistics leak into training.

    Args:
        df: Customer DataFrame containing ``churn``, ``tenure``,
            ``total_charges``, ``support_calls`` and the three categorical
            columns listed above. The input is not modified. Any remaining
            columns are assumed to be numeric, since they are passed to
            ``StandardScaler`` unchanged.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``: the
        scaled feature matrices as returned by ``StandardScaler`` and the
        matching ``churn`` targets.
    """
    # Work on a copy so the caller's DataFrame stays untouched
    df_processed = df.copy()

    # Encode categorical variables
    categorical_cols = ['contract_type', 'online_security', 'tech_support']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols)

    # Feature engineering. A tenure of 0 would turn the ratios into inf/NaN,
    # which StandardScaler rejects, so anything shorter than one month is
    # treated as one month. Tenures >= 1 are unaffected.
    tenure_months = df_processed['tenure'].clip(lower=1)
    df_processed['avg_monthly_charges'] = df_processed['total_charges'] / tenure_months
    df_processed['calls_per_month'] = df_processed['support_calls'] / tenure_months

    # Prepare features and target
    X = df_processed.drop('churn', axis=1)
    y = df_processed['churn']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features: fit on the training split, then reuse for the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

# Build the scaled train/test splits from the generated dataset
X_train_scaled, X_test_scaled, y_train, y_test = example_preprocessing(df)

### Task 3: Model Selection and Training

In [None]:
def train_models():
    """Train and compare different models (hackathon exercise stub).

    Not implemented yet: the body is a placeholder, so calling this
    currently does nothing and returns None.

    NOTE(review): this stub takes no arguments, whereas
    example_model_training receives the train/test splits explicitly. An
    implementation will need to add parameters or read the notebook-level
    variables.
    """
    # Your code here:
    # 1. Implement multiple models
    # 2. Train and evaluate each model
    # 3. Compare performance
    # 4. Select best model
    pass

# Example solution structure:
def example_model_training(X_train, X_test, y_train, y_test):
    """Fit four baseline classifiers and compare their test accuracy.

    For each model this prints the accuracy and a classification report on
    the test split, then draws a bar chart comparing the accuracies.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Dict mapping each model's display name to its fitted estimator.
    """
    # Candidate classifiers, all seeded identically for reproducibility
    candidates = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM': SVC(random_state=42),
    }

    # Fit every candidate, score it on the held-out split and report
    scoreboard = []
    for label, estimator in candidates.items():
        predictions = estimator.fit(X_train, y_train).predict(X_test)
        score = accuracy_score(y_test, predictions)
        scoreboard.append({'Model': label, 'Accuracy': score})

        print(f"\n{label} Results:")
        print(f"Accuracy: {score:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, predictions))

    # Bar chart of accuracy per model
    plt.figure(figsize=(10, 6))
    sns.barplot(data=pd.DataFrame(scoreboard), x='Model', y='Accuracy')
    plt.title('Model Comparison')
    plt.xticks(rotation=45)
    plt.show()

    return candidates

# Train and compare the baseline models on the scaled splits
models = example_model_training(X_train_scaled, X_test_scaled, y_train, y_test)

### Task 4: Model Optimization

In [None]:
def optimize_model():
    """Optimize the best performing model (hackathon exercise stub).

    Not implemented yet: the body is a placeholder, so calling this
    currently does nothing and returns None.

    NOTE(review): this stub takes no arguments, whereas
    example_optimization receives the train/test splits explicitly. An
    implementation will need to add parameters or read the notebook-level
    variables.
    """
    # Your code here:
    # 1. Perform hyperparameter tuning
    # 2. Cross-validation
    # 3. Feature selection
    # 4. Model evaluation
    pass

# Example solution structure:
def example_optimization(X_train, X_test, y_train, y_test):
    """Tune a random forest with a grid search and report its test results.

    Runs a 5-fold, accuracy-scored grid search over ``n_estimators``,
    ``max_depth`` and ``min_samples_split``, prints the best parameters and
    cross-validation score, then prints a classification report for the
    best estimator on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The best estimator found by the grid search.
    """
    # Hyperparameter grid explored for the random forest
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        search_space,
        cv=5,
        scoring='accuracy',
    )
    grid_search.fit(X_train, y_train)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)

    # Score the winning configuration on the held-out test split
    tuned_forest = grid_search.best_estimator_
    test_predictions = tuned_forest.predict(X_test)

    print("\nOptimized Model Performance:")
    print(classification_report(y_test, test_predictions))

    return tuned_forest

# Tune the random forest on the scaled splits and keep the best estimator
best_model = example_optimization(X_train_scaled, X_test_scaled, y_train, y_test)

## Evaluation Criteria

Your solution will be evaluated based on:

1. Data Analysis and Preprocessing (25%)
   - Quality of EDA
   - Feature engineering
   - Data cleaning

2. Model Implementation (25%)
   - Model selection
   - Implementation quality
   - Code organization

3. Model Performance (25%)
   - Prediction accuracy
   - Model optimization
   - Cross-validation results

4. Analysis and Insights (25%)
   - Feature importance analysis
   - Performance interpretation
   - Business recommendations

## Submission Guidelines
1. Complete all tasks in this notebook
2. Document your approach and decisions
3. Include visualizations and insights
4. Provide recommendations for improvement