## Phase 9: Optimization & Continuous Improvement

### Continuous Model Improvement

**1. Retraining Schedule**


In [None]:
# Decide how often to retrain based on:
# - How fast patterns change
# - How much new data accumulates
# - Computational resources

# Example: Churn model retrained weekly
import schedule
import time

def retrain_model():
    """
    Retrain the churn model on recent plus historical data and promote it
    only when it clearly beats the model currently in use.

    The function relies on names defined elsewhere in the notebook:
    ``load_recent_data``, ``historical_data``, ``preprocessor``,
    ``evaluate_current_model`` and ``promote_model``, plus the pandas and
    scikit-learn names it calls.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # Load new data (last 30 days)
    new_data = load_recent_data(days=30)

    # Combine with historical data for stability
    combined_data = pd.concat([historical_data, new_data])

    # Keep the label out of the feature frame so the target can never be
    # fed to the preprocessor as an input column.
    features = combined_data.drop('churned', axis=1)
    labels = combined_data['churned']

    # Split BEFORE fitting the preprocessor: fitting on the full frame would
    # let statistics of the held-out rows leak into training.
    raw_train, raw_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Fit on the training rows only, then apply the same transform to the test rows.
    # NOTE(review): this refits the shared `preprocessor` object in place even
    # when the candidate model is rejected below - confirm that whatever serves
    # the current model does not depend on this same object.
    X_train = preprocessor.fit_transform(raw_train)
    X_test = preprocessor.transform(raw_test)

    # Train new model
    new_model = RandomForestClassifier(n_estimators=100)
    new_model.fit(X_train, y_train)

    # Validate: Is new model better than current model?
    new_auc = roc_auc_score(y_test, new_model.predict_proba(X_test)[:, 1])
    current_auc = evaluate_current_model(X_test, y_test)

    # Promote only when AUC improves by more than 0.01 (absolute), so that
    # run-to-run noise does not cause model churn.
    if new_auc > current_auc + 0.01:
        promote_model(new_model)
        print(f"Model updated. New AUC: {new_auc:.4f}")
    else:
        print(f"New model not better. Current AUC: {current_auc:.4f}, New: {new_auc:.4f}")

# Register the weekly job: run retrain_model every Friday at "02:00".
weekly_retrain = schedule.every().friday.at("02:00")
weekly_retrain.do(retrain_model)

# Keep the scheduler alive: run whatever is due, then wait 60 seconds.
# The loop never exits, so it blocks the cell/process it runs in.
while True:
    schedule.run_pending()
    time.sleep(60)


**2. Feature Importance Analysis**


In [None]:
# Understand which features drive predictions
# Use for model debugging and business insights

def analyze_feature_importance(model, feature_names):
    """
    Rank, print and plot the feature importances of a fitted model.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` attribute.
        feature_names: Feature names, one per entry of
            ``model.feature_importances_`` and in the same order.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted from
        most to least important. It is returned so callers can reuse the
        ranking instead of parsing printed output.
    """
    importances = model.feature_importances_

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print(importance_df)

    # Visualize
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['feature'], importance_df['importance'])
    # barh places the first row at the bottom of the axis; flip it so the
    # chart reads top-down in the same order as the printed table.
    plt.gca().invert_yaxis()
    plt.xlabel('Importance')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

    # Insights: If top 5 features explain 80% of importance,
    # model is interpretable and stable
    return importance_df


**3. Error Analysis**


In [None]:
# When model makes mistakes, analyze why

def analyze_errors(y_true, y_pred, y_proba, data,
                   columns=('age', 'total_spent', 'transaction_count')):
    """
    Print summary statistics for the rows the model got wrong.

    Rows are matched to labels by position, so ``y_true``, ``y_pred`` and
    ``data`` must describe the same samples in the same order.

    Args:
        y_true: Actual labels (1 = churned, 0 = stayed). Any array-like:
            list, NumPy array or pandas Series (its index is ignored).
        y_pred: Predicted labels, same length and order as ``y_true``.
        y_proba: Predicted probabilities. Not used by this function; kept
            so existing calls keep working.
        data: DataFrame with one row per sample.
        columns: Columns of ``data`` to summarise for each error group.

    Returns:
        None. Both summaries are written with ``print``.

    Raises:
        ValueError: If ``y_true``, ``y_pred`` and ``data`` differ in length.
    """
    import numpy as np

    # Work on flat positional arrays: lists would otherwise compare as a
    # single scalar, and a Series with a different index than `data` would
    # fail to align when used as a row mask.
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if not (len(actual) == len(predicted) == len(data)):
        raise ValueError(
            "y_true, y_pred and data must have the same length; got "
            f"{len(actual)}, {len(predicted)} and {len(data)}"
        )

    summary_columns = list(columns)

    # False positives: Predicted churn but didn't
    false_positives = (predicted == 1) & (actual == 0)
    fp_data = data[false_positives]

    print("False Positives (high churn prediction but stayed):")
    print(fp_data[summary_columns].describe())

    # False negatives: Predicted no churn but actually churned
    false_negatives = (predicted == 0) & (actual == 1)
    fn_data = data[false_negatives]

    print("\nFalse Negatives (missed churners):")
    print(fn_data[summary_columns].describe())

    # Insights:
    # If FNs are old customers with high spend, might need segment-specific models
    # If FPs are new customers, might have cold start problem


**4. Model Performance Degradation**


In [None]:
# Monitor if model performance decreases over time

def monitor_performance_degradation(dates, performance_scores):
    """
    Plot AUC over time next to its 7-row rolling mean and a 0.80 floor.

    Args:
        dates: Values for the x-axis, one per score.
        performance_scores: AUC values, in the same order as ``dates``.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Build the history table and derive the smoothed trend in one step;
    # the first six rows of the rolling mean are NaN (window not yet full).
    history = pd.DataFrame({'date': dates, 'auc': performance_scores}).assign(
        auc_rolling_avg=lambda frame: frame['auc'].rolling(7).mean()
    )
    timeline = history['date']

    plt.figure(figsize=(12, 6))
    plt.plot(timeline, history['auc'], label='Daily AUC', alpha=0.5)
    plt.plot(timeline, history['auc_rolling_avg'], label='7-day Average', linewidth=2)
    # Horizontal reference line marking the lowest acceptable AUC.
    plt.axhline(y=0.80, color='r', linestyle='--', label='Minimum Threshold')
    plt.xlabel('Date')
    plt.ylabel('ROC-AUC')
    plt.title('Model Performance Over Time')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Follow-up idea (not implemented here): alert if AUC drops below the
    # threshold for 3 consecutive days.


### Complete MLDLC Workflow Code


In [None]:
class MLDLCPipeline:
    """
    Complete Machine Learning Development Life Cycle

    Each public method implements one phase. They are meant to be called in
    order, because later phases read attributes that earlier phases set:
    ``gather_data`` -> ``raw_data``, ``process_data`` -> ``cleaned_data``,
    ``engineer_features`` -> ``preprocessor``, ``train_model`` -> ``model``.
    """

    def __init__(self, project_name, target_column='churned'):
        """
        Args:
            project_name: Used as the prefix of the saved artifact files.
            target_column: Name of the label column in the gathered data.
        """
        self.project_name = project_name
        self.target_column = target_column
        self.preprocessor = None
        self.model = None
        # Filled in by the individual phases; declared here so every
        # attribute exists from construction onwards.
        self.problem_definition = None
        self.raw_data = None
        self.cleaned_data = None
        self.X_test = None
        self.y_test = None

    def frame_problem(self, business_goal, problem_type, success_metrics):
        """Phase 1: Frame Problem - record the goal, problem type and target metrics."""
        self.problem_definition = {
            'goal': business_goal,
            'type': problem_type,
            'metrics': success_metrics
        }
        print(f"Problem framed: {business_goal}")

    def gather_data(self, query):
        """Phase 2: Gather Data - load the result of ``query`` into ``raw_data``."""
        # NOTE(review): `connection` is not defined in this class; it is
        # expected to exist as a global database connection - confirm.
        self.raw_data = pd.read_sql(query, connection)
        print(f"Data gathered: {self.raw_data.shape}")

    def process_data(self):
        """Phase 3: Clean & Process - drop rows with missing values, then duplicates."""
        self.cleaned_data = self.raw_data.dropna()
        self.cleaned_data = self.cleaned_data.drop_duplicates()
        print(f"Data cleaned: {self.cleaned_data.shape}")

    def eda(self):
        """Phase 4: Exploratory Analysis - print summary statistics and the target rate."""
        print(self.cleaned_data.describe())
        print(f"Churn rate: {self.cleaned_data[self.target_column].mean():.2%}")

    def _feature_columns(self):
        """Return (numeric, categorical) feature column names, excluding the target."""
        # The target must not be listed as a feature: train_model fits the
        # preprocessor on a frame without it, so listing it would make the
        # fit fail (and would leak the label if it were present).
        features = self.cleaned_data.drop(columns=[self.target_column], errors='ignore')
        numeric_cols = list(features.select_dtypes(include='number').columns)
        categorical_cols = list(features.select_dtypes(include='object').columns)
        return numeric_cols, categorical_cols

    def engineer_features(self):
        """Phase 5: Feature Engineering - build (but do not fit) the preprocessor."""
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.compose import ColumnTransformer

        numeric_cols, categorical_cols = self._feature_columns()

        numeric_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_cols),
                ('cat', categorical_transformer, categorical_cols)
            ]
        )
        print("Features engineered")

    def train_model(self):
        """Phase 6: Train & Select Model - fit a random forest and report test ROC-AUC."""
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import roc_auc_score

        X = self.cleaned_data.drop(self.target_column, axis=1)
        y = self.cleaned_data[self.target_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fit the preprocessor on the training rows only to avoid leakage.
        X_train_transformed = self.preprocessor.fit_transform(X_train)
        X_test_transformed = self.preprocessor.transform(X_test)

        # Seeded like the split above so repeated runs give the same model.
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train_transformed, y_train)

        score = roc_auc_score(y_test, self.model.predict_proba(X_test_transformed)[:, 1])
        print(f"Model trained. ROC-AUC: {score:.4f}")

        # Keep the untransformed hold-out set for later evaluation.
        self.X_test = X_test
        self.y_test = y_test

    def deploy_model(self):
        """Phase 7: Deploy - save the model and preprocessor under ``models/``."""
        import os
        import joblib

        # joblib.dump does not create missing directories.
        os.makedirs('models', exist_ok=True)
        joblib.dump(self.model, f'models/{self.project_name}_model.joblib')
        joblib.dump(self.preprocessor, f'models/{self.project_name}_preprocessor.joblib')
        print("Model deployed")

    def monitor_model(self):
        """Phase 8 & 9: Monitor & Optimize - placeholder that only prints status lines."""
        print("Monitoring pipeline active")
        print("Scheduled for weekly retraining")

# Usage: run every phase of the life cycle, in order, for a churn model.
pipeline = MLDLCPipeline("churn_prediction")

# Phase 1 - record what success looks like before touching any data.
pipeline.frame_problem(
    business_goal="Reduce churn by 5%",
    problem_type="Classification",
    success_metrics=dict(precision=0.80, recall=0.70),
)

# Phases 2-4 - load, clean and inspect the customer table.
pipeline.gather_data("SELECT * FROM customers")
pipeline.process_data()
pipeline.eda()

# Phases 5-6 - build the preprocessor, then train and score the model.
pipeline.engineer_features()
pipeline.train_model()

# Phases 7-9 - persist the artifacts and report monitoring status.
pipeline.deploy_model()
pipeline.monitor_model()


### Tools Used in Optimization

| Tool | Purpose |
|------|---------|
| MLflow | Experiment tracking, model management |
| Weights & Biases | Experiment tracking, visualization |
| Apache Airflow | Workflow scheduling, orchestration |
| Prefect | Data pipeline orchestration |
| DVC | Data version control |

---

## Summary: MLDLC Workflow

| Phase | Goal | Key Activities | Tools |
|-------|------|-----------------|-------|
| 1. Frame Problem | Define clearly | Business goal, metrics, constraints | Docs, SQL |
| 2. Gather Data | Collect raw data | Query databases, APIs, files | SQL, Pandas, APIs |
| 3. Process Data | Clean & prepare | Remove nulls, duplicates, outliers | Pandas, Great Expectations |
| 4. EDA | Understand data | Visualize, analyze distributions | Matplotlib, Seaborn, Pandas |
| 5. Feature Engineering | Create meaningful features | Encode, scale, aggregate, interact | Scikit-learn, Pandas |
| 6. Train & Select | Build & evaluate models | Train multiple models, cross-validate, tune | Scikit-learn, XGBoost |
| 7. Deploy | Put in production | API, containers, serving platform | Docker, FastAPI, Kubernetes |
| 8. Test & Monitor | Validate performance | Data validation, drift detection, A/B test | Great Expectations, Prometheus |
| 9. Optimize | Continuous improvement | Retrain, analyze errors, update features | MLflow, Airflow, Monitoring tools |

---

## Key Takeaways

1. **Problem framing is 50% of success** - A well-defined problem is half-solved

2. **Data quality > Model complexity** - Invest 80% of your time in data, 20% in algorithms

3. **Always start simple** - Baseline models are fast to build and interpret

4. **Cross-validate properly** - Prevents overfitting and ensures generalization

5. **Monitor in production** - Models degrade over time; retraining is essential

6. **Business metrics matter most** - Accuracy is not the goal; revenue/cost reduction is

7. **Document everything** - Future you will thank present you

8. **Iterate continuously** - ML is not one-time; it's ongoing optimization

---

**ML Development Life Cycle Complete. Ready to build production ML systems!**
