# 03. Machine Learning Yield Prediction
## Smart Wafer Yield Optimization Project

This notebook implements comprehensive machine learning models for semiconductor yield prediction using the SECOM dataset.

### Objectives:
- Handle class imbalance with advanced techniques
- Train multiple ML models (Random Forest, XGBoost, LightGBM, Logistic Regression)
- Perform hyperparameter tuning and model selection
- Evaluate models with comprehensive metrics
- Analyze feature importance and model interpretability
- Save best performing model for production use

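### Models to Implement:
_Status: only Random Forest and Logistic Regression are trained in this notebook so far; XGBoost, LightGBM and the ensemble are still planned._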
1. **Random Forest**: Ensemble method with feature importance
2. **XGBoost**: Gradient boosting with advanced regularization
3. **LightGBM**: Fast gradient boosting with categorical support
4. **Logistic Regression**: Linear baseline model
5. **Ensemble**: Voting classifier combining best models

### Evaluation Metrics:
- Accuracy, Precision, Recall, F1-Score
- ROC-AUC and Precision-Recall curves
- Confusion Matrix analysis
- Cross-validation performance


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                           roc_curve, precision_recall_curve, accuracy_score, 
                           precision_score, recall_score, f1_score)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Import our utility functions
import sys
import os

# When the kernel's working directory is the notebooks/ folder, move up to its
# parent (treated as the project root) before importing the `app` package.
notebook_path = os.path.abspath("")
if notebook_path.endswith("notebooks"):
    project_root = os.path.dirname(notebook_path)
    os.chdir(project_root)
    # Changing directory alone only makes `app` importable if the working
    # directory is on sys.path; add the root explicitly so the import below
    # does not depend on how the kernel was launched.
    if project_root not in sys.path:
        sys.path.insert(0, project_root)
from app.utils import load_data, preprocess_data

print("Libraries imported successfully!")
print("Ready to begin machine learning model development...")



 Notes for later:
| Component                 | Why It Matters (Micron + SECOM Context)                                          |
| ------------------------- | -------------------------------------------------------------------------------- |
| **Median Imputer**        | Handles missing process sensor readings robustly                                 |
| **RobustScaler**          | Minimizes impact of sensor outliers and spikes                                   |
| **Yeo–Johnson Transform** | Reduces skew and stabilizes variance (like log but works with negatives)         |
| **PCA (50–100)**          | Removes redundancy among 590 correlated sensors, reduces overfitting             |
| **XGBoost**               | High accuracy, handles nonlinearity, outliers, imbalance                         |
| **SHAP** (post-model)     | Provides interpretability for engineers—shows which sensors contribute to faults |


In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import RobustScaler, PowerTransformer
# from sklearn.decomposition import PCA
# from xgboost import XGBClassifier

# # Step-by-step balanced pipeline
# pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     # ('feature_engineering', OutlierFlagger(features=high_impact_features)), # Study
#     ('scaler', RobustScaler()),
#     ('power', PowerTransformer(method='yeo-johnson')), # !!!!!! refer back to 02_feature_engineering for some notes
#     ('pca', PCA(n_components=50, random_state=42)),  # or tune 30–100
#     ('clf', XGBClassifier(
#         scale_pos_weight=10,        # adjust for imbalance
#         eval_metric='auc',
#         random_state=42,
#         n_estimators=300,
#         max_depth=5,
#         learning_rate=0.05
#     ))
# ])

# After training:
# import shap

# explainer = shap.TreeExplainer(pipeline.named_steps['clf'])
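# # Apply every fitted preprocessing step, including the imputer, so the
# # explainer sees the same feature space the classifier was trained on.
# X_pca = pipeline.named_steps['pca'].transform(
#     pipeline.named_steps['power'].transform(
#         pipeline.named_steps['scaler'].transform(
#             pipeline.named_steps['imputer'].transform(X_test)
#         )
#     )
# )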
# shap_values = explainer.shap_values(X_pca)
# shap.summary_plot(shap_values, X_pca)


## 1. Load and Prepare Data


In [None]:
# Load the preprocessed data
print("Loading preprocessed SECOM data...")
import os
import time  # used by the model-training cell further down

# Resolve the cache location against the project root instead of a hard-coded
# '../'. The setup cell chdir()s out of notebooks/ into its parent, after which
# '../data/...' no longer points inside the project.
_cwd = os.getcwd()
_project_dir = os.path.dirname(_cwd) if _cwd.endswith("notebooks") else _cwd
processed_path = os.path.join(_project_dir, 'data', 'processed', 'secom_cleaned.csv')

# Check if we have preprocessed data, otherwise preprocess
if os.path.exists(processed_path):
    data = pd.read_csv(processed_path)
    print("✅ Loaded preprocessed data")
else:
    print("⚠️ No preprocessed data found, preprocessing now...")
    # Only read the raw data when it is actually needed.
    data = preprocess_data(load_data(), method='knn')

print(f"Dataset shape: {data.shape}")
print(f"Missing values: {data.isnull().sum().sum()}")

# Separate features and target
if 'target' in data.columns:
    X = data.drop(columns='target')
    y = data['target']
    class_counts = y.value_counts()  # computed once, reused for both summaries
    print(f"Features: {X.shape[1]}, Target distribution: {class_counts.to_dict()}")
    print(f"Class balance ratio: {class_counts.min() / class_counts.max():.3f}")
else:
    # Without a target the supervised cells below cannot run; X/y are still
    # defined so the failure surfaces there rather than as a NameError.
    print("❌ No target variable found!")
    X = data
    y = None


## 2. Handle Class Imbalance


In [None]:
# Rebalance the training data with SMOTE
print("Handling class imbalance...")

# Hold out a stratified 20% test set *before* resampling so that synthetic
# samples can never end up in the evaluation data.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

original_distribution = y_train.value_counts().to_dict()
print(f"Original training set distribution: {original_distribution}")

# Oversample the minority class on the training portion only
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

balanced_distribution = pd.Series(y_train_balanced).value_counts().to_dict()
n_balanced, n_original = X_train_balanced.shape[0], X_train.shape[0]
print(f"After SMOTE distribution: {balanced_distribution}")
print(f"Training set size: {n_balanced} (was {n_original})")


## 3. Train Multiple ML Models


In [None]:
# Define models to train
import time  # imported here so this cell does not depend on an earlier cell

models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    ),
    # Logistic regression is sensitive to feature scale, so standardise the
    # inputs inside a pipeline. The scaler is fitted on training data only and
    # travels with the model when it is saved.
    # NOTE(review): assumes the cleaned features are not already standardised;
    # scaling again is harmless if they are.
    'Logistic Regression': ImbPipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced'
        )),
    ]),
}

metric_names = ['accuracy', 'precision', 'recall', 'f1', 'auc']

# Train and evaluate models
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train model (perf_counter is monotonic, so the duration cannot be
    # distorted by system clock adjustments)
    start_time = time.perf_counter()
    model.fit(X_train_balanced, y_train_balanced)
    training_time = time.perf_counter() - start_time

    # Make predictions on the untouched (non-resampled) test split
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics. zero_division=0 makes the "no positive predictions"
    # case an explicit 0 instead of relying on a suppressed warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'training_time': training_time
    }

    print(f"✅ {name} - Accuracy: {accuracy:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")

# Display results: one row per model, one column per metric
results_df = pd.DataFrame({
    name: {metric: scores[metric] for metric in metric_names}
    for name, scores in results.items()
}).T

print("\n📊 Model Performance Summary:")
print(results_df.round(3))


## 4. Model Evaluation and Visualization


In [None]:
# Create visualizations for model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Model Performance Comparison
metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
x_pos = np.arange(len(metrics))
# Size the bars from the number of models so the groups stay readable and the
# tick labels stay centred if more models are added (0.35 for two models).
n_models = max(len(results), 1)
width = 0.7 / n_models

for i, (name, result) in enumerate(results.items()):
    values = [result[metric] for metric in metrics]
    axes[0, 0].bar(x_pos + i * width, values, width, label=name, alpha=0.8)

axes[0, 0].set_xlabel('Metrics')
axes[0, 0].set_ylabel('Score')
axes[0, 0].set_title('Model Performance Comparison')
axes[0, 0].set_xticks(x_pos + width * (n_models - 1) / 2)
axes[0, 0].set_xticklabels(metrics)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. ROC Curves
for name, result in results.items():
    model = result['model']
    y_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = result['auc']
    axes[0, 1].plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')

# Diagonal = performance of a random classifier
axes[0, 1].plot([0, 1], [0, 1], 'k--', alpha=0.5)
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curves')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Confusion Matrix for best model (best = highest F1 on the test split)
best_model_name = max(results, key=lambda x: results[x]['f1'])
best_model = results[best_model_name]['model']
y_pred_best = best_model.predict(X_test)

# Label the axes with the real class values instead of positions 0/1.
class_labels = list(best_model.classes_)
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0],
            xticklabels=class_labels, yticklabels=class_labels)
axes[1, 0].set_title(f'Confusion Matrix - {best_model_name}')
axes[1, 0].set_xlabel('Predicted')
axes[1, 0].set_ylabel('Actual')

# 4. Feature Importance (for Random Forest)
if 'Random Forest' in results:
    rf_model = results['Random Forest']['model']
    feature_importance = rf_model.feature_importances_

    # Use the real column names: a positional index does not identify a
    # feature once columns have been dropped or reordered upstream.
    feature_names = getattr(rf_model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    feature_names = np.asarray(feature_names).astype(str)

    # Get top 20 features (argsort is ascending, so the most important is last
    # and ends up at the top of the horizontal bar chart)
    top_features_idx = np.argsort(feature_importance)[-20:]
    top_features_importance = feature_importance[top_features_idx]
    top_features_names = feature_names[top_features_idx]

    axes[1, 1].barh(range(len(top_features_names)), top_features_importance)
    axes[1, 1].set_yticks(range(len(top_features_names)))
    axes[1, 1].set_yticklabels(top_features_names)
    axes[1, 1].set_xlabel('Importance')
    axes[1, 1].set_title('Top 20 Feature Importance')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n🏆 Best Model: {best_model_name}")
print(f"F1 Score: {results[best_model_name]['f1']:.3f}")
print(f"AUC Score: {results[best_model_name]['auc']:.3f}")


## 5. Save Best Model


In [None]:
# Save the best model
import json
import os

# Resolve the output folder against the project root instead of a hard-coded
# '../'. The setup cell chdir()s out of notebooks/ into its parent, after which
# '../models' would be created outside the project.
_cwd = os.getcwd()
_project_dir = os.path.dirname(_cwd) if _cwd.endswith("notebooks") else _cwd
models_dir = os.path.join(_project_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

best_model = results[best_model_name]['model']
best_scores = results[best_model_name]
model_path = os.path.join(models_dir, 'yield_predictor.pkl')

joblib.dump(best_model, model_path)
print(f"✅ Best model ({best_model_name}) saved to {model_path}")

# Save model metadata. Scores are cast to plain floats so the JSON dump does
# not depend on the numeric type the metric functions return.
metadata = {
    'model_name': best_model_name,
    'accuracy': float(best_scores['accuracy']),
    'precision': float(best_scores['precision']),
    'recall': float(best_scores['recall']),
    'f1': float(best_scores['f1']),
    'auc': float(best_scores['auc']),
    'training_time': float(best_scores['training_time']),
    'n_features': int(X.shape[1]),
    'n_samples': int(X.shape[0])
}

metadata_path = os.path.join(models_dir, 'model_metadata.json')
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("✅ Model metadata saved")
print("📊 Final Model Performance:")
print(f"   Accuracy: {metadata['accuracy']:.3f}")
print(f"   F1 Score: {metadata['f1']:.3f}")
print(f"   AUC Score: {metadata['auc']:.3f}")
print(f"   Features: {metadata['n_features']}")
print(f"   Samples: {metadata['n_samples']}")
