# Model Design - {{experimentName}}

**MLE-Star Stage 1: Model Design and Architecture**

This notebook covers the model design phase of the MLE-Star methodology:
- Problem definition and success metrics
- Data exploration and understanding
- Model architecture selection
- Baseline model implementation

**Framework:** {{framework}}

**Date:** {{date}}


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml

# Set style for plots.
# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn'. Use whichever one is installed so
# this cell does not raise OSError on older environments, and fall back to
# matplotlib's built-in 'default' style if neither is present.
PLOT_STYLE = next(
    (style for style in ('seaborn-v0_8', 'seaborn') if style in plt.style.available),
    'default',
)
plt.style.use(PLOT_STYLE)
sns.set_palette('husl')

# Configure display options: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

## 1. Load Configuration


In [None]:
# Read the project configuration (YAML) that parameterises this notebook
config_path = Path('../configs/config.yaml')
config = yaml.safe_load(config_path.read_text())

# Echo the settings that determine which code paths run below
print("Project Configuration:")
for label, (section, key) in {
    "Experiment": ("experiment", "name"),
    "Framework": ("model", "framework"),
    "Model Type": ("model", "type"),
}.items():
    print(f"{label}: {config[section][key]}")

# Seed NumPy's global RNG so the synthetic data generated later is repeatable
np.random.seed(config['data']['random_seed'])

## 2. Problem Definition


### Problem Statement
Define your machine learning problem here:

**Problem Type:** [Classification/Regression/Clustering/etc.]

**Objective:** [What are you trying to predict or optimize?]

**Success Metrics:** [How will you measure success?]
- Primary metric: 
- Secondary metrics: 

**Business Impact:** [Why is this problem important?]


In [None]:
# Problem definition constants (edit these for your project)
PROBLEM_TYPE = "classification"  # one of: classification, regression, clustering, ...
TARGET_VARIABLE = "target"       # column that holds the label
FEATURE_COLUMNS = []             # feature column names (left empty in this template)

# Evaluation metrics
PRIMARY_METRIC = "accuracy"      # headline metric
SECONDARY_METRICS = ["precision", "recall", "f1_score"]

# Echo the chosen settings so they are visible in the notebook output
for label, value in (
    ("Problem Type", PROBLEM_TYPE),
    ("Target Variable", TARGET_VARIABLE),
    ("Primary Metric", PRIMARY_METRIC),
    ("Secondary Metrics", SECONDARY_METRICS),
):
    print(f"{label}: {value}")

## 3. Data Loading and Exploration


In [None]:
# Locate the raw data directory named in the configuration
data_path = Path(config['data']['raw_data_path'])
print(f"Data path: {data_path}")

# TODO: Load your dataset
# Example:
# df = pd.read_csv(data_path / 'data.csv')
# print(f"Data shape: {df.shape}")

# Until a real dataset is wired in, build a small synthetic one
n_samples, n_features = 1000, 10

# Standard-normal features; the binary label is 1 when the sum of the first
# two features plus a little Gaussian noise is positive
X = np.random.randn(n_samples, n_features)
noise = 0.1 * np.random.randn(n_samples)
y = (X[:, 0] + X[:, 1] + noise > 0).astype(int)

# Assemble features and label into one DataFrame (label is the last column)
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names).assign(**{TARGET_VARIABLE: y})

print(f"Dataset shape: {df.shape}")
print(f"Features: {feature_names[:5]}...")  # only the first 5 names
print(f"Target distribution:\n{df[TARGET_VARIABLE].value_counts()}")

In [None]:
# Basic data exploration
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Plot the target and the first three features on a 2x2 grid of histograms
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.ravel()  # row-major order: [0,0], [0,1], [1,0], [1,1]

# Top-left panel: target distribution
target_ax = panels[0]
df[TARGET_VARIABLE].hist(ax=target_ax, bins=20)
target_ax.set_title('Target Distribution')
target_ax.set_xlabel(TARGET_VARIABLE)

# Remaining panels: one histogram per feature (first few features only)
for feature_ax, feature in zip(panels[1:], feature_names[:3]):
    df[feature].hist(ax=feature_ax, bins=30, alpha=0.7)
    feature_ax.set_title(f'Distribution of {feature}')
    feature_ax.set_xlabel(feature)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Restrict to numeric/boolean columns: DataFrame.corr() raises on string
# columns in pandas >= 2.0, which would break this cell as soon as a real
# dataset with categorical columns replaces the synthetic demo frame.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', ax=corr_ax)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.tight_layout()
plt.show()

# Show features most correlated with target.
# The target's correlation with itself is always 1.0, so drop that row;
# otherwise it always tops the ranking and pushes out a real feature.
target_corr = (
    correlation_matrix[TARGET_VARIABLE]
    .drop(TARGET_VARIABLE)
    .abs()
    .sort_values(ascending=False)
)
print(f"Features most correlated with {TARGET_VARIABLE}:")
print(target_corr.head(10))

## 4. Model Architecture Selection


In [None]:
# Framework-specific imports: only the libraries for the configured framework
# are imported, so the other frameworks do not need to be installed.
framework = config['model']['framework']

if framework == 'pytorch':
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    print(f"PyTorch version: {torch.__version__}")

elif framework == 'tensorflow':
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    print(f"TensorFlow version: {tf.__version__}")

elif framework == 'scikit-learn':
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.metrics import classification_report, confusion_matrix
    import sklearn
    print(f"Scikit-learn version: {sklearn.__version__}")

else:
    # Fail here with a clear message instead of silently importing nothing and
    # letting a later cell crash with an unrelated error.
    raise ValueError(
        f"Unsupported framework {framework!r} in config['model']['framework']; "
        "expected one of 'pytorch', 'tensorflow', 'scikit-learn'"
    )

In [None]:
# Model architecture design based on framework
def create_model_architecture():
    """Create the model architecture for the configured framework.

    Reads the notebook-level names ``framework``, ``config``,
    ``feature_names``, ``df`` and ``TARGET_VARIABLE`` (plus the
    framework-specific imports made in the preceding cell).

    Returns:
        * ``'pytorch'``: an ``MLPModel`` (``nn.Module``) that outputs one
          logit per class.
        * ``'tensorflow'``: a ``keras.Sequential`` MLP that outputs one
          softmax probability per class.
        * ``'scikit-learn'``: a dict mapping a display name to an unfitted
          estimator, so several baselines can be compared.

    Raises:
        ValueError: if ``framework`` is not one of the three supported values,
            or if ``config['model']['hidden_layers']`` is empty for a neural
            network framework.
    """
    dropout_rate = 0.2  # shared by the PyTorch and TensorFlow networks

    if framework in ('pytorch', 'tensorflow'):
        input_size = len(feature_names)
        hidden_layers = config['model']['hidden_layers']
        if not hidden_layers:
            raise ValueError(
                "config['model']['hidden_layers'] must list at least one layer size"
            )
        # One output unit per distinct (non-missing) label value
        output_size = df[TARGET_VARIABLE].nunique()

    if framework == 'pytorch':
        class MLPModel(nn.Module):
            """MLP: (Linear -> ReLU -> Dropout) per hidden layer, then a linear output."""

            def __init__(self, input_size, hidden_layers, output_size, dropout_rate=0.2):
                super().__init__()
                # Consecutive pairs of widths give each hidden Linear layer's
                # (in_features, out_features)
                widths = [input_size, *hidden_layers]
                blocks = []
                for in_features, out_features in zip(widths, widths[1:]):
                    blocks.append(nn.Linear(in_features, out_features))
                    blocks.append(nn.ReLU())
                    blocks.append(nn.Dropout(dropout_rate))

                # Output layer emits raw logits (no activation)
                blocks.append(nn.Linear(widths[-1], output_size))

                self.model = nn.Sequential(*blocks)

            def forward(self, x):
                return self.model(x)

        model = MLPModel(input_size, hidden_layers, output_size, dropout_rate)
        print("PyTorch Model Architecture:")
        print(model)

    elif framework == 'tensorflow':
        model = keras.Sequential([keras.Input(shape=(input_size,))])

        for hidden_size in hidden_layers:
            model.add(layers.Dense(hidden_size, activation='relu'))
            model.add(layers.Dropout(dropout_rate))

        # Output layer: one unit per class with softmax, including the binary
        # case. The previous 'sigmoid' over two units produced two independent
        # scores rather than a class distribution and did not match the
        # PyTorch branch, which also emits one output per class.
        model.add(layers.Dense(output_size, activation='softmax'))

        print("TensorFlow Model Architecture:")
        model.summary()

    elif framework == 'scikit-learn':
        seed = config['data']['random_seed']

        # Several candidate estimators, all returned for comparison
        model = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=seed),
            'Logistic Regression': LogisticRegression(random_state=seed),
            'SVM': SVC(kernel='rbf', random_state=seed),
        }

        print("Scikit-learn Model Options:")
        for name, estimator in model.items():
            print(f"- {name}: {type(estimator).__name__}")

    else:
        # Without this branch `model` would be unbound at the return below
        raise ValueError(
            f"Unsupported framework {framework!r}; expected one of "
            "'pytorch', 'tensorflow', 'scikit-learn'"
        )

    return model

# Build the architecture once. For 'scikit-learn' this is a dict of named
# estimators, which the baseline-training cell iterates over.
model_architecture = create_model_architecture()

## 5. Baseline Model Implementation


In [None]:
# Split data for baseline model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix and label vector as plain NumPy arrays
X = df[feature_names].to_numpy()
y = df[TARGET_VARIABLE].to_numpy()

# Hold out 30% of the rows, then halve that hold-out into validation and test
# (70 / 15 / 15 overall). Both splits are stratified on the label and use the
# configured seed.
split_seed = config['data']['random_seed']
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed, stratify=y_temp
)

for split_label, split_features in (
    ("Training set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
):
    print(f"{split_label}: {split_features.shape}")

# Scale features: statistics are fitted on the training split only and then
# applied unchanged to validation and test
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")

In [None]:
# Train and evaluate baseline models
from sklearn.metrics import accuracy_score, classification_report

def train_baseline_model():
    """Fit and score the baseline estimators for the configured framework.

    Only the 'scikit-learn' framework is trained here. Every estimator in the
    notebook-level ``model_architecture`` dict is fitted on the scaled
    training split and scored by accuracy on the training and validation
    splits.

    Returns:
        For 'scikit-learn', a dict keyed by model name whose values hold the
        fitted ``'model'``, ``'train_accuracy'``, ``'val_accuracy'`` and
        ``'val_predictions'``. For any other framework, ``None`` (after
        printing a notice).
    """
    # Guard clause: other frameworks are handled in a later notebook
    if framework != 'scikit-learn':
        print(f"Baseline training for {framework} will be implemented in training pipeline notebook")
        return None

    results = {}
    for name, estimator in model_architecture.items():
        print(f"\nTraining {name}...")

        # Fit in place, then predict on both splits
        estimator.fit(X_train_scaled, y_train)
        train_predictions = estimator.predict(X_train_scaled)
        val_predictions = estimator.predict(X_val_scaled)

        train_accuracy = accuracy_score(y_train, train_predictions)
        val_accuracy = accuracy_score(y_val, val_predictions)

        results[name] = dict(
            model=estimator,
            train_accuracy=train_accuracy,
            val_accuracy=val_accuracy,
            val_predictions=val_predictions,
        )

        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")

    return results

# Train baseline models. The result is a dict of per-model metrics for
# 'scikit-learn' and None for every other framework, so later cells guard
# on `if baseline_results:`.
baseline_results = train_baseline_model()

In [None]:
# Compare the baselines side by side (skipped when nothing was trained)
if baseline_results:
    print("\nBaseline Model Comparison:")
    print("-" * 50)

    # One row per model; 'Overfitting' is the train-minus-validation accuracy gap
    comparison_data = [
        {
            'Model': name,
            'Train Accuracy': f"{result['train_accuracy']:.4f}",
            'Val Accuracy': f"{result['val_accuracy']:.4f}",
            'Overfitting': f"{result['train_accuracy'] - result['val_accuracy']:.4f}",
        }
        for name, result in baseline_results.items()
    ]
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))

    # The best model is the one with the highest validation accuracy
    best_model_name = max(
        baseline_results,
        key=lambda candidate: baseline_results[candidate]['val_accuracy'],
    )
    best_entry = baseline_results[best_model_name]
    best_model = best_entry['model']

    print(f"\nBest baseline model: {best_model_name}")
    print(f"Best validation accuracy: {best_entry['val_accuracy']:.4f}")

In [None]:
# Per-class report and confusion matrix for the best baseline
if baseline_results:
    from sklearn.metrics import confusion_matrix

    print(f"\nDetailed Analysis of {best_model_name}:")

    # Validation predictions stored when the model was trained
    val_pred = baseline_results[best_model_name]['val_predictions']

    print("\nClassification Report:")
    print(classification_report(y_val, val_pred))

    # Confusion matrix drawn as an annotated heatmap
    cm = confusion_matrix(y_val, val_pred)

    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f'Confusion Matrix - {best_model_name}')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')
    plt.show()

## 6. Model Design Summary


In [None]:
# Create model design summary: a plain dict of built-in types so it can be
# written as YAML and read back with yaml.safe_load
design_summary = {
    'experiment_name': config['experiment']['name'],
    'framework': framework,
    'problem_type': PROBLEM_TYPE,
    'target_variable': TARGET_VARIABLE,
    'n_features': len(feature_names),
    'n_samples': len(df),
    'data_splits': {
        'train': len(X_train),
        'validation': len(X_val),
        'test': len(X_test)
    },
    'primary_metric': PRIMARY_METRIC,
    'secondary_metrics': SECONDARY_METRICS,
    'baseline_performance': {},
    'next_steps': [
        'Implement full training pipeline',
        'Perform hyperparameter tuning',
        'Conduct thorough model evaluation',
        'Implement systematic testing'
    ]
}

if baseline_results:
    design_summary['baseline_performance'] = {
        # Cast to a built-in float: the score may be a NumPy scalar, which
        # yaml.dump would serialise as a `!!python/object/apply:numpy...` tag
        # that yaml.safe_load cannot read back.
        name: {'val_accuracy': float(result['val_accuracy'])}
        for name, result in baseline_results.items()
    }
    design_summary['best_baseline'] = best_model_name

# Serialise once with the safe dumper so the printed text and the saved file
# are identical, and any non-plain type fails loudly instead of being written
# as a Python-specific tag.
summary_yaml = yaml.safe_dump(design_summary, default_flow_style=False)

print("Model Design Summary:")
print(summary_yaml)

# Save design summary
summary_path = Path('../outputs/reports/model_design_summary.yaml')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_path.write_text(summary_yaml, encoding='utf-8')

print(f"\nModel design summary saved to: {summary_path}")

## 7. Next Steps

Based on the model design analysis, the recommended next steps are:

1. **Learning Pipeline (Stage 2)**: Implement comprehensive data preprocessing and feature engineering
2. **Evaluation Setup (Stage 3)**: Define detailed evaluation metrics and validation strategies
3. **Systematic Testing (Stage 4)**: Create unit tests for model components
4. **Training Optimization (Stage 5)**: Implement hyperparameter tuning and model optimization
5. **Analysis Validation (Stage 6)**: Perform model interpretability and validation analysis
6. **Refinement Deployment (Stage 7)**: Prepare final model for deployment

**Key Insights from Model Design** (replace each placeholder below with your findings after running the notebook):
- Data shape and characteristics
- Feature importance and correlations
- Baseline model performance benchmarks
- Framework-specific implementation considerations

Continue to the next notebook: `02_training_pipeline.ipynb`
