In [None]:
# Cell 1: Setup
"""
Air Quality Index - Model Training
Notebook 2: Building and Training ML Models
"""

# Numerical, tabular and plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): seaborn is not referenced in any of the cells shown here.
import seaborn as sns
# scikit-learn: data splitting / CV utilities, scaling, the candidate
# regressors compared later in the notebook, and the evaluation metrics.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# NOTE(review): this suppresses every warning for the rest of the kernel
# session, including convergence and deprecation warnings from the libraries
# above. Consider narrowing it to a specific category if warnings matter.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")


In [None]:
# Cell 2: Load Data
"""
Read the preprocessed AQI table from disk and discard the category label.
"""

# Read the preprocessed CSV into a DataFrame
df = pd.read_csv('data/processed/aqi_data_explored.csv')

# Drop the category column when present; errors='ignore' makes this a
# no-op if the column is absent.
df = df.drop(columns=['AQI_Category'], errors='ignore')

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Cell 3: Prepare Data
"""
Separate predictors from the AQI target, hold out a test split,
and standardize the features.
"""

# The 'AQI' column is the target; every other column is a feature
y = df['AQI']
X = df.drop(columns='AQI')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTraining set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Learn the scaling statistics from the training rows only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Data prepared and scaled!")


In [None]:
# Cell 4: Baseline Model
"""
Fit an ordinary linear regression to serve as the reference score.
"""

print("=" * 60, "BASELINE MODEL: LINEAR REGRESSION", "=" * 60, sep="\n")

# Fit on the scaled training data and predict the held-out rows
baseline_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_baseline = baseline_model.predict(X_test_scaled)

# Test-set metrics for the baseline
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))
r2_baseline = r2_score(y_test, y_pred_baseline)

print(
    f"MAE:  {mae_baseline:.2f}\n"
    f"RMSE: {rmse_baseline:.2f}\n"
    f"R²:   {r2_baseline:.4f}"
)


In [None]:
# Cell 5: Multiple Models Comparison
"""
Fit each candidate regressor and record its test-set metrics.
"""

print("=" * 60, "TRAINING MULTIPLE MODELS", "=" * 60, sep="\n")

# Candidate estimators, keyed by the display name used in later cells
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0)),
    ('Lasso Regression', Lasso(alpha=1.0)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=10)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
])

# name -> fitted model, its test predictions, and MAE / RMSE / R2
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # fit() returns the estimator itself, so fitting and predicting chain
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = dict(model=model, predictions=y_pred, MAE=mae, RMSE=rmse, R2=r2)

    print(f"  MAE:  {mae:.2f}\n  RMSE: {rmse:.2f}\n  R²:   {r2:.4f}")


In [None]:
# Cell 6: Model Comparison Visualization
"""
Tabulate the per-model metrics and plot them side by side.

Reads `results` (model name -> dict with 'MAE', 'RMSE', 'R2') and produces
`comparison_df`, which later cells use to pick the best model. The figure is
written to screenshots/model_comparison.png; the directory is created first
so the save does not fail on a fresh checkout.
"""

import os

# One row per model; the explicit column list keeps the frame well-formed
# even if `results` is empty.
comparison_df = pd.DataFrame(
    [
        {'Model': model_name, 'MAE': entry['MAE'], 'RMSE': entry['RMSE'], 'R2': entry['R2']}
        for model_name, entry in results.items()
    ],
    columns=['Model', 'MAE', 'RMSE', 'R2'],
)

print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

# (metric column, bar color, panel title, y-axis label) for each subplot
panels = [
    ('MAE', 'coral', 'Mean Absolute Error', 'MAE'),
    ('RMSE', 'steelblue', 'Root Mean Squared Error', 'RMSE'),
    ('R2', 'green', 'R² Score', 'R²'),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Same bar-chart recipe for every metric, one panel each
for ax, (metric, color, title, ylabel) in zip(axes, panels):
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=ax, color=color, legend=False)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(alpha=0.3)

plt.tight_layout()

# savefig does not create missing parent directories
os.makedirs('screenshots', exist_ok=True)
plt.savefig('screenshots/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Cell 7: Select Best Model
"""
Pick the model with the highest test-set R² and report its metrics.
"""

# Index the R² column by model name so idxmax returns the name directly
best_model_name = comparison_df.set_index('Model')['R2'].idxmax()

best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

print("=" * 60, f"BEST MODEL: {best_model_name}", "=" * 60, sep="\n")
print(
    f"MAE:  {best_entry['MAE']:.2f}\n"
    f"RMSE: {best_entry['RMSE']:.2f}\n"
    f"R²:   {best_entry['R2']:.4f}"
)


In [None]:
# Cell 8: Hyperparameter Tuning
"""
Fine-tune the best model using GridSearchCV

A search grid is looked up by `best_model_name`. When a grid exists, an
unfitted copy of the selected estimator is searched with 3-fold CV on the
scaled training data, and `best_model` / `best_predictions` are replaced by
the tuned estimator and its test-set predictions. When no grid is defined
for the selected model, the cell says so and leaves `best_model` untouched
instead of silently doing nothing.
"""

from sklearn.base import clone

print("="*60)
print("HYPERPARAMETER TUNING")
print("="*60)

# Search space per model name (keys match the `models` dict from Cell 5).
# Linear Regression is deliberately absent: no grid is defined for it.
param_grids = {
    'Ridge Regression': {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 1.0, 10.0]
    },
    'Decision Tree': {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }
}

param_grid = param_grids.get(best_model_name)

if param_grid is None:
    print(f"No tuning grid defined for {best_model_name}; keeping the untuned model.")
else:
    print("Performing Grid Search... (this may take a few minutes)")
    grid_search = GridSearchCV(
        # Unfitted copy with the same constructor parameters (including
        # random_state) as the model selected in Cell 7.
        clone(best_model),
        param_grid,
        cv=3,
        scoring='r2',
        n_jobs=-1
    )

    grid_search.fit(X_train_scaled, y_train)

    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    # Use tuned model
    best_model = grid_search.best_estimator_
    best_predictions = best_model.predict(X_test_scaled)

    print(f"\nTuned Model Performance:")
    print(f"MAE:  {mean_absolute_error(y_test, best_predictions):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, best_predictions)):.2f}")
    print(f"R²:   {r2_score(y_test, best_predictions):.4f}")

In [None]:
# Cell 9: Cross-Validation
"""
Score the selected model with 5-fold cross-validation on the training data.
"""

print("=" * 60, "CROSS-VALIDATION", "=" * 60, sep="\n")

# One R² value per fold
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='r2')

print(f"Cross-validation R² scores: {cv_scores}")
# The spread is reported as two standard deviations of the fold scores
print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")


In [None]:
# Cell 10: Save Model
"""
Save the trained model for deployment

Writes `best_model` and the fitted `scaler` under models/. The directory is
created if missing so the dump does not fail on a fresh checkout. The model
file name is fixed; if the selected estimator is not a random forest, a
notice is printed so the mismatch is visible.
"""

import os

import joblib

# joblib.dump does not create missing parent directories
os.makedirs('models', exist_ok=True)

# Save model and scaler
joblib.dump(best_model, 'models/random_forest_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

print("✅ Model and scaler saved successfully!")
print("   - models/random_forest_model.pkl")
print("   - models/scaler.pkl")

# The file name is left unchanged so existing loaders still find it;
# flag the case where its contents are a different estimator type.
if not isinstance(best_model, RandomForestRegressor):
    print(f"   Note: the saved estimator is a {type(best_model).__name__}, "
          "not a random forest, despite the file name.")