In [None]:
# Install required packages (fallback if not already installed).
# pip is launched through sys.executable — the interpreter running this
# kernel — so the packages are installed into the kernel's own environment
# rather than whichever `pip` happens to be first on PATH.
# NOTE(review): versions are unpinned; consider pinning them for reproducibility.
import sys
!{sys.executable} -m pip install pandas numpy scikit-learn

In [None]:
# rockfall-prediction-system/notebooks/03_model_development.py

import pandas as pd
import os
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score

# Read the train/test feature tables and label columns from the
# processed-data directory.
processed_data_dir = '../data/processed'


def _read_processed(filename):
    """Load a single CSV file from `processed_data_dir` as a DataFrame."""
    return pd.read_csv(os.path.join(processed_data_dir, filename))


# Feature matrices stay as DataFrames.
X_train = _read_processed('X_train.csv')
X_test = _read_processed('X_test.csv')

# Label files are flattened to 1-D NumPy arrays.
y_train = _read_processed('y_train.csv').values.ravel()
y_test = _read_processed('y_test.csv').values.ravel()

# One-shot status report: confirmation, both shapes, then a separator rule.
print(
    "Data loaded successfully!",
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    "="*70,
    sep="\n",
)

--- Training Logistic Regression ---
Logistic Regression trained successfully.

--- Training Random Forest ---
Random Forest trained successfully.

--- Training Support Vector Machine ---
Support Vector Machine trained successfully.

--- Evaluating Logistic Regression ---
Accuracy: 0.9710

Classification Report:
              precision    recall  f1-score   support

         Low       0.99      1.00      0.99       250
      Medium       0.97      0.98      0.97       250
        High       0.97      0.96      0.97       250
    Critical       0.96      0.94      0.95       250

    accuracy                           0.97      1000
   macro avg       0.97      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000

--- Evaluating Random Forest ---
Accuracy: 0.9420

Classification Report:
              precision    recall  f1-score   support

         Low       0.98      0.98      0.98       250
      Medium       0.92      0.94      0.93       250
        High  

# Phase 1: Train Multiple Classification Models

We'll evaluate 6 different classification algorithms to find the best baseline model.

In [None]:
# Baseline classifiers with default hyperparameters (a fixed random_state is
# set on the estimators that accept one; max_iter is raised for logistic
# regression).
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Every classifier gets the same two-step pipeline: a StandardScaler step
# named 'scaler' followed by the classifier itself under the step name 'clf'.
models = {
    label: Pipeline([('scaler', StandardScaler()), ('clf', estimator)])
    for label, estimator in classifiers.items()
}

# results:        model name -> {'CV Mean', 'CV Std', 'Test Accuracy'}
# trained_models: model name -> pipeline fitted on the full training set
results = {}
trained_models = {}

print(
    "\n" + "="*70,
    "TRAINING AND EVALUATING MODELS WITH DEFAULT PARAMETERS",
    "="*70 + "\n",
    sep="\n",
)

for name, pipeline in models.items():
    print(f"Training {name}...")

    # Fit on the whole training set and keep the fitted pipeline
    # (Pipeline.fit returns the pipeline itself).
    trained_models[name] = pipeline.fit(X_train, y_train)

    # 5-fold cross-validated accuracy, computed on the training data only.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

    # Predictions for the test split, scored below.
    y_pred = pipeline.predict(X_test)

    scores = results[name] = {
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std(),
        'Test Accuracy': accuracy_score(y_test, y_pred),
    }

    print(f"  ✓ Cross-Validation Accuracy: {scores['CV Mean']:.4f} (+/- {scores['CV Std']:.4f})")
    print(f"  ✓ Test Accuracy: {scores['Test Accuracy']:.4f}\n")

# One row per model, best test accuracy first.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Test Accuracy', ascending=False)

print(
    "\n" + "="*70,
    "MODEL COMPARISON SUMMARY",
    "="*70,
    results_df,
    "="*70,
    sep="\n",
)

# Phase 2: Hyperparameter Tuning for Top Models

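Now we'll use GridSearchCV to find optimal hyperparameters for the top three models from Phase 1 (ranked by test accuracy). Only models that have a parameter grid defined below are tuned; the others keep their default parameters.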

In [None]:
# Select top 3 models for hyperparameter tuning.
# NOTE(review): results_df is ordered by 'Test Accuracy', so the test split
# influences which models get tuned. Rank by 'CV Mean' instead if the test
# set must remain strictly held out.
top_models = results_df.head(3).index.tolist()

print("\n" + "="*70)
print("HYPERPARAMETER TUNING WITH GRIDSEARCHCV")
print("="*70)
print(f"\nTuning top {len(top_models)} models: {', '.join(top_models)}\n")

# Parameter grids keyed by model name. The `clf__` prefix addresses the
# pipeline step named 'clf'. A value is either one dict or a list of dicts;
# GridSearchCV searches every sub-grid of a list.
param_grids = {
    'Random Forest': {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [10, 20, 30, None],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4]
    },
    'Support Vector Machine': [
        # gamma is a kernel coefficient used by the RBF kernel but ignored by
        # the linear kernel, so it is only searched for 'rbf'. Crossing it
        # with 'linear' would refit identical models once per gamma value.
        {
            'clf__kernel': ['rbf'],
            'clf__C': [0.1, 1, 10, 100],
            'clf__gamma': ['scale', 'auto', 0.001, 0.01]
        },
        {
            'clf__kernel': ['linear'],
            'clf__C': [0.1, 1, 10, 100]
        }
    ],
    'Logistic Regression': {
        'clf__C': [0.01, 0.1, 1, 10, 100],
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs', 'saga']
    },
    'K-Nearest Neighbors': {
        'clf__n_neighbors': [3, 5, 7, 9, 11],
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean', 'manhattan']
    }
}


def _grid_size(grid):
    """Return the number of parameter combinations in `grid`.

    `grid` is either a single dict mapping parameter names to candidate lists,
    or a list of such dicts; the sizes of the sub-grids are summed.
    """
    sub_grids = grid if isinstance(grid, list) else [grid]
    return sum(
        int(np.prod([len(candidates) for candidates in sub_grid.values()]))
        for sub_grid in sub_grids
    )


# Perform grid search for top models
tuned_models = {}      # model name -> best estimator refit by GridSearchCV
tuning_results = {}    # model name -> scores, best parameters, improvement

for model_name in top_models:
    if model_name not in param_grids:
        # The header above lists every top model, so make the omission
        # explicit instead of skipping silently.
        print(f"Skipping {model_name}: no parameter grid defined, keeping default parameters.\n")
        continue

    grid = param_grids[model_name]
    print(f"Tuning {model_name}...")
    print(f"  Parameter grid size: {_grid_size(grid)} combinations")

    # This is the same pipeline object that was fitted in Phase 1.
    # GridSearchCV clones the estimator it is given before fitting, so the
    # baseline pipeline stored in `models` / `trained_models` is not modified.
    base_pipeline = models[model_name]

    # Grid search with 3-fold CV (faster than 5-fold)
    grid_search = GridSearchCV(
        base_pipeline,
        grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train, y_train)

    # Store tuned model (the best parameter setting, refit on all of X_train)
    tuned_models[model_name] = grid_search.best_estimator_

    # Evaluate on test set
    y_pred_tuned = grid_search.predict(X_test)
    test_acc_tuned = accuracy_score(y_test, y_pred_tuned)

    tuning_results[model_name] = {
        'Best CV Score': grid_search.best_score_,
        'Test Accuracy': test_acc_tuned,
        'Best Parameters': grid_search.best_params_,
        # Positive means tuning beat the default-parameter baseline on the test set.
        'Improvement': test_acc_tuned - results[model_name]['Test Accuracy']
    }

    print(f"  ✓ Best CV Score: {grid_search.best_score_:.4f}")
    print(f"  ✓ Test Accuracy: {test_acc_tuned:.4f}")
    print(f"  ✓ Improvement: {tuning_results[model_name]['Improvement']:.4f}")
    print(f"  ✓ Best Parameters: {grid_search.best_params_}\n")

print("="*70)
print("TUNING COMPLETE")
print("="*70)

# Phase 3: Select and Save Best Model

In [None]:
# Find the best overall model (tuned or untuned).
# NOTE(review): the winner is chosen by test accuracy, so the accuracy
# reported and saved below is an optimistic estimate for the selected model.
# Consider selecting on cross-validation scores and using the test set only
# for the final report.
all_scores = {}

# Start from the default-parameter scores of every model
for name, res in results.items():
    all_scores[name] = {'Test Accuracy': res['Test Accuracy'], 'Tuned': False}

# A tuned variant replaces the default one only when it is strictly better
for name, res in tuning_results.items():
    if res['Test Accuracy'] > all_scores[name]['Test Accuracy']:
        all_scores[name] = {'Test Accuracy': res['Test Accuracy'], 'Tuned': True}

# Find best model
best_model_name = max(all_scores, key=lambda x: all_scores[x]['Test Accuracy'])
best_accuracy = all_scores[best_model_name]['Test Accuracy']
is_tuned = all_scores[best_model_name]['Tuned']

# 'Tuned' is only ever set for names present in tuning_results, and the tuning
# loop fills tuned_models and tuning_results together, so a tuned winner is
# looked up directly. A missing entry now raises instead of silently saving
# the untuned pipeline under tuned metadata.
if is_tuned:
    best_model_pipeline = tuned_models[best_model_name]
    best_params = tuning_results[best_model_name]['Best Parameters']
else:
    best_model_pipeline = trained_models[best_model_name]
    best_params = None

print("\n" + "="*70)
print("BEST MODEL SELECTION")
print("="*70)
print(f"Best Model: {best_model_name}")
print(f"Test Accuracy: {best_accuracy:.4f}")
print(f"Hyperparameter Tuned: {'Yes' if is_tuned else 'No'}")

if is_tuned:
    print(f"Best Parameters: {best_params}")

print("\n" + "="*70)
print("DETAILED CLASSIFICATION REPORT")
print("="*70)
y_pred_best = best_model_pipeline.predict(X_test)
# NOTE(review): target_names are matched to the labels in sorted order. This
# assumes the encoded labels sort as Low < Medium < High < Critical (e.g.
# integers 0..3) — confirm against the preprocessing step.
print(classification_report(y_test, y_pred_best, target_names=['Low', 'Medium', 'High', 'Critical']))

# Save the best model. exist_ok=True creates the directory if needed without
# a separate existence check (no race if it appears in between).
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'best_model.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(best_model_pipeline, f)

# Also save model metadata alongside the pickled pipeline
metadata = {
    'model_name': best_model_name,
    'test_accuracy': best_accuracy,
    'is_tuned': is_tuned,
    'best_params': best_params
}

metadata_path = os.path.join(models_dir, 'model_metadata.pkl')
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)

print(f"\n✓ Best model saved to {model_path}")
print(f"✓ Model metadata saved to {metadata_path}")
print("="*70)