In [2]:
# rockfall-prediction-system/notebooks/03_model_development.py

import pandas as pd
import os
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Directory holding the train/test CSV files read by this cell
processed_data_dir = '../data/processed'


def _read_processed(filename):
    """Return the CSV called *filename* in ``processed_data_dir`` as a DataFrame."""
    return pd.read_csv(os.path.join(processed_data_dir, filename))


# Feature tables are kept as DataFrames
X_train = _read_processed('X_train.csv')
X_test = _read_processed('X_test.csv')

# Target tables are flattened into 1-D arrays before being handed to the models
y_train = _read_processed('y_train.csv').values.ravel()
y_test = _read_processed('y_test.csv').values.ravel()

# Candidate models: each classifier sits behind a StandardScaler step
def _scaled(classifier):
    """Wrap *classifier* in a two-step pipeline: 'scaler' then 'clf'."""
    return Pipeline([('scaler', StandardScaler()), ('clf', classifier)])


pipeline_lr = _scaled(LogisticRegression(random_state=42, max_iter=1000))
pipeline_rf = _scaled(RandomForestClassifier(random_state=42))
pipeline_svc = _scaled(SVC(random_state=42))

# Parallel lists: model_names[k] is the display name of pipelines[k]
pipelines = [pipeline_lr, pipeline_rf, pipeline_svc]
model_names = ['Logistic Regression', 'Random Forest', 'Support Vector Machine']

# Populated by the training loop: display name -> fitted pipeline
trained_models = {}

# Fit each candidate pipeline on the training split and register it by name
for label, estimator in zip(model_names, pipelines):
    print(f"--- Training {label} ---")
    estimator.fit(X_train, y_train)
    trained_models[label] = estimator
    print(f"{label} trained successfully.\n")

# Evaluate models
# Score every fitted pipeline on the test split, print its accuracy and
# per-class report, and remember the most accurate one in the best_* names
# (they are read again when the winning model is saved).
# NOTE(review): the winner is picked using the same test split whose accuracy
# is reported, so the reported figure is optimistic as a generalization
# estimate — consider selecting on a validation split or via cross-validation.
best_accuracy = 0.0
best_model_name = ''
best_model_pipeline = None

for name, model in trained_models.items():
    print(f"--- Evaluating {name} ---")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}\n")
    print("Classification Report:")
    # NOTE(review): target_names are matched to the class labels in sorted
    # order; this assumes the encoded labels sort as Low < Medium < High <
    # Critical and that all four classes occur — confirm against preprocessing.
    print(classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High', 'Critical']))

    # The first model evaluated always seeds the candidate, so a usable
    # pipeline is selected even when every accuracy is 0.0 (a bare
    # `accuracy > 0.0` test would leave best_model_pipeline as None).
    # Ties keep the earlier model, as before.
    if best_model_pipeline is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_model_pipeline = model

print(f"\nBest performing model: {best_model_name} with an accuracy of {best_accuracy:.4f}")

# Save the best model
models_dir = '../models'
# exist_ok=True creates the directory (and any missing parents) when needed
# and is a no-op when it already exists, which avoids the race between a
# separate os.path.exists() check and the creation call.
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'best_model.pkl')
# Persist the selected pipeline object as a whole so it can be restored with
# pickle.load(). Only unpickle this file from a trusted location.
with open(model_path, 'wb') as f:
    pickle.dump(best_model_pipeline, f)

print(f"Best model ({best_model_name}) saved to {model_path}")

--- Training Logistic Regression ---
Logistic Regression trained successfully.

--- Training Random Forest ---
Random Forest trained successfully.

--- Training Support Vector Machine ---
Support Vector Machine trained successfully.

--- Evaluating Logistic Regression ---
Accuracy: 0.9710

Classification Report:
              precision    recall  f1-score   support

         Low       0.99      1.00      0.99       250
      Medium       0.97      0.98      0.97       250
        High       0.97      0.96      0.97       250
    Critical       0.96      0.94      0.95       250

    accuracy                           0.97      1000
   macro avg       0.97      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000

--- Evaluating Random Forest ---
Accuracy: 0.9420

Classification Report:
              precision    recall  f1-score   support

         Low       0.98      0.98      0.98       250
      Medium       0.92      0.94      0.93       250
        High  