# Machine Learning Model Training for Black Spot Alert System

This notebook:
1. Loads the processed dataset
2. Performs feature engineering
3. Creates risk labels using quartiles (Low, Medium, High, Very High)
4. Splits data into train/test sets
5. Trains a Random Forest Classifier
6. Evaluates the model (Accuracy, Precision, Recall, F1, Confusion Matrix)
7. Saves the trained model as a `.pkl` file


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
import joblib
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation and undefined-metric warnings. Consider
# narrowing the filter to specific categories.
warnings.simplefilter('ignore')

# Plot defaults: white-grid seaborn style and a 10x6 default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})


## 1. Load Processed Dataset


In [None]:
# Read the processed dataset; the path is relative to the notebook's working directory
df = pd.read_csv("processed_blackspot_dataset.csv")

# Structural summary before any feature work
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
df.head()


## 2. Feature Engineering


In [None]:
def create_features(df):
    """Derive the engineered model features from the processed dataset.

    Works on a copy, so the caller's frame is left untouched.

    Columns read: ``accident_count``, ``traffic_incident_count``,
    ``tourist_density``, ``avg_traffic_volume`` and ``dominant_weather``.

    Columns added:
        total_incidents: accidents plus traffic incidents (missing counts as 0).
        recent_incidents_score: accident count rescaled to 0-10 by the column
            maximum (missing counts as 0; all zeros when no count is positive).
        is_urban: 1 where tourist density exceeds the column median, else 0.
        crowding_level_encoded: 0/1/2 (low/medium/high) bucket of tourist
            density, split at its 33rd and 67th percentiles.
        transport_exposure: traffic volume times accident count, divided by 1000.
        weather_risk: ordinal score of the dominant weather; unlisted or
            missing conditions score 1.

    Args:
        df: DataFrame holding the columns listed above.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.

    Raises:
        KeyError: if one of the required input columns is absent.
    """
    df = df.copy()

    # Missing counts/densities are treated as zero consistently for every
    # derived feature, so no engineered column inherits NaN from its inputs.
    accidents = df['accident_count'].fillna(0)
    traffic_incidents = df['traffic_incident_count'].fillna(0)
    density = df['tourist_density'].fillna(0)

    # Composite incident count
    df['total_incidents'] = accidents + traffic_incidents

    # Risk intensity score: accident count normalised to a 0-10 scale
    peak_accidents = accidents.max()
    if peak_accidents > 0:
        df['recent_incidents_score'] = ((accidents / peak_accidents) * 10).clip(0, 10)
    else:
        df['recent_incidents_score'] = 0.0

    # Urban indicator: density above the median of the observed densities
    df['is_urban'] = (density > df['tourist_density'].median()).astype(int)

    # Crowding level: value <= p33 -> 0, p33 < value <= p67 -> 1, else 2.
    # searchsorted yields the same right-closed buckets as pd.cut but, unlike
    # pd.cut, does not raise when both percentiles coincide (for example when
    # most rows have zero density).
    low_edge, high_edge = density.quantile([0.33, 0.67]).to_numpy()
    df['crowding_level_encoded'] = np.searchsorted(
        [low_edge, high_edge], density.to_numpy(), side='left'
    ).astype(int)

    # Transport exposure: traffic volume x accidents, scaled down by 1000
    df['transport_exposure'] = (
        df['avg_traffic_volume'].fillna(0) * accidents
    ) / 1000

    # Weather risk: ordinal score, defaulting to 1 for unknown/missing weather
    weather_risk_map = {
        'Storm': 3, 'Fog': 2, 'Snow': 2, 'Rain': 1,
        'Clear': 0, 'Unknown': 1
    }
    default_weather_risk = 1
    df['weather_risk'] = df['dominant_weather'].map(
        lambda condition: weather_risk_map.get(condition, default_weather_risk)
    )

    return df

# Run feature engineering, then list every column except the City/State identifiers
df = create_features(df)
print("Features created successfully!")

identifier_columns = ('City', 'State')
non_identifier_columns = [name for name in df.columns if name not in identifier_columns]
print(f"\nNew feature columns: {non_identifier_columns}")
df.head()


## 3. Create Risk Labels (Target Variable)

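We'll use quartiles of `total_incidents` (accidents plus traffic incidents) to create risk categories; a value exactly on a quartile boundary falls into the lower category: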
- Q1 (0-25%) → Low
- Q2 (25-50%) → Medium  
- Q3 (50-75%) → High
- Q4 (75-100%) → Very High


In [None]:
def create_risk_labels(df):
    """Attach a quartile-based ``risk_level`` label to every row.

    The label is derived from ``total_incidents`` (missing values count as 0):
    values up to the 25th percentile are "Low", up to the median "Medium",
    up to the 75th percentile "High", and anything above "Very High".
    Boundary values fall into the lower category.

    The thresholds and the resulting label distribution are printed.

    Args:
        df: DataFrame containing a ``total_incidents`` column.

    Returns:
        A copy of ``df`` with an added ``risk_level`` column of strings.
        The input frame is not modified.

    Raises:
        KeyError: if ``total_incidents`` is not a column of ``df``.
    """
    if 'total_incidents' not in df.columns:
        raise KeyError(
            "create_risk_labels needs a 'total_incidents' column; "
            "run create_features first"
        )

    # Work on a copy so the caller's frame is not mutated (same contract as
    # create_features).
    df = df.copy()
    incident_counts = df['total_incidents'].fillna(0)

    # Quartile thresholds
    q1, q2, q3 = (incident_counts.quantile(level) for level in (0.25, 0.50, 0.75))

    print(f"Risk Label Thresholds:")
    print(f"  Q1 (Low): 0 - {q1:.1f}")
    print(f"  Q2 (Medium): {q1:.1f} - {q2:.1f}")
    print(f"  Q3 (High): {q2:.1f} - {q3:.1f}")
    print(f"  Q4 (Very High): {q3:.1f} - {incident_counts.max():.1f}")

    # Vectorised equivalent of the chain
    #   count <= q1 -> Low, <= q2 -> Medium, <= q3 -> High, else Very High.
    # searchsorted(side='left') returns how many thresholds lie strictly
    # below each count, which is exactly the index of its label.
    risk_names = np.array(["Low", "Medium", "High", "Very High"], dtype=object)
    bucket_index = np.searchsorted(
        [q1, q2, q3], incident_counts.to_numpy(), side='left'
    )
    df['risk_level'] = risk_names[bucket_index]

    # Display distribution
    print(f"\nRisk Level Distribution:")
    print(df['risk_level'].value_counts().sort_index())

    return df

# Attach the quartile-based risk_level column and preview it beside its source columns
df = create_risk_labels(df)

preview_columns = ['City', 'State', 'total_incidents', 'risk_level']
df[preview_columns].head(10)


## 4. Prepare Features for Training


In [None]:
# Candidate predictors. City/State are identifiers and risk_level is the
# target, so none of them appear here.
feature_columns = [
    'accident_count', 'avg_severity', 'max_severity',
    'night_accidents', 'weekend_accidents',
    'night_accident_rate', 'weekend_accident_rate', 'precipitation_rate',
    'avg_temperature', 'avg_visibility', 'avg_wind_speed',
    'tourist_visit_count', 'tourist_density', 'avg_stay_nights',
    'avg_spending', 'avg_satisfaction',
    'avg_traffic_severity', 'max_traffic_severity', 'traffic_incident_count',
    'avg_traffic_volume', 'avg_speed',
    'total_incidents', 'recent_incidents_score', 'is_urban',
    'crowding_level_encoded', 'transport_exposure', 'weather_risk'
]

# risk_level is computed directly from total_incidents, i.e. from
# accident_count + traffic_incident_count. Using that column, its two
# components, or features computed from them as predictors is target leakage:
# the classifier would only re-learn the quartile thresholds and every
# evaluation metric below would be inflated.
# NOTE(review): night_accidents / weekend_accidents look like sub-counts of
# accident_count and may still act as proxies for the label — confirm.
leakage_columns = {
    'total_incidents',
    'accident_count',
    'traffic_incident_count',
    'recent_incidents_score',   # accident_count rescaled to 0-10
    'transport_exposure',       # avg_traffic_volume * accident_count / 1000
}

# Report, rather than silently drop, expected columns missing from the dataset
missing_features = [col for col in feature_columns if col not in df.columns]
if missing_features:
    print(f"Skipping {len(missing_features)} missing columns: {missing_features}")

available_features = [
    col for col in feature_columns
    if col in df.columns and col not in leakage_columns
]
if not available_features:
    raise ValueError("None of the expected feature columns are present in the dataset")

print(f"Selected {len(available_features)} features for training")
print(f"Features: {available_features}")

# Prepare X and y
X = df[available_features].copy()
y = df['risk_level'].copy()

# Impute remaining NaN values with the column median; a column that is entirely
# NaN has no median, so it falls back to 0 instead of reaching the model as NaN.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split, which leaks a little test-set information into training.
X = X.fillna(X.median()).fillna(0)

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts().sort_index()}")


## 5. Train/Test Split


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Per-class counts for each partition
for split_name, split_target in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set target distribution:")
    print(split_target.value_counts().sort_index())


## 6. Train Random Forest Classifier


In [None]:
# Random forest hyperparameters, gathered in one place for easy tuning
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,   # fixed seed for a reproducible forest
    "n_jobs": -1,         # use every available core
}

print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
print("✅ Model training complete!")


## 7. Model Evaluation


In [None]:
separator = "=" * 60

# Predictions on both partitions; the train score is shown only to gauge overfitting
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Macro averaging weights every risk class equally regardless of its size
precision, recall, f1 = (
    scorer(y_test, y_test_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

print(separator)
print("MODEL EVALUATION RESULTS")
print(separator)
print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

print(f"\nPrecision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1-Score (macro): {f1:.4f}")

# Per-class breakdown
print("\n" + separator)
print("PER-CLASS METRICS")
print(separator)
print(classification_report(y_test, y_test_pred))


In [None]:
# Confusion Matrix
# Fix the row/column order explicitly. Without `labels=`, confusion_matrix
# orders classes by the sorted union of true and predicted values, which can
# differ from the classes present in y_test alone (mislabelled or mis-sized
# axes) and is alphabetical rather than ordered by severity.
severity_order = ['Low', 'Medium', 'High', 'Very High']
observed_labels = set(y_test) | set(y_test_pred)
class_names = [label for label in severity_order if label in observed_labels]
# Keep any unexpected label visible instead of silently dropping it
class_names += sorted(observed_labels - set(severity_order))

cm = confusion_matrix(y_test, y_test_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax
)
ax.set_title('Confusion Matrix - Random Forest Classifier')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

# Rows are true labels, columns are predicted labels
print("\nConfusion Matrix:")
print(pd.DataFrame(cm, index=class_names, columns=class_names))


## 8. Feature Importance


In [None]:
# Pair each training column with the forest's importance score, highest first
feature_importance = (
    pd.DataFrame({
        'feature': available_features,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
top_features = feature_importance.head(15)

print("Top 15 Most Important Features:")
print(top_features)

# Horizontal bar chart of the same top-15 slice
# NOTE(review): recent seaborn releases deprecate `palette` without `hue`;
# the warning is hidden by the notebook-wide warnings filter — confirm.
plt.figure(figsize=(10, 8))
sns.barplot(data=top_features, y='feature', x='importance', palette='viridis')
plt.title('Top 15 Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()


## 9. Save Trained Model


In [None]:
# Encoder shipped as metadata alongside the model.
# NOTE(review): the forest itself is fitted on the string labels, and
# LabelEncoder orders classes lexicographically, so the encoded integers do
# not follow severity order — confirm how consumers use it.
risk_classes = ['Low', 'Medium', 'High', 'Very High']
label_encoder = LabelEncoder().fit(risk_classes)

# Quartile cut-offs of total_incidents, stored with the model for reference
total_incidents = df['total_incidents']
risk_thresholds = {
    name: total_incidents.quantile(level)
    for name, level in (("q1", 0.25), ("q2", 0.50), ("q3", 0.75))
}

# Bundle everything a consumer needs into a single artefact
model_package = {
    "model": rf_model,
    "label_encoder": label_encoder,
    "feature_columns": available_features,
    "scaler": None,  # tree-based models do not need feature scaling
    "risk_thresholds": risk_thresholds,
}

model_path = "trained_blackspot_model.pkl"
joblib.dump(model_package, model_path)

print(f"✅ Model saved to: {model_path}")
print("\nModel package contains:")
print("  - Trained Random Forest model")
print("  - Label encoder")
print(f"  - Feature columns: {len(available_features)} features")
print("  - Risk thresholds")


## 10. Test Model Loading

Verify that the saved model can be loaded correctly.


In [None]:
# Reload the package that was just written.
# joblib files are pickles: only load packages from a trusted source, because
# unpickling can execute arbitrary code.
loaded_package = joblib.load(model_path)
loaded_model = loaded_package["model"]
loaded_features = loaded_package["feature_columns"]

print("✅ Model loaded successfully!")
print(f"Loaded model type: {type(loaded_model)}")
print(f"Number of features: {len(loaded_features)}")

# Round-trip integrity: the saved metadata must describe the trained model,
# and the reloaded model must reproduce the in-memory model's predictions.
assert list(loaded_features) == list(available_features), \
    "Saved feature list differs from the one used for training"
assert list(loaded_model.classes_) == list(rf_model.classes_), \
    "Reloaded model exposes different classes than the trained model"

# Select columns by the saved names, the way a consumer of the package would
X_check = X_test[loaded_features]
reloaded_pred = loaded_model.predict(X_check)
in_memory_pred = rf_model.predict(X_check)
n_matching = int((reloaded_pred == in_memory_pred).sum())
print(f"\nRound-trip check: {n_matching}/{len(X_check)} test predictions match the in-memory model")

# Single-sample smoke test. "Correct" compares against the ground truth, so it
# reflects model accuracy on this one row, not whether loading worked.
sample_idx = 0
sample_features = X_check.iloc[sample_idx:sample_idx+1]
prediction = loaded_model.predict(sample_features)[0]
actual = y_test.iloc[sample_idx]

print(f"\nSample Prediction Test:")
print(f"  Actual: {actual}")
print(f"  Predicted: {prediction}")
print(f"  Correct: {actual == prediction}")


: 