### Implementation of Cross-Validation & Hyperparameter Tuning

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
# Read the preprocessed earthquake table from CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/content/sample_data/preprocessed_earthquake_data.csv',
)

In [4]:
# Name of the label column to predict.
target = 'Status_Reviewed'
# Raw categorical columns that are kept out of the feature matrix.
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# The target's name suggests it is a one-hot indicator derived from 'Status'.
# Any sibling 'Status_*' indicator would encode the label and leak it into the
# features, so such columns are excluded too (this is a no-op when none exist).
# NOTE(review): the recorded fold accuracies of exactly 1.0 are consistent with
# a leaked feature -- confirm against data.columns that nothing else encodes Status.
status_dummy_cols = [
    col for col in data.columns
    if str(col).startswith('Status_') and col != target
]

# Feature matrix: every column except the label, the raw categoricals and any
# label-derived indicator columns.
X = data.drop(columns=[target] + categorical_cols + status_dummy_cols)
# Label vector; rows with a missing label are dropped later, before fitting.
y = data[target]

In [5]:
# Baseline (untuned) estimators to compare under cross-validation,
# keyed by the name used in the printed report.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    # max_iter=500 matches the LogisticRegression used in the grid search, so
    # the baseline and tuned results differ only by the searched hyper-parameters
    # and the solver has the same iteration budget in both places.
    'LogisticRegression': LogisticRegression(max_iter=500, random_state=42)
}

In [8]:
# Rows with a missing label cannot be scored. The filtering does not depend on
# the model, so it is done once here instead of on every loop iteration.
y_cleaned = y.dropna()
X_cleaned = X.loc[y_cleaned.index]

# Report 10-fold cross-validated accuracy for each baseline model.
for name, model in models.items():
    print(f"\nCross-validation for {name}:")

    # One accuracy score per fold.
    scores = cross_val_score(model, X_cleaned, y_cleaned, cv=10, scoring='accuracy')
    print(f"Accuracy Scores for each fold: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Accuracy Variance: {np.var(scores):.6f}")


Cross-validation for RandomForest:
Accuracy Scores for each fold: [1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.97282099]
Mean Accuracy: 0.9973
Accuracy Variance: 0.000066

Cross-validation for LogisticRegression:
Accuracy Scores for each fold: [0.99672131 1.         1.         1.         1.         1.
 1.         1.         1.         0.97000937]
Mean Accuracy: 0.9967
Accuracy Variance: 0.000080


In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try for the random forest: 3 x 3 x 2 combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

In [10]:
# Hyper-parameter values to try for logistic regression: four regularisation
# strengths crossed with two penalty types, all on the 'liblinear' solver.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

In [11]:
# One grid search per model family, each scored by accuracy under
# 5-fold cross-validation and keyed by the name used in the printed report.
grid_searches = {
    'RandomForest': GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=5,
        scoring='accuracy',
    ),
    'LogisticRegression': GridSearchCV(
        estimator=LogisticRegression(max_iter=500, random_state=42),
        param_grid=param_grid_lr,
        cv=5,
        scoring='accuracy',
    ),
}

In [13]:
# Rows with a missing label cannot be used for fitting. The filtering does not
# depend on the search object, so it is done once here instead of per iteration.
y_cleaned = y.dropna()
X_cleaned = X.loc[y_cleaned.index]

# Fit every grid search and report its best parameter set and mean CV accuracy.
for name, gs in grid_searches.items():
    gs.fit(X_cleaned, y_cleaned)
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {gs.best_score_:.4f}")


Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Best cross-validation accuracy for RandomForest: 0.9972

Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.9972
