###Training the Advanced ML models on Preprocessed Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
earth_df = pd.read_csv('/content/sample_data/preprocessed_earthquake_data.csv')
earth_df.sample(5)

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
15590,-0.216987,0.671481,Earthquake,-0.307934,-0.431457,MWC,2.109888,US,Reviewed,0.580028,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6188,-0.87389,1.112133,Earthquake,3.760577,-0.667832,MB,-0.103839,US,Reviewed,-0.737068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2327,0.493913,0.657713,Earthquake,-0.291628,2.405043,MW,-0.103839,ISCGEM,Automatic,-1.430277,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
363,0.632028,0.848726,Earthquake,-0.204387,1.459543,MW,-0.103839,ISCGEM,Automatic,-1.846202,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7760,1.76262,-1.588942,Earthquake,-0.307934,0.514043,MW,-1.333688,US,Reviewed,-0.529106,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [3]:
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

X = earth_df.drop(columns=[target]+categorical_cols)
y = earth_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

Unnamed: 0,Latitude,Longitude,Depth,Root Mean Square,Year,Day,Month_sin,Month_cos,Hour_sin,Hour_cos,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
16953,-0.078008,0.755211,-0.495461,1.003024,0.787991,-1.116502,-1.215716,0.705254,-0.713894,1.234637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15800,-0.471466,1.014252,1.194724,-1.087719,0.649349,0.844142,1.218537,-0.717954,-1.006043,-0.995938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9014,0.213764,-0.62189,-0.495461,0.511085,-0.321143,0.498146,0.704119,-1.238884,-0.713894,-1.221272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15516,-1.274852,0.310949,-0.495461,-0.534287,0.580028,0.036818,-1.215716,-0.717954,-1.419204,0.006682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
17837,0.01046,0.456978,-0.405774,-0.718764,0.926632,1.074806,1.218537,-0.717954,-1.419204,0.006682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


###1. Advanced Model-1 : GBM Regressor

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

gbr = GradientBoostingRegressor(random_state=42)

param_grid = {
}

grid_search = GridSearchCV(estimator=gbr,
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1,
                           verbose=2)

grid_search.fit(X_train, y_train)

best_gbr = grid_search.best_estimator_

y_pred = best_gbr.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

cv_mae_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_mse_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_squared_error')
cv_r2_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='r2')

print("Best model parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")
print(f"5-Fold CV Mean MSE: {-np.mean(cv_mse_scores):.4f} ± {np.std(cv_mse_scores):.4f}")
print(f"5-Fold CV Mean R2: {np.mean(cv_r2_scores):.4f} ± {np.std(cv_r2_scores):.4f}")

model_filename = 'gbm_regressor_model.pkl'
joblib.dump(best_gbr, model_filename)
print(f"Model saved to {model_filename}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best model parameters: {}
Test MAE: 0.6810
Test MSE: 0.8941
Test R2 score: 0.1394
5-Fold CV Mean MAE: 0.7183 ± 0.0267
5-Fold CV Mean MSE: 0.9400 ± 0.0671
5-Fold CV Mean R2: 0.0582 ± 0.0241
Model saved to gbm_regressor_model.pkl


###Advanced Model-2: catGBM Regressor


In [5]:
from sklearn.preprocessing import LabelEncoder
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

X = earth_df.drop(columns=[target])
y = earth_df[target]

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
from catboost import CatBoostRegressor, Pool

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_cols)

catboost_model = CatBoostRegressor(
    random_seed=42,
    verbose=0
)

param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train, cat_features=categorical_cols)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

cv_mae_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_absolute_error')

print("Best parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")

model_filename = 'catboost_regressor_model.cbm'
best_model.save_model(model_filename)
print(f"Model saved to {model_filename}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.1}
Test MAE: 0.6692
Test MSE: 0.8701
Test R2 score: 0.1625
5-Fold CV Mean MAE: 0.7062 ± 0.0434
Model saved to catboost_regressor_model.cbm
