In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df =pd.read_csv("C:/Users/Genesis/OneDrive/Desktop/Projects/Earthquake_Predictor/dataset/Japan_Earthquake_1920_2020.csv")
df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2020-12-12T23:49:38.281Z,35.5552,138.6886,182.4,4.0,mb,,108.0,0.968,0.56,...,2021-02-20T16:12:29.040Z,"7 km N of Fujikawaguchiko, Japan",earthquake,6.7,6.0,0.119,19.0,reviewed,us,us
1,2020-12-12T07:18:54.231Z,40.1213,142.0551,44.0,5.5,mww,,69.0,1.008,0.66,...,2021-02-20T16:11:27.040Z,"53 km N of Miyako, Japan",earthquake,6.4,1.9,0.063,24.0,reviewed,us,us
2,2020-12-11T12:58:06.708Z,23.9426,125.3592,26.64,4.8,mb,,98.0,2.206,0.7,...,2021-02-20T16:11:19.040Z,"95 km S of Hirara, Japan",earthquake,3.1,5.9,0.102,30.0,reviewed,us,us
3,2020-12-11T09:48:40.595Z,24.2994,125.2777,19.46,5.1,mww,,75.0,2.073,0.66,...,2021-02-20T16:11:17.040Z,"southwestern Ryukyu Islands, Japan",earthquake,6.1,3.4,0.057,30.0,reviewed,us,us
4,2020-12-10T18:15:08.670Z,24.539,122.0538,66.38,4.9,mww,,32.0,0.554,1.04,...,2021-02-20T16:11:09.040Z,"38 km SE of Yilan, Taiwan",earthquake,5.7,4.5,0.098,10.0,reviewed,us,us


In [3]:
train_df = df[['time','latitude','longitude','depth','mag']]
train_df.head()

Unnamed: 0,time,latitude,longitude,depth,mag
0,2020-12-12T23:49:38.281Z,35.5552,138.6886,182.4,4.0
1,2020-12-12T07:18:54.231Z,40.1213,142.0551,44.0,5.5
2,2020-12-11T12:58:06.708Z,23.9426,125.3592,26.64,4.8
3,2020-12-11T09:48:40.595Z,24.2994,125.2777,19.46,5.1
4,2020-12-10T18:15:08.670Z,24.539,122.0538,66.38,4.9


In [8]:
X = train_df.drop(columns=['mag','time'],axis=1)
X

Unnamed: 0,latitude,longitude,depth
0,35.5552,138.6886,182.40
1,40.1213,142.0551,44.00
2,23.9426,125.3592,26.64
3,24.2994,125.2777,19.46
4,24.5390,122.0538,66.38
...,...,...,...
50546,24.3370,122.5470,25.00
50547,41.1720,143.0540,30.00
50548,23.8010,124.8290,15.00
50549,43.9420,150.5010,35.00


In [9]:
y = train_df['mag']
y

0        4.0
1        5.5
2        4.8
3        5.1
4        4.9
        ... 
50546    6.9
50547    6.5
50548    7.0
50549    6.3
50550    6.4
Name: mag, Length: 50551, dtype: float64

In [10]:
X.shape

(50551, 3)

In [12]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns #holds all the numerical features

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [13]:
X=preprocessor.fit_transform(X)

In [14]:
X.shape

(50551, 3)

In [15]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((40440, 3), (10111, 3))

In [16]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [17]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.5393
- Mean Absolute Error: 0.3842
- R2 Score: 0.0283
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5301
- Mean Absolute Error: 0.3784
- R2 Score: 0.0263


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.5471
- Mean Absolute Error: 0.3929
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5372
- Mean Absolute Error: 0.3869
- R2 Score: -0.0001


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.5393
- Mean Absolute Error: 0.3842
- R2 Score: 0.0283
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5301
- Mean Absolute Error: 0.3784
- R2 Score: 0.0263


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4442
- Mean Absolute Error: 0.3244
- R2 Score: 0.3408
----------------------

In [18]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.143899
6,XGBRegressor,0.133889
5,Random Forest Regressor,0.07229
2,Ridge,0.026287
0,Linear Regression,0.026287
1,Lasso,-0.000138
3,K-Neighbors Regressor,-0.006515
8,AdaBoost Regressor,-0.482015
4,Decision Tree,-0.673394


In [20]:
lin_model = CatBoostRegressor()
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

Learning rate set to 0.073461
0:	learn: 0.5442546	total: 7.99ms	remaining: 7.98s
1:	learn: 0.5415781	total: 14.8ms	remaining: 7.41s
2:	learn: 0.5389829	total: 23.3ms	remaining: 7.76s
3:	learn: 0.5367536	total: 29.9ms	remaining: 7.45s
4:	learn: 0.5349015	total: 36ms	remaining: 7.17s
5:	learn: 0.5334045	total: 44.2ms	remaining: 7.33s
6:	learn: 0.5320659	total: 50.5ms	remaining: 7.16s
7:	learn: 0.5309071	total: 57.5ms	remaining: 7.13s
8:	learn: 0.5299009	total: 63.7ms	remaining: 7.01s
9:	learn: 0.5288084	total: 70ms	remaining: 6.93s
10:	learn: 0.5282265	total: 76.3ms	remaining: 6.86s
11:	learn: 0.5275170	total: 82.1ms	remaining: 6.76s
12:	learn: 0.5268549	total: 89.3ms	remaining: 6.78s
13:	learn: 0.5263896	total: 95.1ms	remaining: 6.7s
14:	learn: 0.5258701	total: 101ms	remaining: 6.62s
15:	learn: 0.5254503	total: 107ms	remaining: 6.59s
16:	learn: 0.5248737	total: 113ms	remaining: 6.55s
17:	learn: 0.5243208	total: 121ms	remaining: 6.58s
18:	learn: 0.5237662	total: 128ms	remaining: 6.6s
19: