In [1]:
import time
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
main_df = pd.read_csv('eq_database_place.csv')
dummy_eq = main_df.copy()

In [3]:
dummy_df = dummy_eq[dummy_eq['Place'].str.contains('JP')]
# dummy_df = dummy_df.reindex(np.random.permutation(dummy_df.index))
dummy_df.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Place
55,02/16/1965,12:24:10,38.908,142.095,Earthquake,53.5,,,5.7,MW,...,,,,,ISCGEM860246,ISCGEM,ISCGEM,ISCGEM,Automatic,"Ofunato, JP"
89,03/16/1965,16:46:17,40.697,143.032,Earthquake,32.1,,,6.4,MW,...,,,,,ISCGEM858655,ISCGEM,ISCGEM,ISCGEM,Automatic,"Kuji, JP"
101,03/29/1965,10:47:38,40.687,142.915,Earthquake,30.0,,,6.4,MW,...,,,,,ISCGEM858966,ISCGEM,ISCGEM,ISCGEM,Automatic,"Kuji, JP"
111,04/06/1965,05:31:59,36.083,139.968,Earthquake,50.0,,,5.7,MW,...,,,,,ISCGEM857506,ISCGEM,ISCGEM,ISCGEM,Automatic,"Ishige, JP"
122,04/15/1965,05:09:50,25.08,122.897,Earthquake,165.0,,,5.6,MW,...,,,,,ISCGEM857769,ISCGEM,ISCGEM,ISCGEM,Automatic,"Yonakuni, JP"


## NaN removing function

In [4]:
def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.
    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x= nan_helper(y)
        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
    """
    
    return np.isnan(y), lambda z: z.nonzero()[0]

## String label encoding

In [5]:
def label_integer_encoder(my_df, series_name):
    """
    This function is encoding values of a series
    Parameter
    ---------
    * `my_df`: Pandas dataframe
    * `series_name`: Pandas series name to encode
    Returns : a encoded array
    """
    arr_name = np.array(list(my_df[str(series_name)]))
    label_arr_encoder = LabelEncoder()
    integer_arr_encoded = label_arr_encoder.fit_transform(arr_name)
    
    return integer_arr_encoded

## Interpolation function

In [6]:
def get_interpolation(my_df, nan_series):
    arr_series = np.array(my_df[str(nan_series)])
    nans, x = nan_helper(arr_series)
    arr_series[nans] = np.interp(x(nans), x(~nans), arr_series[~nans])
    return arr_series.round(2)

## Removing NaN values from the series

In [7]:
dummy_df['Depth Error'] = get_interpolation(dummy_df, 'Depth Error')
dummy_df['Depth Seismic Stations'] = get_interpolation(dummy_df, 'Depth Seismic Stations')
dummy_df['Magnitude Error'] = get_interpolation(dummy_df, 'Magnitude Error')
dummy_df['Magnitude Seismic Stations'] = get_interpolation(dummy_df, 'Magnitude Seismic Stations')
dummy_df['Azimuthal Gap'] = get_interpolation(dummy_df, 'Azimuthal Gap')
dummy_df['Horizontal Distance'] = get_interpolation(dummy_df, 'Horizontal Distance')
dummy_df['Horizontal Error'] = get_interpolation(dummy_df, 'Horizontal Error')
dummy_df['Root Mean Square'] = get_interpolation(dummy_df, 'Root Mean Square')

## Actual encoding of strings

In [8]:
dummy_df['Type'] = label_integer_encoder(dummy_df, 'Type')
dummy_df['Magnitude Type'] = label_integer_encoder(dummy_df, 'Magnitude Type')
dummy_df['Place'] = label_integer_encoder(dummy_df, 'Place')
dummy_df['Status'] = label_integer_encoder(dummy_df, 'Status')

## Dropping unwanted

In [9]:
dummy_df = dummy_df.drop(['ID', 'Source', 'Location Source', 'Magnitude Source'], axis=1)

## Time object numerical values

In [10]:
timestamp = []
for d, t in zip(dummy_df['Date'], dummy_df['Time']):
    try:
        ts = dt.datetime.strptime(d + ' ' + t, '%m/%d/%Y %H:%M:%S')
        timestamp.append(time.mktime(ts.timetuple())) # inverse funtion of localtime
    except ValueError as e:
        timestamp.append('ValueError')

time_s = pd.Series(timestamp)
dummy_df['TimeStamp'] = time_s.values
dummy_df = dummy_df.drop(['Date', 'Time'], axis=1)

In [11]:
dummy_df.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,Status,Place,TimeStamp
55,38.908,142.095,0,53.5,3.1,412.0,5.7,2,0.04,7.0,112.5,3.36,5.9,1.0,0,133,-153767150.0
89,40.697,143.032,0,32.1,3.1,412.0,6.4,2,0.04,7.0,112.5,3.36,5.9,1.0,0,81,-151332223.0
101,40.687,142.915,0,30.0,3.1,412.0,6.4,2,0.04,7.0,112.5,3.36,5.9,1.0,0,81,-150230542.0
111,36.083,139.968,0,50.0,3.1,412.0,5.7,2,0.04,7.0,112.5,3.36,5.9,1.0,0,42,-149558281.0
122,25.08,122.897,0,165.0,3.1,412.0,5.6,2,0.04,7.0,112.5,3.36,5.9,1.0,0,229,-148782010.0


## Split into two

In [12]:
X = dummy_df[['Depth', 'Magnitude Error', 'Magnitude Type', 'Place', 'Depth Error', 
              'Azimuthal Gap', 'Horizontal Distance', 'Horizontal Error', 'Root Mean Square']]
y = dummy_df[['Latitude', 'Longitude', 'Magnitude']]

## Train & Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
# display(X_train)
# display(len(X_train))

## Fitting the model -- __RandomForestRegressor__

In [14]:
reg = RandomForestRegressor()
reg.fit(X_train, y_train)
preds = reg.predict(X_test)
print(preds)

[[ 37.1959     141.262        5.82      ]
 [ 39.442      143.1833       6.71      ]
 [ 37.153      141.3507       5.99      ]
 [ 34.90772    137.15577      6.03      ]
 [ 38.92875    142.53005      5.9       ]
 [ 24.4047     122.822        5.87      ]
 [ 41.8598     139.3057       6.03      ]
 [ 37.394      141.3134       5.79      ]
 [ 37.2923     141.3847       5.93      ]
 [ 38.6952     141.0724       5.69      ]
 [ 38.7993     141.5966       5.83      ]
 [ 32.3086     132.5775       5.93      ]
 [ 39.9694     141.5383       5.83      ]
 [ 32.91078    136.13056      5.75      ]
 [ 33.0574     134.9513       5.62      ]
 [ 39.7846     140.2036       5.72      ]
 [ 39.327      143.1444       5.89      ]
 [ 40.0813     143.223        5.75      ]
 [ 33.0379     135.2231       5.82      ]
 [ 40.1179     140.2          5.8       ]
 [ 37.6875     141.3236       5.67      ]
 [ 25.4046     124.8028       5.73      ]
 [ 36.73809    141.82808      5.82      ]
 [ 35.602      139.8231       5.85

In [15]:
accuracy = reg.score(X_test, y_test)
print(accuracy)

0.7400954720686624


## Fitting the model -- __GridSearchCV__

In [16]:
parameters = {'n_estimators' : [13, 18, 43, 77, 45, 450]}
gs = GridSearchCV(reg, parameters)

In [17]:
grid_fit = gs.fit(X_train, y_train)
best_fit = grid_fit.best_estimator_
gs_preds = best_fit.predict(X_test)
print(gs_preds)

[[ 37.41883488 141.43004651   5.7372093 ]
 [ 39.43253488 143.16404651   6.73023256]
 [ 37.23870698 141.24127442   6.06046512]
 [ 35.43711163 138.30184651   5.95116279]
 [ 35.94966163 140.26760426   5.89496124]
 [ 24.44432558 122.83195349   5.89069767]
 [ 37.25795349 137.98211628   5.98372093]
 [ 37.28586047 140.01090698   5.81395349]
 [ 37.72876744 141.29076744   6.02325581]
 [ 38.27093256 141.6584814    5.83953488]
 [ 38.23727907 141.78090698   5.74418605]
 [ 32.2144186  133.9604186    6.0627907 ]
 [ 38.73509302 141.23209302   5.79069767]
 [ 32.31611628 134.30186047   5.75581395]
 [ 32.78198605 134.76067907   5.66976744]
 [ 39.22706977 140.42398605   5.65581395]
 [ 39.07518605 142.87967442   5.9744186 ]
 [ 40.40155814 143.34960465   5.73255814]
 [ 33.51643488 135.78653256   5.82093023]
 [ 36.28973953 139.91808372   5.74418605]
 [ 37.71195349 141.72134884   5.77674419]
 [ 25.74174419 124.98423256   5.73488372]
 [ 34.30799535 141.04760465   6.10232558]
 [ 35.78162791 140.61813953   5.94

In [18]:
gs_accuracy = best_fit.score(X_test, y_test)
print(gs_accuracy)

0.7876639552719702
