In [1]:
import time
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
main_df = pd.read_csv('eq_database_place.csv')
dummy_eq = main_df.copy()

In [3]:
dummy_df = dummy_eq[dummy_eq['Place'].str.contains('US')]
# dummy_df = dummy_df.reindex(np.random.permutation(dummy_df.index))
dummy_df.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Place
123,04/16/1965,23:22:21,64.572,-160.375,Earthquake,15.0,,,6.0,MW,...,,,,,ISCGEM857809,ISCGEM,ISCGEM,ISCGEM,Automatic,"Golovin, US"
131,04/26/1965,20:29:07,54.157,-162.59,Earthquake,36.8,,,5.6,MW,...,,,,,ISCGEM858049,ISCGEM,ISCGEM,ISCGEM,Automatic,"Aleutians East Borough, US"
136,04/29/1965,15:28:45,47.288,-122.406,Earthquake,64.7,,,6.7,MW,...,,,,,ISCGEM858143,ISCGEM,ISCGEM,ISCGEM,Automatic,"Tacoma, US"
138,05/01/1965,21:27:54,60.35,-146.176,Earthquake,15.0,,,5.6,MW,...,,,,,ISCGEM856572,ISCGEM,ISCGEM,ISCGEM,Automatic,"Cordova, US"
184,06/23/1965,11:09:17,56.543,-152.948,Earthquake,20.0,,,6.5,MW,...,,,,,ISCGEM856357,ISCGEM,ISCGEM,ISCGEM,Automatic,"Uhaiak (historical), US"


## NaN removing function

In [4]:
def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.
    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x= nan_helper(y)
        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
    """
    
    return np.isnan(y), lambda z: z.nonzero()[0]

## String label encoding

In [5]:
def label_integer_encoder(my_df, series_name):
    """
    This function is encoding values of a series
    Parameter
    ---------
    * `my_df`: Pandas dataframe
    * `series_name`: Pandas series name to encode
    Returns : a encoded array
    """
    arr_name = np.array(list(my_df[str(series_name)]))
    label_arr_encoder = LabelEncoder()
    integer_arr_encoded = label_arr_encoder.fit_transform(arr_name)
    
    return integer_arr_encoded

## Interpolation function

In [6]:
def get_interpolation(my_df, nan_series):
    arr_series = np.array(my_df[str(nan_series)])
    nans, x = nan_helper(arr_series)
    arr_series[nans] = np.interp(x(nans), x(~nans), arr_series[~nans])
    return arr_series.round(2)

## Removing NaN values from the series

In [7]:
dummy_df['Depth Error'] = get_interpolation(dummy_df, 'Depth Error')
dummy_df['Depth Seismic Stations'] = get_interpolation(dummy_df, 'Depth Seismic Stations')
dummy_df['Magnitude Error'] = get_interpolation(dummy_df, 'Magnitude Error')
dummy_df['Magnitude Seismic Stations'] = get_interpolation(dummy_df, 'Magnitude Seismic Stations')
dummy_df['Azimuthal Gap'] = get_interpolation(dummy_df, 'Azimuthal Gap')
dummy_df['Horizontal Distance'] = get_interpolation(dummy_df, 'Horizontal Distance')
dummy_df['Horizontal Error'] = get_interpolation(dummy_df, 'Horizontal Error')
dummy_df['Root Mean Square'] = get_interpolation(dummy_df, 'Root Mean Square')

## Actual encoding of strings

In [8]:
dummy_df['Type'] = label_integer_encoder(dummy_df, 'Type')
dummy_df['Magnitude Type'] = label_integer_encoder(dummy_df, 'Magnitude Type')
dummy_df['Place'] = label_integer_encoder(dummy_df, 'Place')
dummy_df['Status'] = label_integer_encoder(dummy_df, 'Status')

## Dropping unwanted

In [9]:
dummy_df = dummy_df.drop(['ID', 'Source', 'Location Source', 'Magnitude Source'], axis=1)

## Time object numerical values

In [10]:
timestamp = []
for d, t in zip(dummy_df['Date'], dummy_df['Time']):
    try:
        ts = dt.datetime.strptime(d + ' ' + t, '%m/%d/%Y %H:%M:%S')
        timestamp.append(time.mktime(ts.timetuple())) # inverse funtion of localtime
    except ValueError as e:
        timestamp.append('ValueError')

time_s = pd.Series(timestamp)
dummy_df['TimeStamp'] = time_s.values
dummy_df = dummy_df.drop(['Date', 'Time'], axis=1)

In [11]:
dummy_df.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,Status,Place,TimeStamp
123,64.572,-160.375,0,15.0,31.61,16.0,6.0,5,0.24,10.0,261.0,1.48,99.0,0.86,0,55,-148630059.0
131,54.157,-162.59,0,36.8,31.61,16.0,5.6,5,0.24,10.0,261.0,1.48,99.0,0.86,0,4,-147776453.0
136,47.288,-122.406,0,64.7,31.61,16.0,6.7,5,0.24,10.0,261.0,1.48,99.0,0.86,0,123,-147535275.0
138,60.35,-146.176,0,15.0,31.61,16.0,5.6,5,0.24,10.0,261.0,1.48,99.0,0.86,0,33,-147340926.0
184,56.543,-152.948,0,20.0,31.61,16.0,6.5,5,0.24,10.0,261.0,1.48,99.0,0.86,0,129,-142798843.0


## Split into two

In [12]:
X = dummy_df[['Latitude', 'Longitude', 'Magnitude Error', 'Magnitude Type', 'Depth Error', 
              'Azimuthal Gap', 'Horizontal Distance', 'Horizontal Error', 'Root Mean Square']]
y = dummy_df[['Magnitude', 'Depth']]

## Train & Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

## Fitting the model -- __RandomForestRegressor__

In [14]:
reg = RandomForestRegressor()
reg.fit(X_train, y_train)
preds = reg.predict(X_test)
print(preds)

[[  6.082   11.0038]
 [  6.11    32.26  ]
 [  6.21    43.67  ]
 [  5.854    8.523 ]
 [  5.66    33.    ]
 [  5.735    7.2687]
 [  5.84    23.7   ]
 [  5.8    116.24  ]
 [  5.99    37.48  ]
 [  5.61    78.07  ]
 [  6.16    14.09  ]
 [  5.886    9.5164]
 [  5.58    34.    ]
 [  6.03    38.06  ]
 [  5.8     45.92  ]
 [  6.34    29.77  ]
 [  5.65    39.31  ]
 [  5.87    32.97  ]
 [  5.835    8.4034]
 [  5.84    43.57  ]
 [  6.087    8.202 ]
 [  5.58    11.4781]
 [  6.069   10.9253]
 [  5.86    27.2   ]
 [  5.79    36.65  ]
 [  5.91   112.4   ]
 [  5.905    6.4128]
 [  5.98    41.04  ]
 [  5.74    44.08  ]
 [  5.76    56.99  ]
 [  5.8     32.03  ]
 [  5.95    68.67  ]
 [  5.68    33.    ]
 [  5.66    30.7   ]
 [  5.6     35.    ]
 [  5.9     14.48  ]
 [  5.86    34.07  ]
 [  5.94    13.7   ]
 [  5.9     63.62  ]
 [  5.82    33.95  ]
 [  5.82     8.3022]
 [  5.86    42.19  ]
 [  5.71    18.803 ]
 [  5.81    29.17  ]
 [  5.7     47.7   ]
 [  5.82    29.85  ]
 [  6.14    30.69  ]
 [  5.599    

In [15]:
accuracy = reg.score(X_test, y_test)
print(accuracy)

0.6481051614857934


## Fitting the model -- __GridSearchCV__

In [16]:
parameters = {'n_estimators' : [13, 18, 43, 77, 45, 450]}
gs = GridSearchCV(reg, parameters)

In [17]:
grid_fit = gs.fit(X_train, y_train)
best_fit = grid_fit.best_estimator_
gs_preds = best_fit.predict(X_test)
print(gs_preds)

[[  5.97022222  10.09718889]
 [  6.08422222  28.33151111]
 [  5.948       39.12222222]
 [  5.9966       7.63327778]
 [  5.66266667  30.42911111]
 [  5.79095556   7.08177556]
 [  5.93377778  29.60466667]
 [  5.81888889 112.09111111]
 [  6.00177778  35.06591111]
 [  5.668       61.04755556]
 [  6.10255556  15.09117111]
 [  5.93397778   8.97686889]
 [  5.72711111  35.08109778]
 [  5.81177778  36.64755556]
 [  5.76755556  47.44148889]
 [  6.26577778  32.87893333]
 [  5.89644444  77.80948889]
 [  5.88044444  29.38484444]
 [  5.96682222  10.90902222]
 [  5.98422222  40.36586667]
 [  6.12566667   8.96824444]
 [  5.70024444   8.82429778]
 [  5.9408       9.63674667]
 [  5.93777778  35.93562222]
 [  5.86066667  31.53375556]
 [  5.94355556 118.67777778]
 [  5.89117778   5.90024667]
 [  6.016       37.59646222]
 [  5.89088889  39.45631111]
 [  5.68644444  55.14822222]
 [  5.77511111  32.39355556]
 [  5.82222222  53.63177778]
 [  5.80888889  37.01671111]
 [  5.76622222  39.28635556]
 [  5.63822222

In [18]:
gs_accuracy = best_fit.score(X_test, y_test)
print(gs_accuracy)

0.79318404374048
