In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost 
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation, tree, linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

data = pd.read_csv("C:/Users/x555l/Desktop/ML_ACA/database.csv")
print(data.shape)

# Check any number of columns with NaN
print(data.isnull().any().sum(), ' / ', len(data.columns))
# Check any number of data points with NaN
print(data.isnull().any(axis=1).sum(), ' / ', len(data))

data.dropna(axis = 1, how = 'all')
print(data.shape)

to_drop = ['Type','Depth Error','Magnitude Error','Magnitude Source','Status','Magnitude Type','ID','Source' ,'Location Source', 'Magnitude Source']
data.drop(to_drop, inplace = True, axis = 1)

data.fillna(data.mean(), inplace = True)

data['Date'] = pd.to_datetime(data['Date'])
data['Time'] = pd.to_datetime(data['Time'])

data['Date'] = pd.to_numeric(data['Date'])
data['Time'] = pd.to_numeric(data['Time'])


print(data.isnull().any(axis=1).sum(), ' / ', len(data))
data.head()

(23412, 21)
9  /  21
23398  /  23412
(23412, 21)
0  /  23412


Unnamed: 0,Date,Time,Latitude,Longitude,Depth,Depth Seismic Stations,Magnitude,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square
0,-157680000000000000,1533908658000000000,19.246,145.616,131.6,275.364098,6.0,48.944618,44.163532,3.99266,7.662759,1.022784
1,-157507200000000000,1533900589000000000,1.863,127.352,80.0,275.364098,5.8,48.944618,44.163532,3.99266,7.662759,1.022784
2,-157420800000000000,1533924358000000000,-20.579,-173.972,20.0,275.364098,6.2,48.944618,44.163532,3.99266,7.662759,1.022784
3,-157161600000000000,1533926983000000000,-59.076,-23.557,15.0,275.364098,5.8,48.944618,44.163532,3.99266,7.662759,1.022784
4,-157075200000000000,1533907970000000000,11.938,126.427,15.0,275.364098,5.8,48.944618,44.163532,3.99266,7.662759,1.022784


In [55]:
features = data.iloc[:,:6].columns.tolist() + data.iloc[:,7:].columns.tolist()
target = data.iloc[:,6].name
target


'Magnitude'

In [56]:
correlations = {}
for f in features:
    data_temp = data[[f,target]]
    a = data_temp[f].values.astype(int)
    b = data_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(a,b)[0]

In [57]:
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]

Unnamed: 0,Value
Depth Seismic Stations vs Magnitude,0.243903
Azimuthal Gap vs Magnitude,-0.132397
Root Mean Square vs Magnitude,0.072973
Longitude vs Magnitude,0.038605
Latitude vs Magnitude,0.035081
Horizontal Distance vs Magnitude,-0.024818
Depth vs Magnitude,0.023503
Horizontal Error vs Magnitude,-0.018137
Date vs Magnitude,0.006152
Time vs Magnitude,-0.006134


In [58]:
new_data = data[features]

In [59]:
X = new_data.values
y = data[target].values
y.shape

(23412,)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)




# Linear Regression


In [61]:
regr = LinearRegression()
regr.fit(X_train, y_train)
print(np.mean((regr.predict(X_test) - y_test) ** 2))
-cross_val_score(regr,X_test,y_test,scoring='neg_mean_squared_error')

0.18333102349945948


array([0.17481495, 0.19284125, 0.18376666])

# Tree Classifier


In [62]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor


In [63]:
clf = DecisionTreeRegressor()
clf.fit(X_train,y_train)
print(np.mean((clf.predict(X_test) - y_test) ** 2))
-cross_val_score(clf,X_test,y_test,scoring='neg_mean_squared_error')

0.28743333333333343


array([0.28684004, 0.31202229, 0.31345657])

# XGB


In [64]:
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

traindf, testdf = train_test_split(X_train, test_size = 0.3)
xgb.fit(X_train,y_train)

predictions = xgb.predict(X_test)
np.mean((predictions - y_test) ** 2)
-cross_val_score(xgb,X_test,y_test,scoring='neg_mean_squared_error')

array([0.15427853, 0.16177469, 0.14847768])