# HOMEWORK 1 - Earthquakes

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

## Cleaning

In [2]:
# Let us read the dataset and see overall appearance of it
earthquakes = pd.read_csv("database.csv")
earthquakes.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [3]:
# Keep only those rows which type is Earthquake 
earthquakes = earthquakes[earthquakes["Type"] == "Earthquake"]

In [4]:
# See for each column which percentage is NA
all_data_na = (earthquakes.isnull().sum() / len(earthquakes)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(100)

Unnamed: 0,Missing Ratio
Magnitude Error,98.644112
Horizontal Error,95.075758
Horizontal Distance,93.13447
Magnitude Seismic Stations,89.411157
Depth Error,80.84969
Depth Seismic Stations,69.507576
Azimuthal Gap,68.638085
Root Mean Square,25.76188
Magnitude Type,0.012913


In [5]:
earthquakes.drop(["Magnitude Error", "Horizontal Error", "Horizontal Distance", 
           "Magnitude Seismic Stations", "Depth Error", "Depth Seismic Stations", 
           "Azimuthal Gap", "ID", "Source", "Location Source", "Magnitude Source", "Type", "Date", "Time"], inplace=True, axis = 1)

In [6]:
# we have only three rows for which 
# Magnitude Type is NA, we can just drop that rows
earthquakes = earthquakes[pd.notnull(earthquakes['Magnitude Type'])]

In [7]:
# categorical features
status = pd.get_dummies(earthquakes.Status)
earthquakes = earthquakes.drop(["Status"], axis = 1).join(status)

mgn_type = pd.get_dummies(earthquakes["Magnitude Type"])
earthquakes = earthquakes.drop(["Magnitude Type"], axis = 1).join(mgn_type)

In [8]:
earthquakes["Root Mean Square"][earthquakes["Root Mean Square"].isna()] = 2
earthquakes.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Root Mean Square,Automatic,Reviewed,MB,MD,MH,ML,MS,MW,MWB,MWC,MWR,MWW
0,19.246,145.616,131.6,6.0,2.0,1,0,0,0,0,0,0,1,0,0,0,0
1,1.863,127.352,80.0,5.8,2.0,1,0,0,0,0,0,0,1,0,0,0,0
2,-20.579,-173.972,20.0,6.2,2.0,1,0,0,0,0,0,0,1,0,0,0,0
3,-59.076,-23.557,15.0,5.8,2.0,1,0,0,0,0,0,0,1,0,0,0,0
4,11.938,126.427,15.0,5.8,2.0,1,0,0,0,0,0,0,1,0,0,0,0


In [9]:
# specify target and values
y = earthquakes["Magnitude"]
X = earthquakes.drop(["Magnitude"], axis = 1)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=1)


## Training

#### Linear Regression

In [16]:
lrg = LinearRegression()
lrg.fit(X_train,y_train)
y_train_pred = lrg.predict(X_train)
y_test_pred = lrg.predict(X_test)

print("Score for train dataset:", lrg.score(X_train, y_train) )
print("Score for test dataset:", lrg.score(X_test, y_test) )

print("\nMSE for train dataset:", mean_squared_error(y_train, y_train_pred))
print("MSE for test dataset:", mean_squared_error(y_test, y_test_pred))


Score for train dataset: 0.07188044284447914
Score for test dataset: 0.06862559668818924

MSE for train dataset: 0.165202137826119
MSE for test dataset: 0.1724879827328813


#### Random Forest

In [17]:
estimator = RandomForestRegressor()
param_grid = {
    'max_depth': [10, 15, 18],
    'min_samples_leaf': [2, 3, 4] 
}

gbm_rfr = GridSearchCV(estimator, param_grid)
gbm_rfr.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm_rfr.best_params_)

y_train_pred = gbm_rfr.predict(X_train)
y_test_pred = gbm_rfr.predict(X_test)

print("\nScore for train dataset:", gbm_rfr.score(X_train, y_train) )
print("Score for test dataset:", gbm_rfr.score(X_test, y_test) )

print("\nMSE for train dataset:", mean_squared_error(y_train, y_train_pred))
print("MSE for test dataset:", mean_squared_error(y_test, y_test_pred))


Best parameters found by grid search are: {'max_depth': 10, 'min_samples_leaf': 4}

Score for train dataset: 0.24812502659848323
Score for test dataset: 0.11291108806958061

MSE for train dataset: 0.13383119882159036
MSE for test dataset: 0.16428643129927029


#### LightGBM Regressor

In [18]:
estimator = lgb.LGBMRegressor(metric = 'l2', objective = "regression")

param_grid = {
    'max_depth': [8, 10],
    'lambda': [0.01, 0.1, 0.5],
    'learning rate': [0.003, 0.008, 0.01],
    'n_estimators': [10, 20]
}
gbm_lgb = GridSearchCV(estimator, param_grid)

gbm_lgb.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm_lgb.best_params_)

lgbr = gbm_lgb.fit(X_train, y_train)

y_train_pred = gbm_lgb.predict(X_train)
y_test_pred = gbm_lgb.predict(X_test)

print("\nMSE for train dataset:", mean_squared_error(y_train, y_train_pred))
print("MSE for test dataset:", mean_squared_error(y_test, y_test_pred))


Best parameters found by grid search are: {'lambda': 0.01, 'learning rate': 0.003, 'max_depth': 10, 'n_estimators': 20}

MSE for train dataset: 0.15172173757474167
MSE for test dataset: 0.16428834912376955
