# Import libraries 📚

In [1]:
# for modifying and manipulating the data
import pandas as pd 
import numpy as np
import glob

pd.set_option('display.max_columns', None)

# for visualization purposes
import matplotlib.pyplot as plt
import seaborn as sns

# Earthquakes

In [2]:
#collect all the data in the earthquakes file into one dataframe

path = r'C:\Users\ibrah\Downloads\PR\earthquakes' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

earth = pd.concat(li, axis=0, ignore_index=True)

In [3]:
earth.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2020-05-02T23:49:13.744Z,43.1532,17.9828,10.0,3.3,ml,,43.0,1.186,0.58,us,us700099gx,2020-07-18T21:57:32.040Z,"7 km NNE of Stolac, Bosnia and Herzegovina",earthquake,5.1,2.0,0.056,42.0,reviewed,us,us
1,2020-05-02T23:44:37.050Z,32.755333,-115.430667,16.91,3.58,ml,82.0,36.0,0.05166,0.3,ci,ci38477458,2020-07-18T21:57:14.040Z,"8km SW of Holtville, CA",earthquake,0.22,0.41,0.123,179.0,reviewed,ci,ci
2,2020-05-02T23:35:46.541Z,-17.7025,167.3018,10.0,4.6,mb,,151.0,2.244,0.91,us,us70009bsc,2020-07-18T21:57:44.040Z,"107 km W of Port-Vila, Vanuatu",earthquake,5.8,1.9,0.102,29.0,reviewed,us,us
3,2020-05-02T23:26:49.070Z,17.9413,-66.6436,7.0,2.73,md,20.0,188.0,0.1082,0.16,pr,pr2020123043,2020-05-04T02:02:14.448Z,"7 km SSW of Ponce, Puerto Rico",earthquake,0.55,0.28,0.15,12.0,reviewed,pr,pr
4,2020-05-02T22:24:49.290Z,19.3091,-67.389,51.0,3.55,md,22.0,291.0,0.8863,0.33,pr,pr2020123028,2020-07-18T21:57:14.040Z,"95 km NNW of San Antonio, Puerto Rico",earthquake,2.09,7.13,0.12,18.0,reviewed,pr,pr


Get the shape of the data

In [4]:
earth.shape

(207058, 22)

Getting more info about the data

In [5]:
earth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207058 entries, 0 to 207057
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   time             207058 non-null  object 
 1   latitude         207058 non-null  float64
 2   longitude        207058 non-null  float64
 3   depth            207058 non-null  float64
 4   mag              207058 non-null  float64
 5   magType          207057 non-null  object 
 6   nst              53767 non-null   float64
 7   gap              185764 non-null  float64
 8   dmin             177065 non-null  float64
 9   rms              207045 non-null  float64
 10  net              207058 non-null  object 
 11  id               207058 non-null  object 
 12  updated          207058 non-null  object 
 13  place            207035 non-null  object 
 14  type             207058 non-null  object 
 15  horizontalError  188158 non-null  float64
 16  depthError       207043 non-null  floa

Convert time to column into time dataframe

In [6]:
earth['time'] = pd.to_datetime(earth['time'])

Sort the data by the time

In [7]:
earth=earth.sort_values(by='time')
earth.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
33613,2014-10-02 00:13:39.470000+00:00,-21.1,-174.4863,10.0,5.0,mb,,76.0,17.764,0.42,us,usb000si16,2014-12-23T01:41:57.040Z,"54 km ENE of ‘Ohonua, Tonga",earthquake,10.3,1.7,0.055,111.0,reviewed,us,us
33612,2014-10-02 00:37:01.600000+00:00,64.5978,-17.6197,8.27,4.9,mb,,63.0,1.601,0.45,us,usb000si1b,2014-12-23T01:41:57.040Z,"122 km WNW of Höfn, Iceland",earthquake,6.7,3.8,0.045,163.0,reviewed,us,us
33611,2014-10-02 00:53:40.180000+00:00,-24.2301,-179.8368,512.78,4.2,mb,,104.0,5.277,0.8,us,usb000slzq,2014-12-23T01:41:57.040Z,south of the Fiji Islands,earthquake,15.4,9.5,0.112,23.0,reviewed,us,us
33610,2014-10-02 01:20:00.100000+00:00,36.1799,-97.2545,5.0,2.8,ml,,57.0,,0.45,us,usb000si1g,2014-12-23T01:41:57.040Z,"11 km ENE of Orlando, Oklahoma",earthquake,1.4,2.0,,,reviewed,tul,tul
33609,2014-10-02 01:27:22.060000+00:00,-24.5057,-179.9794,524.26,4.5,mb,,118.0,6.973,0.76,us,usb000si24,2014-12-23T01:41:57.040Z,south of the Fiji Islands,earthquake,13.3,8.1,0.098,32.0,reviewed,us,us


Investigating the NaNs

In [8]:
earth.isnull().sum()

time                    0
latitude                0
longitude               0
depth                   0
mag                     0
magType                 1
nst                153291
gap                 21294
dmin                29993
rms                    13
net                     0
id                      0
updated                 0
place                  23
type                    0
horizontalError     18900
depthError             15
magError            33092
magNst              30384
status                  0
locationSource          0
magSource               0
dtype: int64

As shown above columns [nst - gap - dmin - horizontalError - magError - magNst] will be dropped because of the Nan values

In [9]:
earth.drop(['nst','gap','dmin','horizontalError','magError','magNst'],axis=1,inplace=True)

In [10]:
earth.isnull().sum()

time               0
latitude           0
longitude          0
depth              0
mag                0
magType            1
rms               13
net                0
id                 0
updated            0
place             23
type               0
depthError        15
status             0
locationSource     0
magSource          0
dtype: int64

Drop Nan Values

In [11]:
earth.dropna(inplace=True)

In [12]:
earth.isnull().sum()

time              0
latitude          0
longitude         0
depth             0
mag               0
magType           0
rms               0
net               0
id                0
updated           0
place             0
type              0
depthError        0
status            0
locationSource    0
magSource         0
dtype: int64

Check the duplicates

In [13]:
earth.duplicated().sum()

1257

Drop Duplicates

In [14]:
earth.drop_duplicates(inplace=True)

In [15]:
earth.duplicated().sum()

0

Get Data info

In [16]:
earth.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205757 entries, 33613 to 179784
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   time            205757 non-null  datetime64[ns, UTC]
 1   latitude        205757 non-null  float64            
 2   longitude       205757 non-null  float64            
 3   depth           205757 non-null  float64            
 4   mag             205757 non-null  float64            
 5   magType         205757 non-null  object             
 6   rms             205757 non-null  float64            
 7   net             205757 non-null  object             
 8   id              205757 non-null  object             
 9   updated         205757 non-null  object             
 10  place           205757 non-null  object             
 11  type            205757 non-null  object             
 12  depthError      205757 non-null  float64            
 13  status    

In [17]:
earth.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,rms,net,id,updated,place,type,depthError,status,locationSource,magSource
33613,2014-10-02 00:13:39.470000+00:00,-21.1,-174.4863,10.0,5.0,mb,0.42,us,usb000si16,2014-12-23T01:41:57.040Z,"54 km ENE of ‘Ohonua, Tonga",earthquake,1.7,reviewed,us,us
33612,2014-10-02 00:37:01.600000+00:00,64.5978,-17.6197,8.27,4.9,mb,0.45,us,usb000si1b,2014-12-23T01:41:57.040Z,"122 km WNW of Höfn, Iceland",earthquake,3.8,reviewed,us,us
33611,2014-10-02 00:53:40.180000+00:00,-24.2301,-179.8368,512.78,4.2,mb,0.8,us,usb000slzq,2014-12-23T01:41:57.040Z,south of the Fiji Islands,earthquake,9.5,reviewed,us,us
33610,2014-10-02 01:20:00.100000+00:00,36.1799,-97.2545,5.0,2.8,ml,0.45,us,usb000si1g,2014-12-23T01:41:57.040Z,"11 km ENE of Orlando, Oklahoma",earthquake,2.0,reviewed,tul,tul
33609,2014-10-02 01:27:22.060000+00:00,-24.5057,-179.9794,524.26,4.5,mb,0.76,us,usb000si24,2014-12-23T01:41:57.040Z,south of the Fiji Islands,earthquake,8.1,reviewed,us,us


According to our Goals there is columns [Time - id - updated - status - locationSource	- magSource] will be dropped

In [18]:
earth.drop(['time','updated','id','status','locationSource','magSource'],axis=1,inplace=True)

Trying to convert (latitude - longitude ) to int to be able to merge with rain dataframe according to (latitude , longitude )

In [19]:
earth['latitude'] = round(earth['latitude'])
earth['longitude'] = round(earth['longitude'])

Try to merge (latitude - longitude) as pair of coordinates

In [20]:
earth['Coordinates'] = earth['latitude'].astype('str') + ',' +earth['longitude'].astype('str')

Display the Data

In [21]:
earth.head()

Unnamed: 0,latitude,longitude,depth,mag,magType,rms,net,place,type,depthError,Coordinates
33613,-21.0,-174.0,10.0,5.0,mb,0.42,us,"54 km ENE of ‘Ohonua, Tonga",earthquake,1.7,"-21.0,-174.0"
33612,65.0,-18.0,8.27,4.9,mb,0.45,us,"122 km WNW of Höfn, Iceland",earthquake,3.8,"65.0,-18.0"
33611,-24.0,-180.0,512.78,4.2,mb,0.8,us,south of the Fiji Islands,earthquake,9.5,"-24.0,-180.0"
33610,36.0,-97.0,5.0,2.8,ml,0.45,us,"11 km ENE of Orlando, Oklahoma",earthquake,2.0,"36.0,-97.0"
33609,-25.0,-180.0,524.26,4.5,mb,0.76,us,south of the Fiji Islands,earthquake,8.1,"-25.0,-180.0"


# -----------------------------------------------------------------------------------------------------------

# Rain

In [22]:
rain = pd.read_csv('rain data.csv')
rain.head()

Unnamed: 0,OBJECTID,landslide_trigger,event_date,latitude,longitude,landslide_size
0,5689756,rain,2008-08-01 00:00:00,32.5625,107.45,large
1,5689757,downpour,2009-01-02 02:00:00,45.42,-122.663,small
2,5689758,downpour,2007-01-19 00:00:00,-11.1295,-75.3587,large
3,5689761,downpour,2012-02-16 00:00:00,10.7004,124.9668,medium
4,5689762,downpour,2012-03-30 00:00:00,48.2797,-117.2665,small


Get the shape of the data

In [23]:
rain.shape

(9675, 6)

Getting more info about the data

In [24]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9675 entries, 0 to 9674
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OBJECTID           9675 non-null   int64  
 1   landslide_trigger  9674 non-null   object 
 2   event_date         9471 non-null   object 
 3   latitude           9675 non-null   float64
 4   longitude          9675 non-null   float64
 5   landslide_size     9252 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 453.6+ KB


Investigating the NaNs

In [25]:
rain.isnull().sum()

OBJECTID               0
landslide_trigger      1
event_date           204
latitude               0
longitude              0
landslide_size       423
dtype: int64

Drop Nans Value

In [26]:
rain.dropna(inplace=True)

In [27]:
rain.isnull().sum()

OBJECTID             0
landslide_trigger    0
event_date           0
latitude             0
longitude            0
landslide_size       0
dtype: int64

Checking the duplicates

In [28]:
rain.duplicated().sum()

0

In [29]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9063 entries, 0 to 9672
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OBJECTID           9063 non-null   int64  
 1   landslide_trigger  9063 non-null   object 
 2   event_date         9063 non-null   object 
 3   latitude           9063 non-null   float64
 4   longitude          9063 non-null   float64
 5   landslide_size     9063 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 495.6+ KB


As shown above the columns that dosen't support our goal [event_date - OBJECTID ] will be dropped

In [30]:
rain.drop(['event_date','OBJECTID'],axis=1,inplace=True)

Trying to convert (latitude - longitude ) to int to be able to merge with rain dataframe according to (latitude , longitude )

In [31]:
rain['latitude'] = round(rain['latitude'])
rain['longitude'] = round(rain['longitude'])

Try to merge (latitude - longitude) as pair of coordinates

In [32]:
rain['Coordinates'] = rain['latitude'].astype('str') + ',' + rain['longitude'].astype('str')

In [33]:
rain.head()

Unnamed: 0,landslide_trigger,latitude,longitude,landslide_size,Coordinates
0,rain,33.0,107.0,large,"33.0,107.0"
1,downpour,45.0,-123.0,small,"45.0,-123.0"
2,downpour,-11.0,-75.0,large,"-11.0,-75.0"
3,downpour,11.0,125.0,medium,"11.0,125.0"
4,downpour,48.0,-117.0,small,"48.0,-117.0"


# -----------------------------------------------------------------------------------------------------------

Merging the Two datasets into Final_Data according to (latitude - longitude) Coordinates

In [34]:
Final_Data = pd.merge(rain, earth, left_on='Coordinates',right_on='Coordinates',how='inner').drop(columns='Coordinates')

Display the Final_Data

In [35]:
Final_Data.head()

Unnamed: 0,landslide_trigger,latitude_x,longitude_x,landslide_size,latitude_y,longitude_y,depth,mag,magType,rms,net,place,type,depthError
0,downpour,45.0,-123.0,small,45.0,-123.0,51.602,3.13,ml,0.1,uw,"24 km WNW of Gaston, Oregon",earthquake,0.68
1,downpour,45.0,-123.0,small,45.0,-123.0,26.65,3.03,ml,0.2,uw,Washington-Oregon border region,earthquake,0.41
2,downpour,45.0,-123.0,small,45.0,-123.0,23.7,3.03,ml,0.24,uw,"4 km WNW of Woodburn, Oregon",earthquake,0.96
3,downpour,45.0,-123.0,small,45.0,-123.0,22.48,2.96,ml,0.22,uw,"9 km NW of Keizer, Oregon",earthquake,0.98
4,downpour,45.0,-123.0,small,45.0,-123.0,17.37,3.96,ml,0.19,uw,"5 km E of Scotts Mills, Oregon",earthquake,0.82


Get the shape

In [36]:
Final_Data.shape

(238658, 14)

Checking the Nans Values

In [37]:
Final_Data.isnull().sum()

landslide_trigger    0
latitude_x           0
longitude_x          0
landslide_size       0
latitude_y           0
longitude_y          0
depth                0
mag                  0
magType              0
rms                  0
net                  0
place                0
type                 0
depthError           0
dtype: int64

Checking the duplicates

In [38]:
Final_Data.duplicated().sum()

149010

Drop the duplicates

In [63]:
Final_Data.drop_duplicates(inplace=True)

In [40]:
Final_Data.duplicated().sum()

0

Change some names of columns to be understandable

In [41]:
Final_Data = Final_Data.rename(columns={'landslide_trigger': 'rain intensity','mag' : 'earthquakes magnitude'})

Getting more info about the data

In [42]:
Final_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89648 entries, 0 to 238657
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   rain intensity         89648 non-null  object 
 1   latitude_x             89648 non-null  float64
 2   longitude_x            89648 non-null  float64
 3   landslide_size         89648 non-null  object 
 4   latitude_y             89648 non-null  float64
 5   longitude_y            89648 non-null  float64
 6   depth                  89648 non-null  float64
 7   earthquakes magnitude  89648 non-null  float64
 8   magType                89648 non-null  object 
 9   rms                    89648 non-null  float64
 10  net                    89648 non-null  object 
 11  place                  89648 non-null  object 
 12  type                   89648 non-null  object 
 13  depthError             89648 non-null  float64
dtypes: float64(8), object(6)
memory usage: 10.3+ MB


selectting the desird features

In [43]:
Final_Data = Final_Data.iloc[:,np.r_[0,1,2,3,6,7,9]]

In [44]:
Final_Data.head()

Unnamed: 0,rain intensity,latitude_x,longitude_x,landslide_size,depth,earthquakes magnitude,rms
0,downpour,45.0,-123.0,small,51.602,3.13,0.1
1,downpour,45.0,-123.0,small,26.65,3.03,0.2
2,downpour,45.0,-123.0,small,23.7,3.03,0.24
3,downpour,45.0,-123.0,small,22.48,2.96,0.22
4,downpour,45.0,-123.0,small,17.37,3.96,0.19


Getting the one-hot encoding to the categorical features

In [45]:
Final_Data = pd.get_dummies(Final_Data,drop_first=True)

In [46]:
Final_Data.head()

Unnamed: 0,latitude_x,longitude_x,depth,earthquakes magnitude,rms,rain intensity_downpour,rain intensity_rain,rain intensity_snowfall_snowmelt,landslide_size_large,landslide_size_medium,landslide_size_small,landslide_size_unknown,landslide_size_very_large
0,45.0,-123.0,51.602,3.13,0.1,1,0,0,0,0,1,0,0
1,45.0,-123.0,26.65,3.03,0.2,1,0,0,0,0,1,0,0
2,45.0,-123.0,23.7,3.03,0.24,1,0,0,0,0,1,0,0
3,45.0,-123.0,22.48,2.96,0.22,1,0,0,0,0,1,0,0
4,45.0,-123.0,17.37,3.96,0.19,1,0,0,0,0,1,0,0


the minimum ratio of earthquakes magnitude is 2.12 after searching we find that when the earthquakes magnitude is equal to or lower than 3.1 the earthquakes don't cause landslide

In [47]:
Final_Data['landslide'] = np.where(Final_Data['earthquakes magnitude'] < 3.1,0,1)

In [48]:
Final_Data['landslide'].value_counts()

1    46895
0    42753
Name: landslide, dtype: int64

In [49]:
Final_Data.head()

Unnamed: 0,latitude_x,longitude_x,depth,earthquakes magnitude,rms,rain intensity_downpour,rain intensity_rain,rain intensity_snowfall_snowmelt,landslide_size_large,landslide_size_medium,landslide_size_small,landslide_size_unknown,landslide_size_very_large,landslide
0,45.0,-123.0,51.602,3.13,0.1,1,0,0,0,0,1,0,0,1
1,45.0,-123.0,26.65,3.03,0.2,1,0,0,0,0,1,0,0,0
2,45.0,-123.0,23.7,3.03,0.24,1,0,0,0,0,1,0,0,0
3,45.0,-123.0,22.48,2.96,0.22,1,0,0,0,0,1,0,0,0
4,45.0,-123.0,17.37,3.96,0.19,1,0,0,0,0,1,0,0,1


In [50]:
new_data = Final_Data.drop('earthquakes magnitude',axis=1)

# Model

- define the inputs and the outputs

In [51]:
x = new_data.iloc[:,:-1]
y = new_data.iloc[:,-1]

- split the data into train and test data

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

> LGB Model

In [53]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

- Prediction

In [54]:
y_pred=clf.predict(X_test)

- Model Accuracy

In [55]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.8676


> Logistic Regression Model

In [56]:
# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression


# instantiate the model
logreg = LogisticRegression(solver='liblinear')


# fit the model
logreg.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

- Prediction

In [57]:
y_pred_test = logreg.predict(X_test)

- Model Accuracy

In [58]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))

Model accuracy score: 0.8416


> XGBoost Model

In [59]:
from xgboost import XGBClassifier

my_model = XGBClassifier()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

- Prediction

In [60]:
# make predictions
predictions = my_model.predict(X_test)

- Model Accuracy

In [61]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, predictions)))

Model accuracy score: 0.8789


# Conclusion

As we see the XGBoost Model has the best the accuracy (0.8789) 