In [7]:
import pandas as pd

In [8]:
df=pd.read_csv('Earthquakes Dataset 1900-2023.csv')

### Data Exploration

In [9]:
df.shape

(37331, 23)

In [10]:
df.columns

Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType',
       'nst', 'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Unnamed: 14',
       'Type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37331 entries, 0 to 37330
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Time             37331 non-null  object 
 1   Place            37047 non-null  object 
 2   Latitude         37331 non-null  float64
 3   Longitude        37331 non-null  float64
 4   Depth            37197 non-null  float64
 5   Mag              37331 non-null  float64
 6   MagType          37331 non-null  object 
 7   nst              7473 non-null   float64
 8   gap              10087 non-null  float64
 9   dmin             4395 non-null   float64
 10  rms              20218 non-null  float64
 11  net              37331 non-null  object 
 12  ID               37331 non-null  object 
 13  Updated          37331 non-null  object 
 14  Unnamed: 14      0 non-null      float64
 15  Type             37331 non-null  object 
 16  horizontalError  3970 non-null   float64
 17  depthError  

In [12]:
df.isnull().sum()

Time                   0
Place                284
Latitude               0
Longitude              0
Depth                134
Mag                    0
MagType                0
nst                29858
gap                27244
dmin               32936
rms                17113
net                    0
ID                     0
Updated                0
Unnamed: 14        37331
Type                   0
horizontalError    33361
depthError         16504
magError           20780
magNst             31959
status                 0
locationSource         0
magSource              0
dtype: int64

In [13]:
df.duplicated().sum()

0

### Data Cleaning


In [14]:
# The dataset has an empty column "Unnamed: 14"
# Deleting all the redundant columns that do not have an affect on our analysis 

In [15]:
df['Unnamed: 14'].value_counts()

Series([], Name: count, dtype: int64)

In [16]:
df.drop(['Unnamed: 14','horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],axis=1,inplace=True)


In [17]:
df.columns

Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType',
       'nst', 'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Type'],
      dtype='object')

In [18]:
df.Place

0              130 km SW of Tual, Indonesia
1             7 km SW of Port-Olry, Vanuatu
2               Masbate region, Philippines
3           54 km WNW of Otaki, New Zealand
4               2 km NW of Lele?ti, Romania
                        ...                
37326        113 km ESE of Nikolski, Alaska
37327         221 km SW of Nikolski, Alaska
37328                       south of Alaska
37329    12 km NNW of Parkfield, California
37330        16 km SW of Old Harbor, Alaska
Name: Place, Length: 37331, dtype: object

In [19]:
df['Time']

0        2023-02-17T09:37:34.868Z
1        2023-02-16T05:37:05.138Z
2        2023-02-15T18:10:10.060Z
3        2023-02-15T06:38:09.034Z
4        2023-02-14T13:16:51.072Z
                   ...           
37326    1902-01-01T05:20:30.000Z
37327    1901-12-31T09:02:30.000Z
37328    1901-12-30T22:34:00.000Z
37329    1901-03-03T07:45:00.000Z
37330    1900-10-09T12:25:00.000Z
Name: Time, Length: 37331, dtype: object

In [20]:
df["Time"] = pd.to_datetime(df["Time"]).dt.strftime('%Y-%m-%d %H:%M:%S') 


In [21]:
df["Time"]

0        2023-02-17 09:37:34
1        2023-02-16 05:37:05
2        2023-02-15 18:10:10
3        2023-02-15 06:38:09
4        2023-02-14 13:16:51
                ...         
37326    1902-01-01 05:20:30
37327    1901-12-31 09:02:30
37328    1901-12-30 22:34:00
37329    1901-03-03 07:45:00
37330    1900-10-09 12:25:00
Name: Time, Length: 37331, dtype: object

In [22]:
df.dtypes

Time          object
Place         object
Latitude     float64
Longitude    float64
Depth        float64
Mag          float64
MagType       object
nst          float64
gap          float64
dmin         float64
rms          float64
net           object
ID            object
Updated       object
Type          object
dtype: object

In [23]:
df['Place'] = df['Place'].astype(str)

In [24]:
## Splitting the columns to form new columns with the desired data

In [25]:
for index, row in df.iterrows():
    df.loc[index, 'Date'] = row['Time'].split()[0]
    df.loc[index, 'time'] = row['Time'].split()[-1]
    df.loc[index, 'country'] = row['Place'].split()[-1]

In [26]:
df.country.unique

<bound method Series.unique of 0          Indonesia
1            Vanuatu
2        Philippines
3            Zealand
4            Romania
            ...     
37326         Alaska
37327         Alaska
37328         Alaska
37329     California
37330         Alaska
Name: country, Length: 37331, dtype: object>

In [27]:
df['country'] = df['country'].replace('Zealand', 'New Zealand')

In [28]:
df.country

0          Indonesia
1            Vanuatu
2        Philippines
3        New Zealand
4            Romania
            ...     
37326         Alaska
37327         Alaska
37328         Alaska
37329     California
37330         Alaska
Name: country, Length: 37331, dtype: object

In [29]:
# Deleting some more columns

In [30]:
df.drop(['Time'],axis=1,inplace=True)

In [31]:
df.head()

Unnamed: 0,Place,Latitude,Longitude,Depth,Mag,MagType,nst,gap,dmin,rms,net,ID,Updated,Type,Date,time,country
0,"130 km SW of Tual, Indonesia",-6.5986,132.0763,38.615,6.1,mww,119.0,51.0,2.988,0.76,us,us6000jpl7,2023-02-17T17:58:24.040Z,earthquake,2023-02-17,09:37:34,Indonesia
1,"7 km SW of Port-Olry, Vanuatu",-15.0912,167.0294,36.029,5.6,mww,81.0,26.0,0.392,0.94,us,us6000jpb1,2023-02-17T05:41:32.448Z,earthquake,2023-02-16,05:37:05,Vanuatu
2,"Masbate region, Philippines",12.3238,123.8662,20.088,6.1,mww,148.0,47.0,5.487,0.54,us,us6000jp76,2023-02-16T20:12:32.595Z,earthquake,2023-02-15,18:10:10,Philippines
3,"54 km WNW of Otaki, New Zealand",-40.5465,174.5709,74.32,5.7,mww,81.0,40.0,0.768,1.15,us,us6000jp1g,2023-02-16T06:42:09.738Z,earthquake,2023-02-15,06:38:09,New Zealand
4,"2 km NW of Lele?ti, Romania",45.1126,23.1781,10.0,5.6,mww,132.0,28.0,1.197,0.4,us,us6000jnqz,2023-02-17T09:15:18.586Z,earthquake,2023-02-14,13:16:51,Romania


In [32]:
## Summary Statistics of our data 

In [33]:
df.describe()

Unnamed: 0,Latitude,Longitude,Depth,Mag,nst,gap,dmin,rms
count,37331.0,37331.0,37197.0,37331.0,7473.0,10087.0,4395.0,20218.0
mean,5.457651,38.877695,58.583346,5.949996,265.481065,45.014891,4.315178,1.000779
std,30.789822,123.090934,109.5634,0.456006,161.982149,34.311032,5.480411,0.356822
min,-77.08,-179.997,-4.0,5.5,0.0,8.0,0.004505,0.005
25%,-16.5198,-75.807,15.0,5.6,134.0,24.1,1.155,0.89
50%,1.153,98.577,28.5,5.8,241.0,36.0,2.509,1.0
75%,33.786,143.34785,41.0,6.1,372.0,54.8,5.1275,1.11
max,87.199,180.0,700.0,9.5,934.0,360.0,39.73,42.41


### Exporting our cleaned data into a .csv file 

In [34]:
df.to_csv('Cleaned_Earthquakes data 1900-2023.csv',index=False)

In [35]:
df1=pd.read_csv('Cleaned_Earthquakes data 1900-2023.csv')

In [36]:
df1.shape

(37331, 17)

In [37]:
df1.columns

Index(['Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Type', 'Date', 'time',
       'country'],
      dtype='object')