In [232]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

In [233]:
# Load the earthquake catalogue; the relative path is resolved against the notebook's working directory
earthquake_df = pd.read_csv(filepath_or_buffer='earthquakes.csv')

In [234]:
# Preview the first five rows of the freshly loaded frame
earthquake_df.head(n=5)

Unnamed: 0,_EventID_,_Latitude_,_Longitude_,_DepthMeters_,_Contributor_,_MagType_,_Magnitude_,_EventLocationName_,_Time_,ts,potential_blast,bix_potential_blasts
0,usp000gyu4,39.644,-75.483,5000,NEIC,md,2.8,New Jersey,2009-07-01T13:44:43.380Z,2009/07/01 09:44:43-04,0,0
1,usp000gyxs,37.379,-80.746,11500,NEIC,mblg,2.8,West Virginia,2009-07-04T12:24:43.460Z,2009/07/04 08:24:43-04,0,0
2,usp000gz1d,45.296,-73.475,18000,NEIC,md,2.1,"southern Quebec, Canada",2009-07-08T10:22:11.430Z,2009/07/08 06:22:11-04,0,0
3,usp000h1sz,42.825,-78.239,5000,NEIC,md,2.4,New York,2009-09-23T03:45:59.900Z,2009/09/22 23:45:59-04,0,0
4,usp000hab0,38.619,-80.909,100,NEIC,mblg,3.4,West Virginia,2010-04-04T09:19:14.000Z,2010/04/04 05:19:14-04,0,0


In [235]:
# Preview the last five rows of the freshly loaded frame
earthquake_df.tail(n=5)

Unnamed: 0,_EventID_,_Latitude_,_Longitude_,_DepthMeters_,_Contributor_,_MagType_,_Magnitude_,_EventLocationName_,_Time_,ts,potential_blast,bix_potential_blasts
1642,5110375,37.9496,-81.3663,9500,ANF,ML,2.3,WEST VIRGINIA,2014-11-14T20:53:10,2014/11/14 15:53:10-05,1,1
1643,5110460,41.4521,-77.3709,11100,ANF,ML,2.1,PENNSYLVANIA,2014-11-25T16:26:59,2014/11/25 11:26:59-05,1,1
1644,5110483,41.0997,-78.367,3100,ANF,ML,2.1,PENNSYLVANIA,2014-11-28T16:14:11,2014/11/28 11:14:11-05,1,1
1645,5109389,43.7508,-74.1109,17000,ANF,ML,2.2,NEW YORK,2014-12-16T17:58:29,2014/12/16 12:58:29-05,1,1
1646,5111627,42.4991,-73.8542,18900,ANF,ML,2.2,NEW YORK,2015-01-02T18:08:29,2015/01/02 13:08:29-05,1,1


In [236]:
# Size of the frame as a (rows, columns) tuple
earthquake_df.shape

(1647, 12)

DataFrame has:
1,647 rows
12 columns

In [237]:
# Column labels exactly as they were read from the CSV
earthquake_df.columns

Index(['_EventID_', '_Latitude_', '_Longitude_', '_DepthMeters_',
       '_Contributor_', '_MagType_', '_Magnitude_', '_EventLocationName_',
       '_Time_', 'ts', 'potential_blast', 'bix_potential_blasts'],
      dtype='object')

In [238]:
# Swap the underscore-wrapped CSV headers for readable Title_Case names.
# NOTE(review): 'potential_blast' is renamed to 'Natural_Earthquake'; the two names
# read as opposites — confirm what a value of 1 in this flag actually means.
column_renames = {
    '_EventID_': 'Event_ID',
    '_Latitude_': 'Latitude',
    '_Longitude_': 'Longitude',
    '_DepthMeters_': 'Depth_In_Meters',
    '_Contributor_': 'Contributor',
    '_MagType_': 'Magnitude_Type',
    '_Magnitude_': 'Magnitude',
    '_EventLocationName_': 'Event_Location',
    '_Time_': 'Time',
    'ts': 'Time_Series',
    'potential_blast': 'Natural_Earthquake',
    'bix_potential_blasts': 'Bix_Potential_Blasts',
}
earthquake_df.rename(columns=column_renames, inplace=True)

In [239]:
# Preview the first five rows to confirm the new column names
earthquake_df.head(n=5)

Unnamed: 0,Event_ID,Latitude,Longitude,Depth_In_Meters,Contributor,Magnitude_Type,Magnitude,Event_Location,Time,Time_Series,Natural_Earthquake,Bix_Potential_Blasts
0,usp000gyu4,39.644,-75.483,5000,NEIC,md,2.8,New Jersey,2009-07-01T13:44:43.380Z,2009/07/01 09:44:43-04,0,0
1,usp000gyxs,37.379,-80.746,11500,NEIC,mblg,2.8,West Virginia,2009-07-04T12:24:43.460Z,2009/07/04 08:24:43-04,0,0
2,usp000gz1d,45.296,-73.475,18000,NEIC,md,2.1,"southern Quebec, Canada",2009-07-08T10:22:11.430Z,2009/07/08 06:22:11-04,0,0
3,usp000h1sz,42.825,-78.239,5000,NEIC,md,2.4,New York,2009-09-23T03:45:59.900Z,2009/09/22 23:45:59-04,0,0
4,usp000hab0,38.619,-80.909,100,NEIC,mblg,3.4,West Virginia,2010-04-04T09:19:14.000Z,2010/04/04 05:19:14-04,0,0


In [240]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1647 entries, 0 to 1646
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Event_ID              1647 non-null   object 
 1   Latitude              1647 non-null   float64
 2   Longitude             1647 non-null   float64
 3   Depth_In_Meters       1647 non-null   int64  
 4   Contributor           1647 non-null   object 
 5   Magnitude_Type        1628 non-null   object 
 6   Magnitude             1647 non-null   float64
 7   Event_Location        1647 non-null   object 
 8   Time                  1647 non-null   object 
 9   Time_Series           1647 non-null   object 
 10  Natural_Earthquake    1647 non-null   int64  
 11  Bix_Potential_Blasts  1647 non-null   int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 154.5+ KB


In [241]:
# Data type of every column (string-like columns show up as object)
earthquake_df.dtypes

Event_ID                 object
Latitude                float64
Longitude               float64
Depth_In_Meters           int64
Contributor              object
Magnitude_Type           object
Magnitude               float64
Event_Location           object
Time                     object
Time_Series              object
Natural_Earthquake        int64
Bix_Potential_Blasts      int64
dtype: object

In [242]:
# Break each timestamp at its 'T' separator: the part before it becomes Date,
# the part after it overwrites Time
date_and_time = earthquake_df['Time'].str.split(pat='T', expand=True)
earthquake_df[['Date', 'Time']] = date_and_time


In [243]:
# Move Date so that it sits immediately before Time
column_to_move = earthquake_df.pop('Date')
# Look up Time's current position instead of hard-coding index 8, so the cell
# keeps working if columns are added or removed upstream
earthquake_df.insert(earthquake_df.columns.get_loc('Time'), 'Date', column_to_move)

In [244]:
# TODO: convert the Date and Time columns from object (string) dtype to proper date/time dtypes — not implemented yet


In [245]:
# Re-check the column dtypes now that Date exists as its own column
earthquake_df.dtypes


Event_ID                 object
Latitude                float64
Longitude               float64
Depth_In_Meters           int64
Contributor              object
Magnitude_Type           object
Magnitude               float64
Event_Location           object
Date                     object
Time                     object
Time_Series              object
Natural_Earthquake        int64
Bix_Potential_Blasts      int64
dtype: object

In [246]:
# Preview the first five rows with Date and Time as separate columns
earthquake_df.head(n=5)

Unnamed: 0,Event_ID,Latitude,Longitude,Depth_In_Meters,Contributor,Magnitude_Type,Magnitude,Event_Location,Date,Time,Time_Series,Natural_Earthquake,Bix_Potential_Blasts
0,usp000gyu4,39.644,-75.483,5000,NEIC,md,2.8,New Jersey,2009-07-01,13:44:43.380Z,2009/07/01 09:44:43-04,0,0
1,usp000gyxs,37.379,-80.746,11500,NEIC,mblg,2.8,West Virginia,2009-07-04,12:24:43.460Z,2009/07/04 08:24:43-04,0,0
2,usp000gz1d,45.296,-73.475,18000,NEIC,md,2.1,"southern Quebec, Canada",2009-07-08,10:22:11.430Z,2009/07/08 06:22:11-04,0,0
3,usp000h1sz,42.825,-78.239,5000,NEIC,md,2.4,New York,2009-09-23,03:45:59.900Z,2009/09/22 23:45:59-04,0,0
4,usp000hab0,38.619,-80.909,100,NEIC,mblg,3.4,West Virginia,2010-04-04,09:19:14.000Z,2010/04/04 05:19:14-04,0,0


In [247]:
# Count the missing (NaN) entries in each column
earthquake_df.isna().sum()

Event_ID                 0
Latitude                 0
Longitude                0
Depth_In_Meters          0
Contributor              0
Magnitude_Type          19
Magnitude                0
Event_Location           0
Date                     0
Time                     0
Time_Series              0
Natural_Earthquake       0
Bix_Potential_Blasts     0
dtype: int64

19 missing values under Magnitude_Type

In [248]:
# Label events that have no recorded magnitude type as "Unknown" instead of leaving NaN
earthquake_df['Magnitude_Type'] = earthquake_df['Magnitude_Type'].fillna('Unknown')

In [249]:
# Recount the missing (NaN) entries per column after the fill
earthquake_df.isna().sum()

Event_ID                0
Latitude                0
Longitude               0
Depth_In_Meters         0
Contributor             0
Magnitude_Type          0
Magnitude               0
Event_Location          0
Date                    0
Time                    0
Time_Series             0
Natural_Earthquake      0
Bix_Potential_Blasts    0
dtype: int64

In [250]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
earthquake_df.describe()

Unnamed: 0,Latitude,Longitude,Depth_In_Meters,Magnitude,Natural_Earthquake,Bix_Potential_Blasts
count,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0
mean,39.903436,-79.008111,7561.554341,2.375823,0.621129,0.746812
std,2.410973,3.192442,5206.616854,0.454489,0.485253,0.43497
min,37.0077,-82.881,100.0,0.0,0.0,0.0
25%,37.9024,-81.69415,3450.0,2.1,0.0,0.0
50%,38.4778,-80.8314,6400.0,2.4,1.0,1.0
75%,41.4672,-75.93405,11500.0,2.6,1.0,1.0
max,45.3009,-73.0634,32300.0,5.2,1.0,1.0


In [251]:
# Lift the row limit so the printed listing is not truncated
# (this changes the pandas display option for the rest of the session)
pd.set_option('display.max_rows', None)

# Number of events recorded under each raw location label
locations = earthquake_df['Event_Location'].value_counts()
print(locations)

Event_Location
WEST VIRGINIA                                   842
NEW YORK                                        189
PENNSYLVANIA                                    172
New York                                        119
Pennsylvania                                     55
West Virginia                                    23
VIRGINIA                                         22
Ontario-Quebec border region, Canada             16
Ohio                                             14
New Jersey                                       12
Greater New York area, New Jersey                11
NEW JERSEY                                       10
SOUTHERN QUEBEC, CANADA                          10
OHIO                                              9
southern Quebec, Canada                           8
CHESAPEAKE BAY REGION                             7
Greater Philadelphia area, New Jersey             6
Youngstown-Akron urban area, Ohio                 6
Virginia                                         

In [252]:
# Compile raw location labels into one name per state / province

# Canonical name -> raw labels that should collapse into it.
# The dict's insertion order is the processing order, and it matters:
#   * "New Jersey" is handled before "New York" so that
#     "Greater New York area, New Jersey" is claimed by New Jersey first;
#   * "West Virginia" is handled before "Virginia" so that "WEST VIRGINIA" is
#     normalised before the bare "VIRGINIA" alias could rewrite part of it.
LOCATION_ALIASES = {
    'New Jersey': [
        "New Jersey", "NEW JERSEY", "Greater New York area, New Jersey", "Greater Philadelphia area, New Jersey",
        "1km SSW of Clifton, New Jersey", "5km N of Boonton, New Jersey", "3km WSW of Jersey City, New Jersey",
        "0km SW of Lindenwold, New Jersey", "1km NNE of Hillsdale, New Jersey", "1km N of Wanaque, New Jersey",
        "2km WNW of Fairfield, New Jersey", "8km W of Vista Center, New Jersey", "4km SW of Ringwood, New Jersey",
    ],
    'New York': [
        "New York", "NEW YORK", "41km S of Lake Placid, New York", "5km SSE of Attica, New York",
        "45km NW of Warrensburg, New York", "21km NNE of Medina, New York", "18km WNW of Malone, New York",
        "15km SW of Altamont, New York", "42km NNE of Lake Pleasant, New York", "9km NE of Dundee, New York",
        "9km NNW of Brownville, New York", "5km NE of Hadley, New York", "7km ENE of Massena, New York",
        "5km SE of Canton, New York", "18km NW of Champlain, New York", "9km WNW of Dannemora, New York",
        "26km SE of Malone, New York", "4km W of Altamont, New York", "19km N of Alexandria Bay, New York",
        "2km S of Heritage Hills, New York", "7km NE of Massena, New York", "9km NNE of Medina, New York",
        "21km NW of Warrensburg, New York", "6km W of South Lockport, New York", "27km NNE of Olcott, New York",
        "7km NNW of Brownville, New York", "19km NNW of Youngstown, New York", "35km E of Carthage, New York",
        "16km WSW of Peru, New York", "8km ENE of Broadalbin, New York", "1km ESE of Norwood, New York",
        "25km NNE of Utica, New York", "2km ESE of Highland Falls, New York", "4km WSW of Fonda, New York",
        "15km NW of Hadley, New York", "32km NW of Warrensburg, New York", "39km SE of Tupper Lake, New York",
        "7km SE of Dannemora, New York", "12km NNW of Tupper Lake, New York", "27km ENE of Carthage, New York",
        "25km N of Olcott, New York", "9km NE of Medina, New York", "13km W of Norwood, New York",
        "5km S of Lake Erie Beach, New York", "21km SSE of Hannawa Falls, New York", "1km SW of Rye Brook, New York",
        "11km W of Dannemora, New York", "31km NW of Warrensburg, New York", "24km WSW of Malone, New York",
    ],
    # NOTE(review): "Potomac-Shenandoah region" is a regional label assigned to a single state — confirm.
    'West Virginia': [
        "West Virginia", "WEST VIRGINIA", "13km SW of Glenville, West Virginia", "12km SSW of Glenville, West Virginia",
        "2km N of Alderson, West Virginia", "14km SW of Glenville, West Virginia",
        "13km NNE of Sissonville, West Virginia", "11km WSW of Sutton, West Virginia",
        "10km WNW of Sutton, West Virginia", "Potomac-Shenandoah region",
    ],
    'Pennsylvania': [
        "PENNSYLVANIA", "Pennsylvania", "Greater Philadelphia area, Pennsylvania",
        "2km SE of Guilford Siding, Pennsylvania", "1km SSW of Garden View, Pennsylvania",
        "8km NNE of Greenville, Pennsylvania", "2km SE of Conestoga, Pennsylvania",
        "3km WSW of Exton, Pennsylvania", "11km NNW of Susquehanna Trails, Pennsylvania",
    ],
    'Virginia': ["VIRGINIA", "Virginia", "11km E of Bland, Virginia"],
    # NOTE(review): the first alias is "Ohio " WITH a trailing space, kept exactly as written;
    # presumably it targets a raw label that carries trailing whitespace — confirm against the data.
    'Ohio': [
        "Ohio ", "OHIO", "Lake Erie, Ohio", "Youngstown-Akron urban area, Ohio", "2km S of Lowellville, Ohio",
        "15km S of Gallipolis, Ohio", "3km SSW of Lowellville, Ohio", "3km S of Lowellville, Ohio",
        "1km WSW of Bolindale, Ohio", "2km SSW of Lowellville, Ohio",
    ],
    # NOTE(review): every remaining "..., Canada" label is folded into 'Quebec', including
    # "southern Ontario, Canada" — confirm that grouping Ontario events under Quebec is intended.
    'Quebec': [
        "Ontario-Quebec border region, Canada", "SOUTHERN QUEBEC, CANADA", "southern Quebec, Canada",
        "10km ESE of Cornwall, Canada", "3km WNW of Huntingdon, Canada", "39km SSW of Cobourg, Canada",
        "29km SE of Toronto, Canada", "3km SSW of Huntingdon, Canada", "6km NNE of Ormstown, Canada",
        "8km SSW of Salaberry-de-Valleyfield, Canada", "6km SW of Salaberry-de-Valleyfield, Canada",
        "southern Ontario, Canada", "11km S of Sainte-Martine, Canada", "4km SW of Huntingdon, Canada",
        "9km E of Ormstown, Canada", "8km SSW of Huntingdon, Canada", "8km NNW of Huntingdon, Canada",
        "1km W of Ormstown, Canada",
    ],
    'Kentucky': ["eastern Kentucky", "7km WSW of Flatwoods, Kentucky", "8km S of Greenup, Kentucky"],
    'Connecticut': ["0km SE of Byram, Connecticut"],
    'Delaware': ["Wilmington urban area, Delaware"],
    # NOTE(review): the Chesapeake Bay labels are regional; assigning them to Maryland is a judgement call — confirm.
    'Maryland': ["CHESAPEAKE BAY REGION", "Chesapeake Bay region"],
}


def collapse_location_aliases(locations, alias_table):
    """Return ``locations`` with every known alias replaced by its canonical name.

    Parameters
    ----------
    locations : pandas.Series
        Location labels (strings).
    alias_table : dict
        Maps a canonical name to the list of raw labels that should become it.
        Entries are applied one after another in the dict's iteration order.

    Returns
    -------
    pandas.Series
        A new Series; the input Series is not modified.

    Notes
    -----
    Replacement is substring-based (``Series.str.replace`` with ``regex=True``),
    so a label not listed in ``alias_table`` is left as it is, apart from any
    listed alias that occurs inside it.
    """
    for canonical_name, aliases in alias_table.items():
        # Longest alias first: if one alias is ever a prefix of another, the more
        # specific (longer) one must win the alternation.
        ordered_aliases = sorted(aliases, key=len, reverse=True)
        # Aliases are literal text, not regular expressions, so escape them before
        # joining them into a single alternation pattern.
        alias_pattern = '|'.join(re.escape(alias) for alias in ordered_aliases)
        locations = locations.str.replace(alias_pattern, canonical_name, regex=True)
    return locations


earthquake_df['Event_Location'] = collapse_location_aliases(earthquake_df['Event_Location'], LOCATION_ALIASES)

In [253]:
# Keep the printed listing untruncated (display option applies session-wide)
pd.set_option('display.max_rows', None)

# Recount the events per location label
locations = earthquake_df['Event_Location'].value_counts()
print(locations)

Event_Location
West Virginia    874
New York         355
Pennsylvania     237
New Jersey        49
Quebec            49
Ohio              38
Virginia          27
Maryland           9
Kentucky           4
Vermont            3
Connecticut        1
Delaware           1
Name: count, dtype: int64


In [254]:
# Preview the first five rows to spot-check the Event_Location values
earthquake_df.head(n=5)

Unnamed: 0,Event_ID,Latitude,Longitude,Depth_In_Meters,Contributor,Magnitude_Type,Magnitude,Event_Location,Date,Time,Time_Series,Natural_Earthquake,Bix_Potential_Blasts
0,usp000gyu4,39.644,-75.483,5000,NEIC,md,2.8,New Jersey,2009-07-01,13:44:43.380Z,2009/07/01 09:44:43-04,0,0
1,usp000gyxs,37.379,-80.746,11500,NEIC,mblg,2.8,West Virginia,2009-07-04,12:24:43.460Z,2009/07/04 08:24:43-04,0,0
2,usp000gz1d,45.296,-73.475,18000,NEIC,md,2.1,Quebec,2009-07-08,10:22:11.430Z,2009/07/08 06:22:11-04,0,0
3,usp000h1sz,42.825,-78.239,5000,NEIC,md,2.4,New York,2009-09-23,03:45:59.900Z,2009/09/22 23:45:59-04,0,0
4,usp000hab0,38.619,-80.909,100,NEIC,mblg,3.4,West Virginia,2010-04-04,09:19:14.000Z,2010/04/04 05:19:14-04,0,0


In [255]:
# Drop the Contributor, Magnitude_Type, Time_Series and Event_ID columns in a single call
unused_columns = ['Contributor', 'Magnitude_Type', 'Time_Series', 'Event_ID']
earthquake_df.drop(columns=unused_columns, inplace=True)

In [256]:
# Preview the first ten rows of the frame after the column drops
earthquake_df.head(n=10)

Unnamed: 0,Latitude,Longitude,Depth_In_Meters,Magnitude,Event_Location,Date,Time,Natural_Earthquake,Bix_Potential_Blasts
0,39.644,-75.483,5000,2.8,New Jersey,2009-07-01,13:44:43.380Z,0,0
1,37.379,-80.746,11500,2.8,West Virginia,2009-07-04,12:24:43.460Z,0,0
2,45.296,-73.475,18000,2.1,Quebec,2009-07-08,10:22:11.430Z,0,0
3,42.825,-78.239,5000,2.4,New York,2009-09-23,03:45:59.900Z,0,0
4,38.619,-80.909,100,3.4,West Virginia,2010-04-04,09:19:14.000Z,0,0
5,38.675,-80.82,2300,2.4,West Virginia,2010-07-24,09:15:44.000Z,0,0
6,38.818,-80.43,17000,2.5,West Virginia,2010-08-15,04:38:47.000Z,0,0
7,38.792,-80.398,10700,2.5,West Virginia,2010-08-21,03:16:22.000Z,0,0
8,39.433,-81.362,5000,2.8,Ohio,2010-10-24,08:12:44.510Z,0,0
9,41.514,-80.333,5000,2.7,Pennsylvania,2010-12-10,21:26:32.790Z,0,0
