In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime

# fix NumPy's global random seed first so any later sampling is reproducible
np.random.seed(0)

# load the earthquake records from the CSV file into a DataFrame
csv_path = '../../../datasets/Data_Cleaning/earthquake-database/database.csv'
earthquake = pd.read_csv(csv_path)

In [2]:
# Check how pandas stored the "Date" column: an object dtype ('O') means the
# values were kept as plain Python strings rather than parsed as datetimes.
earthquake['Date'].dtype

dtype('O')

In [4]:
# Most entries in the 'Date' column share one format:
# "month/day/four-digit year". The entry at position 3378, however, follows
# a completely different pattern, so show it next to its neighbours.
earthquake.iloc[3378:3383]

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
3378,1975-02-23T02:58:41.000Z,1975-02-23T02:58:41.000Z,8.017,124.075,Earthquake,623.0,,,5.6,MB,...,,,,,,USP0000A09,US,US,US,Reviewed
3379,02/23/1975,03:53:36,-21.727,-71.356,Earthquake,33.0,,,5.6,MB,...,,,,,,USP0000A0A,US,US,US,Reviewed
3380,02/23/1975,07:34:11,-10.879,166.667,Earthquake,33.0,,,5.5,MS,...,,,,,,USP0000A0C,US,US,US,Reviewed
3381,02/25/1975,05:20:05,-7.388,149.798,Earthquake,33.0,,,5.5,MB,...,,,,,,USP0000A12,US,US,US,Reviewed
3382,02/26/1975,04:48:55,85.047,97.969,Earthquake,33.0,,,5.6,MS,...,,,,,,USP0000A1H,US,US,US,Reviewed


In [6]:
# The odd entry looks like a data-entry problem: ideally every value in the
# column would use the same format. A quick way to gauge how widespread the
# problem is: measure the string length of every "Date" entry, since
# well-formed "month/day/four-digit year" values all have the same length.
date_column = earthquake['Date']
date_lengths = date_column.str.len()
date_lengths

0        10
1        10
2        10
3        10
4        10
         ..
23407    10
23408    10
23409    10
23410    10
23411    10
Name: Date, Length: 23412, dtype: int64

In [7]:
# Tally how many entries have each string length; any length other than the
# dominant one points at entries that do not follow the common format.
length_counts = date_lengths.value_counts()
length_counts

10    23409
24        3
Name: Date, dtype: int64

In [11]:
# Locate the rows whose "Date" entry is 24 characters long (the malformed
# ones). np.flatnonzero returns the integer *positions* of the True values
# directly, so there is no need to wrap the mask in a list and pick the
# second axis of np.where's result.
is_corrupted = (date_lengths == 24).to_numpy()
indices = np.flatnonzero(is_corrupted)
print('Indices with corrupted data:', indices)

# `indices` holds positions, not labels, so select with .iloc; .loc would only
# give the same rows while the frame still has its default 0..n-1 index.
earthquake.iloc[indices]


Indices with corrupted data: [ 3378  7512 20650]


Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
3378,1975-02-23T02:58:41.000Z,1975-02-23T02:58:41.000Z,8.017,124.075,Earthquake,623.0,,,5.6,MB,...,,,,,,USP0000A09,US,US,US,Reviewed
7512,1985-04-28T02:53:41.530Z,1985-04-28T02:53:41.530Z,-32.998,-71.766,Earthquake,33.0,,,5.6,MW,...,,,,,1.3,USP0002E81,US,US,HRV,Reviewed
20650,2011-03-13T02:23:34.520Z,2011-03-13T02:23:34.520Z,36.344,142.344,Earthquake,10.1,13.9,289.0,5.8,MWC,...,,32.3,,,1.06,USP000HWQP,US,US,GCMT,Reviewed


In [12]:
# A few rows hold a full ISO-8601 timestamp (e.g. "1975-02-23T02:58:41.000Z")
# in "Date" instead of "month/day/four-digit year". Derive each replacement
# from the timestamp itself rather than hand-typing dates for hard-coded row
# labels, which is error-prone and breaks if the rows ever move.
# The mask is recomputed here (not reused from an earlier cell) so this cell
# can be re-run safely: once the rows are repaired, nothing matches.
# Only "Date" is repaired; other columns of these rows are left untouched.
iso_rows = earthquake["Date"].str.len() == 24
if iso_rows.any():
    iso_timestamps = pd.to_datetime(
        earthquake.loc[iso_rows, "Date"], format="%Y-%m-%dT%H:%M:%S.%fZ"
    )
    earthquake.loc[iso_rows, "Date"] = iso_timestamps.dt.strftime("%m/%d/%Y")

# Parse with an explicit format so any remaining malformed entry raises an
# error instead of being silently guessed at.
earthquake['date_parsed'] = pd.to_datetime(earthquake['Date'], format="%m/%d/%Y")

In [13]:
# Display the new column to confirm the strings were converted to datetimes.
earthquake.loc[:, 'date_parsed']

0       1965-01-02
1       1965-01-04
2       1965-01-05
3       1965-01-08
4       1965-01-09
           ...    
23407   2016-12-28
23408   2016-12-28
23409   2016-12-28
23410   2016-12-29
23411   2016-12-30
Name: date_parsed, Length: 23412, dtype: datetime64[ns]

In [14]:
# Pull the day-of-month component out of every parsed date using the
# datetime accessor, then display the resulting Series.
parsed_dates = earthquake['date_parsed']
day_of_month_earthquakes = parsed_dates.dt.day
day_of_month_earthquakes

0         2
1         4
2         5
3         8
4         9
         ..
23407    28
23408    28
23409    28
23410    29
23411    30
Name: date_parsed, Length: 23412, dtype: int64