In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import requests
import tqdm

In [2]:
MIN_TIME = "100-01-01"
START_TIME = '1940-01-01'
END_TIME = '2023-10-01'
MIN_LAT = -90
MAX_LAT = 90
MIN_LON = -180
MAX_LON = 180

In [3]:
def make_params(starttime, endtime, minlatitude, maxlatitude, minlongitude, maxlongitude):
    params = {
        "format": "geojson",
        "starttime": starttime,
        "endtime": endtime,
        "minlatitude": minlatitude,
        "maxlatitude": maxlatitude,
        "minlongitude": minlongitude,
        "maxlongitude": maxlongitude
    }
    return params

In [4]:
def make_datarange(start_time, end_time, min_time):
    datarange = pd.date_range(start_time, end_time, freq='W').tolist()
    datarange = [str(x)[:10] for x in datarange]
    datarange.insert(0, min_time)
    return datarange

In [5]:
def get_earthquake_count(params):
    url = "https://earthquake.usgs.gov/fdsnws/event/1/count"
    response = requests.get(url, params=params)
    return response.json()["count"]

In [6]:
def get_earthquake_data(params):
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    response = requests.get(url, params=params)
    return response

In [7]:
def make_df(resp, params, errors):
    all_eqs = []
    try:
        for eq in resp.json()["features"]:
            prop = list(eq["properties"].values())
            prop.extend(eq["geometry"]["coordinates"])
            all_eqs.append(prop)
        cols = list(resp.json()["features"][0]["properties"].keys())
        cols.extend(["longitude", "latitude", "depth"])
        df = pd.DataFrame(all_eqs, columns=cols)
    except:
        errors.append(params)
        df = pd.DataFrame()
    return df, errors

In [8]:
def download_data(starttime, endtime, minlatitude, maxlatitude, minlongitude, maxlongitude):
    dfs = []
    errors = []
    data_range = make_datarange(starttime, endtime, MIN_TIME)
    for i in tqdm.tqdm(range(len(data_range)-1)):
        params = make_params(data_range[i], data_range[i+1], minlatitude, maxlatitude, minlongitude, maxlongitude)
        resp = get_earthquake_data(params)
        eq_count = get_earthquake_count(params)
        df, errors = make_df(resp, params, errors)
        if len(df) != eq_count or eq_count > 20000:
            errors.append(params)
            print("Error: Dataframe length does not match earthquake count")
            print(params)
        dfs.append(df)
    df = pd.concat(dfs)
    return df, errors

In [9]:
df, errors = download_data(START_TIME, END_TIME, MIN_LAT, MAX_LAT, MIN_LON, MAX_LON)
df

  0%|          | 0/4370 [00:00<?, ?it/s]

100%|██████████| 4370/4370 [2:33:31<00:00,  2.11s/it]  


Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,nst,dmin,rms,gap,magType,type,title,longitude,latitude,depth
0,5.84,"14 km NE of Eloúnda, Greece",-946270524830,1651001597199,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,mw,earthquake,"M 5.8 - 14 km NE of Eloúnda, Greece",25.814000,35.373000,15.00
1,,southeast of the Loyalty Islands,-946288579660,1652114853165,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,,earthquake,M ? - southeast of the Loyalty Islands,171.134000,-22.015000,180.00
2,6.07,"285 km E of Kuril’sk, Russia",-946309460790,1651001591890,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,mw,earthquake,"M 6.1 - 285 km E of Kuril’sk, Russia",151.498000,45.077000,25.00
3,3.42,"9km S of Borrego Springs, CA",-946394224430,1453944863830,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,3.0,0.8066,0.61,293.0,ml,earthquake,"M 3.4 - 9km S of Borrego Springs, CA",-116.367333,33.173167,6.00
4,3.97,"35km S of San Nicolas Is., CA",-946402749540,1453944629960,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,5.0,1.614,0.66,316.0,ml,earthquake,"M 4.0 - 35km S of San Nicolas Is., CA",-119.442000,32.929333,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547,1.09,"0 km S of Colton, CA",1695514839520,1695656113633,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,50.0,0.09282,0.17,36.0,ml,earthquake,"M 1.1 - 0 km S of Colton, CA",-117.314167,34.071333,16.11
2548,0.14,"84 km NW of Karluk, Alaska",1695514282160,1696540900348,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,11.0,,0.09,82.0,ml,earthquake,"M 0.1 - 84 km NW of Karluk, Alaska",-155.259000,58.195167,2.55
2549,0.7,"30 km N of Sutcliffe, Nevada",1695514246071,1695580950150,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,4.0,0.072,0.0215,280.74,ml,earthquake,"M 0.7 - 30 km N of Sutcliffe, Nevada",-119.668800,40.223400,8.40
2550,1.91,"2 km NW of Redwood Valley, CA",1695513995040,1696302470640,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.0,3.1,,...,43.0,0.03224,0.07,50.0,md,earthquake,"M 1.9 - 2 km NW of Redwood Valley, CA",-123.223333,39.279500,4.43


In [10]:
df.to_csv("../data/usgs_data.csv", index=False)

In [11]:
len(errors)

0

In [3]:
df = pd.read_csv("../data/usgs_data.csv")

  df = pd.read_csv("../data/usgs_data.csv")


In [4]:
df.isna().sum()

mag           171656
place           7950
time               0
updated            0
tz           4451665
url                0
detail             0
felt         4343303
cdi          4343303
mmi          4420568
alert        4443085
status             0
tsunami            0
sig                0
net                0
code               0
ids                0
sources            0
types              0
nst          1293748
dmin         1922732
rms           223547
gap          1189661
magType       182736
type               0
title              0
longitude          0
latitude           0
depth           1614
dtype: int64

In [5]:
df.columns

Index(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi',
       'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids',
       'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type',
       'title', 'longitude', 'latitude', 'depth'],
      dtype='object')

In [6]:
df

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,nst,dmin,rms,gap,magType,type,title,longitude,latitude,depth
0,5.84,"14 km NE of Eloúnda, Greece",-946270524830,1651001597199,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,mw,earthquake,"M 5.8 - 14 km NE of Eloúnda, Greece",25.814000,35.373000,15.00
1,,southeast of the Loyalty Islands,-946288579660,1652114853165,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,,earthquake,M ? - southeast of the Loyalty Islands,171.134000,-22.015000,180.00
2,6.07,"285 km E of Kuril’sk, Russia",-946309460790,1651001591890,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,,,,,mw,earthquake,"M 6.1 - 285 km E of Kuril’sk, Russia",151.498000,45.077000,25.00
3,3.42,"9km S of Borrego Springs, CA",-946394224430,1453944863830,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,3.0,0.80660,0.6100,293.00,ml,earthquake,"M 3.4 - 9km S of Borrego Springs, CA",-116.367333,33.173167,6.00
4,3.97,"35km S of San Nicolas Is., CA",-946402749540,1453944629960,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,5.0,1.61400,0.6600,316.00,ml,earthquake,"M 4.0 - 35km S of San Nicolas Is., CA",-119.442000,32.929333,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4451672,1.09,"0 km S of Colton, CA",1695514839520,1695656113633,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,50.0,0.09282,0.1700,36.00,ml,earthquake,"M 1.1 - 0 km S of Colton, CA",-117.314167,34.071333,16.11
4451673,0.14,"84 km NW of Karluk, Alaska",1695514282160,1696540900348,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,11.0,,0.0900,82.00,ml,earthquake,"M 0.1 - 84 km NW of Karluk, Alaska",-155.259000,58.195167,2.55
4451674,0.70,"30 km N of Sutcliffe, Nevada",1695514246071,1695580950150,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,4.0,0.07200,0.0215,280.74,ml,earthquake,"M 0.7 - 30 km N of Sutcliffe, Nevada",-119.668800,40.223400,8.40
4451675,1.91,"2 km NW of Redwood Valley, CA",1695513995040,1696302470640,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.0,3.1,,...,43.0,0.03224,0.0700,50.00,md,earthquake,"M 1.9 - 2 km NW of Redwood Valley, CA",-123.223333,39.279500,4.43


In [7]:
df = df[df["type"] == "earthquake"]
df = df[["time", "longitude", "latitude", "depth", "mag", "magType"]]

In [8]:
df["time"] = df["time"].apply(lambda x: dt.datetime.fromtimestamp(x/1000))

In [9]:
df[df.duplicated()]

Unnamed: 0,time,longitude,latitude,depth,mag,magType
1430237,1999-11-30 01:00:00,0.000000,0.000000,0.000000,0.0,mc
1694192,2003-01-06 01:04:28.100000,-85.143000,34.857000,10.500000,2.2,md
2262110,2008-03-27 21:16:42.786000,-119.944600,39.527000,1.900000,0.8,ml
2262326,2008-03-27 06:22:35.523000,-119.878300,39.518100,0.000000,0.4,ml
2262888,2008-03-25 15:40:03.343000,-119.910800,39.510400,0.000000,,
...,...,...,...,...,...,...
4200163,2022-01-26 09:53:31.817000,-150.598400,62.132000,4.000000,0.9,ml
4200280,2022-01-26 05:18:05.072000,-153.482700,60.107000,160.400000,2.1,ml
4214304,2022-03-03 00:42:23.449000,-104.401929,31.677717,7.288428,2.3,ml
4215135,2022-03-01 05:29:53.805000,-104.386630,31.682642,6.131567,2.0,ml


In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.isna().sum().sum() / (len(df) * len(df.columns))

0.01315462733483836

In [12]:
df

Unnamed: 0,time,longitude,latitude,depth,mag,magType
0,1940-01-06 20:04:35.170000,25.814000,35.373000,15.00,5.84,mw
1,1940-01-06 15:03:40.340000,171.134000,-22.015000,180.00,,
2,1940-01-06 09:15:39.210000,151.498000,45.077000,25.00,6.07,mw
3,1940-01-05 09:42:55.570000,-116.367333,33.173167,6.00,3.42,ml
4,1940-01-05 07:20:50.460000,-119.442000,32.929333,6.00,3.97,ml
...,...,...,...,...,...,...
4451672,2023-09-24 02:20:39.520000,-117.314167,34.071333,16.11,1.09,ml
4451673,2023-09-24 02:11:22.160000,-155.259000,58.195167,2.55,0.14,ml
4451674,2023-09-24 02:10:46.071000,-119.668800,40.223400,8.40,0.70,ml
4451675,2023-09-24 02:06:35.040000,-123.223333,39.279500,4.43,1.91,md


In [13]:
df = df.dropna().drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,time,longitude,latitude,depth,mag,magType
0,1940-01-06 20:04:35.170000,25.814000,35.373000,15.00,5.84,mw
1,1940-01-06 09:15:39.210000,151.498000,45.077000,25.00,6.07,mw
2,1940-01-05 09:42:55.570000,-116.367333,33.173167,6.00,3.42,ml
3,1940-01-05 07:20:50.460000,-119.442000,32.929333,6.00,3.97,ml
4,1940-01-04 21:44:55.390000,37.926000,40.415000,15.00,5.51,mw
...,...,...,...,...,...,...
4152281,2023-09-24 02:20:39.520000,-117.314167,34.071333,16.11,1.09,ml
4152282,2023-09-24 02:11:22.160000,-155.259000,58.195167,2.55,0.14,ml
4152283,2023-09-24 02:10:46.071000,-119.668800,40.223400,8.40,0.70,ml
4152284,2023-09-24 02:06:35.040000,-123.223333,39.279500,4.43,1.91,md


In [14]:
df.to_csv("../data/usgs_data_small.csv", index=False)

In [15]:
df.isna().sum()

time         0
longitude    0
latitude     0
depth        0
mag          0
magType      0
dtype: int64