In [71]:
import pandas as pd
import geopandas as gpd
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta


In [123]:
def listarfechas(fecha_inicial, fecha_final):
    '''
    Build the list of chunk boundaries between two dates (both included).

    Boundaries fall on the 1st, 10th and 20th of every month, so consecutive
    entries are approximately 10 days apart. Every entry of the returned list
    is a zero-padded 'YYYY-MM-DD' string.

    Parameters
    ----------
    fecha_inicial : str
        Start date in '%Y-%m-%d' format. Its day must be 1, 10 or 20.
    fecha_final : str
        End date in '%Y-%m-%d' format. Its day must be 1, 10 or 20 and it
        must not be earlier than fecha_inicial.

    Returns
    -------
    list of str
        Boundaries from fecha_inicial to fecha_final, in ascending order.

    Raises
    ------
    ValueError
        If either string does not match '%Y-%m-%d', if either date is not on
        day 1, 10 or 20 (such an end date could never be reached by the
        stepping below), or if fecha_final is earlier than fecha_inicial.
    '''
    formato = '%Y-%m-%d'
    dias_corte = (1, 10, 20)

    actual = datetime.strptime(fecha_inicial, formato)
    limite = datetime.strptime(fecha_final, formato)

    # Reject inputs the stepping can never hit, instead of looping forever.
    for nombre, fecha in (('fecha_inicial', actual), ('fecha_final', limite)):
        if fecha.day not in dias_corte:
            raise ValueError(
                f'{nombre} must fall on day 1, 10 or 20 of its month, got {fecha.strftime(formato)}'
            )
    if limite < actual:
        raise ValueError('fecha_final must not be earlier than fecha_inicial')

    lista_fechas = [actual.strftime(formato)]
    # Compare parsed dates (not strings) so formatting differences such as
    # missing zero-padding cannot prevent termination.
    while actual < limite:
        if actual.day == 1:
            actual = actual.replace(day=10)
        elif actual.day == 10:
            actual = actual.replace(day=20)
        elif actual.month == 12:
            # Day 20 of December: roll over to 1 January of the next year.
            actual = actual.replace(year=actual.year + 1, month=1, day=1)
        else:
            # Day 20: jump to the first day of the following month.
            actual = actual.replace(month=actual.month + 1, day=1)
        lista_fechas.append(actual.strftime(formato))

    return lista_fechas
    


In [124]:
# Date range to download, as 'YYYY-MM-DD' strings.
# listarfechas expands the range into the list of chunk boundaries
# (lista_fechas) used by the download loop; the two dates are also reused
# in the names of the exported files.
fecha_inicial = '2010-01-01'
fecha_final = '2022-11-01'
lista_fechas = listarfechas(fecha_inicial, fecha_final)


In [125]:
# Download the USGS event catalogue in date-bounded chunks.
# The event API will not return more than 20,000 rows per query (larger
# requests fail with an HTTP error), so every request covers only the window
# between two consecutive entries of lista_fechas.
# NOTE(review): if the API treats both starttime and endtime as inclusive, an
# event stamped exactly on a boundary could show up in two chunks - confirm.

data = []
print('Progress:')
total_tramos = len(lista_fechas) - 1
for i, (inicio, fin) in enumerate(zip(lista_fechas[:-1], lista_fechas[1:])):
    query = f'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={inicio}&endtime={fin}'
    df_aux = gpd.read_file(query)
    data.append(df_aux)
    porcentaje = round(((i + 1) / total_tramos) * 100, 1)
    print(inicio, ' to ', fin, ' OK! ---> ', porcentaje, '% Done')

# Takes about 22 minutes.

Progress:
2010-01-01  a  2010-01-10  OK! --->  0.2 % Done
2010-01-10  a  2010-01-20  OK! --->  0.4 % Done
2010-01-20  a  2010-02-01  OK! --->  0.6 % Done
2010-02-01  a  2010-02-10  OK! --->  0.9 % Done
2010-02-10  a  2010-02-20  OK! --->  1.1 % Done
2010-02-20  a  2010-03-01  OK! --->  1.3 % Done
2010-03-01  a  2010-03-10  OK! --->  1.5 % Done
2010-03-10  a  2010-03-20  OK! --->  1.7 % Done
2010-03-20  a  2010-04-01  OK! --->  1.9 % Done
2010-04-01  a  2010-04-10  OK! --->  2.2 % Done
2010-04-10  a  2010-04-20  OK! --->  2.4 % Done
2010-04-20  a  2010-05-01  OK! --->  2.6 % Done
2010-05-01  a  2010-05-10  OK! --->  2.8 % Done
2010-05-10  a  2010-05-20  OK! --->  3.0 % Done
2010-05-20  a  2010-06-01  OK! --->  3.2 % Done
2010-06-01  a  2010-06-10  OK! --->  3.5 % Done
2010-06-10  a  2010-06-20  OK! --->  3.7 % Done
2010-06-20  a  2010-07-01  OK! --->  3.9 % Done
2010-07-01  a  2010-07-10  OK! --->  4.1 % Done
2010-07-10  a  2010-07-20  OK! --->  4.3 % Done
2010-07-20  a  2010-08-01  OK!

In [126]:
# Control: row count of every downloaded chunk, and their grand total.

cant_filas = [len(tramo) for tramo in data]

print(sum(cant_filas))

1857134


In [127]:
# Merge all per-chunk GeoDataFrames into a single GeoDataFrame.
# ignore_index=True drops each chunk's own index and renumbers the rows 0..n-1.
gdf = gpd.GeoDataFrame( pd.concat( data, ignore_index=True) )

In [128]:
# Check that the merged frame has the same number of rows as the per-chunk
# total printed by the control cell (compare with the RangeIndex entry count).
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1857134 entries, 0 to 1857133
Data columns (total 28 columns):
 #   Column    Dtype   
---  ------    -----   
 0   id        object  
 1   mag       float64 
 2   place     object  
 3   time      int64   
 4   updated   int64   
 5   tz        object  
 6   url       object  
 7   detail    object  
 8   felt      float64 
 9   cdi       float64 
 10  mmi       float64 
 11  alert     object  
 12  status    object  
 13  tsunami   int64   
 14  sig       int64   
 15  net       object  
 16  code      object  
 17  ids       object  
 18  sources   object  
 19  types     object  
 20  nst       float64 
 21  dmin      float64 
 22  rms       float64 
 23  gap       float64 
 24  magType   object  
 25  type      object  
 26  title     object  
 27  geometry  geometry
dtypes: float64(8), geometry(1), int64(4), object(15)
memory usage: 396.7+ MB


In [136]:
# Write the merged events to a CSV file named after the downloaded date range.
gdf.to_csv(f'USGS_events_{fecha_inicial}_{fecha_final}.csv', index=False)

In [137]:
# Write the merged events to a Parquet file named after the downloaded date range.
gdf.to_parquet(f'USGS_events_{fecha_inicial}_{fecha_final}.parquet', index=False)