In [88]:
import datetime
import geopy.distance
import pandas as pd
import os
from pandas import Timestamp


In [None]:

def get_earthquake_data_csv(year, mag,  filepath):
    """
        Downloads one calendar year of earthquake data from earthquake.usgs.gov.

        Arguments:
        year - year, data will be collected from 01.01 00:00:00 to 31.12 23:59:59 of this year
        mag - lower border of the magnitude of the earthquakes
        filepath - the path to the directory in which the dataset will be saved
                   (created if it does not exist yet)

        Returns:
        None; saves the data to <filepath>/earthquake_<year>.csv (tab-separated, with the
        DataFrame index written as the first column) and prints an info message
    """
    # The end of the window must be 12-31: the previous 12-24 silently dropped
    # the last week of every year from the dataset.
    url = ('https://earthquake.usgs.gov/fdsnws/event/1/query.csv'
           '?starttime={year}-01-01%2000%3A00%3A00'
           '&endtime={year}-12-31%2023%3A59%3A59'
           '&minmagnitude={mag}&orderby=time').format(year = year, mag = mag)
    df = pd.read_csv(url)

    # Make sure the target directory exists so a fresh run does not fail on to_csv.
    if filepath:
        os.makedirs(filepath, exist_ok=True)
    out_file = os.path.join(filepath, 'earthquake_{year}.csv'.format(year = year))

    # The index column is kept on purpose: eq_join_in_one_file drops the first column.
    df.to_csv(out_file, sep='\t')
    print('Data for the {year} was saved to earthquake_{year}.csv'.format(year = year))
    
    
def get_earthquake_dataset(start_year, end_year, path, mag=4.5):
    """
        Downloads earthquake data from earthquake.usgs.gov for a certain period of time,
        one file per year.

        Arguments:
        start_year, end_year - start and end of the time period for the dataset (both inclusive)
        path - the path to the directory in which the dataset will be saved
        mag - lower border of the magnitude of the earthquakes (defaults to 4.5,
              the value that used to be hard-coded here)

        Returns:
        None; each year is saved by get_earthquake_data_csv
    """
    # end_year + 1 because range() excludes its upper bound.
    for y in range(start_year, end_year+1):
        get_earthquake_data_csv(y, mag, path)
        
# Download the yearly catalogues for 1992..2021 (network access required).
get_earthquake_dataset(start_year=1992, end_year=2021, path='earthquakes_data/')

In [None]:
# Show the column layout of one downloaded catalogue file.
# nrows=0 parses only the header line instead of the whole year of data.
eq_cols = pd.read_csv('earthquakes_data/earthquake_1992.csv', sep='\t', nrows=0).columns
print(eq_cols)
def eq_join_in_one_file(earthquakes_folder, out_path='earthquakes_data.csv'):
    """
        Joins the yearly tab-separated earthquake files into one comma-separated file.

        For every row the original 'time' stamp is truncated to whole seconds and split
        into a 'date' column (calendar date) and a 'time' column (time of day).
        Rows without a time stamp are dropped.

        Arguments:
        earthquakes_folder - directory with the yearly files written by get_earthquake_data_csv
        out_path - file the joined dataset is written to (defaults to 'earthquakes_data.csv'
                   in the current directory, as before)

        Returns:
        None; writes the joined csv file

        Raises:
        ValueError - if the folder contains no .csv files
    """
    frames = []
    # sorted() makes the row order of the result independent of the OS listing order;
    # non-csv entries (checkpoints, notes, ...) are skipped.
    for name_e in sorted(os.listdir(earthquakes_folder)):
        if not name_e.endswith('.csv'):
            continue
        path = os.path.join(earthquakes_folder, name_e)
        print(path)
        # The first column is the index written by DataFrame.to_csv; read it back
        # as the index so it never shows up as a data column.
        df_e = pd.read_csv(path, sep='\t', index_col=0)

        # utc=True keeps the column a single datetime dtype so the .dt accessor works.
        stamps = pd.to_datetime(df_e['time'], utc=True)
        df_e = df_e.assign(time=stamps).dropna(subset=['time'])

        # Truncate to whole seconds for every row. The former strptime-based
        # truncation failed for a whole file as soon as one stamp had no
        # fractional part, leaving that file untruncated.
        whole_seconds = df_e['time'].dt.floor('s')
        df_e = df_e.assign(time=whole_seconds.dt.time, date=whole_seconds.dt.date)
        frames.append(df_e)

    if not frames:
        raise ValueError('no .csv files found in {folder}'.format(folder=earthquakes_folder))

    # One concat at the end instead of growing the frame inside the loop.
    pd.concat(frames, ignore_index=True).to_csv(out_path, index=False)

# Merge all yearly catalogue files into a single csv file.
eq_join_in_one_file(earthquakes_folder='earthquakes_data')

In [None]:
import pandas as pd

# Ionosonde station code -> list of years that are requested from the catalogue.
# This is the shape get_japan_data_csv iterates over (code, years).
# NOTE: a later cell rebinds `sondes` to {code: ((lat, lon), years)}; re-run this
# cell before calling get_japan_data_csv again.
sondes = {'WK545': list(range(1992, 2004)),
          'WK546': list(range(2001, 2021)),  # the comma after this entry was missing (SyntaxError)
          'AK539': list(range(1992, 1994)),
          'TO535': list(range(1992, 2003)),
          'TO536': list(range(2001, 2021)),
          'YG431': list(range(1992, 2021)),
          'OK426': list(range(1992, 2021)),
          'SY951': list(range(1996, 2006))}

def get_df_columns():
    """
        Reads one catalogue file (station WK546, year 2011) from wdc.nict.go.jp and
        returns its column names.

        Returns:
        list with the column names of the downloaded file
    """
    url_txt = "http://wdc.nict.go.jp/IONO/observation-history/factor-auto-WK546-2011.sjis.txt"
    sample_frame = pd.read_csv(url_txt)
    return sample_frame.columns.tolist()

def get_japan_data_csv(sondes, df_columns, filepath):
    """
        Downloads all automatically scaled data from ionosondes in Japan from http://wdc.nict.go.jp catalog

        Arguments:
        sondes - dictionary with keys - ionosondes names and values - years with records
        df_columns - list of columns with the structure of catalog
        filepath - the path to the directory in which the dataset will be saved
                   (created if it does not exist yet)

        Returns:
        None; saves one csv file per ionosonde (<filepath>/<sonde>.csv, index included)
        and prints an info message for each of them
    """
    url_str_beg = "http://wdc.nict.go.jp/IONO/observation-history/factor-auto-"
    url_str_end = ".sjis.txt"
    if filepath:
        os.makedirs(filepath, exist_ok=True)
    for sonde, years in sondes.items():
        # The empty frame fixes the column layout even when `years` is empty.
        frames = [pd.DataFrame(columns=df_columns)]
        for year in years:
            url = url_str_beg + sonde + "-" + str(year) + url_str_end
            frames.append(pd.read_csv(url))
        # Concatenate once per station instead of re-copying the growing frame every year.
        df = pd.concat(frames)
        fname = os.path.join(filepath, "{sonde}.csv".format(sonde = sonde))
        # The index is written on purpose: clean_sondes drops columns by position.
        df.to_csv(fname)
        print('Data for the {sonde} sonde was saved to {sonde}.csv'.format(sonde = sonde))
    
# Download the catalogue of every station listed in `sondes` (network access required).
path = "sondes_japan_data/"
df_columns = get_df_columns()
get_japan_data_csv(sondes=sondes, df_columns=df_columns, filepath=path)

In [89]:
def clean_time(df, raw_col='#                       fmin  '):
    """
        Builds a parsed 'time' column from the raw catalogue column and drops rows
        whose value cannot be parsed as a date.

        Arguments:
        df - DataFrame with the raw catalogue column; it is not modified
        raw_col - name of the raw column holding the time text (defaults to the
                  header used by the downloaded catalogue files)

        Returns:
        new DataFrame without raw_col, with a datetime 'time' column as the last
        column and only the rows where parsing succeeded

        Raises:
        KeyError - if raw_col is not a column of df
    """
    # Keep only the text before the first ':'; astype(str) lets missing or
    # non-string cells become unparseable text instead of crashing on .split.
    time_text = df[raw_col].astype(str).str.split(':', n=1).str[0]
    parsed = pd.to_datetime(time_text, errors='coerce')
    # drop(columns=...) replaces the positional axis argument removed in pandas 2.0.
    cleaned = df.drop(columns=raw_col).assign(time=parsed)
    return cleaned[cleaned['time'].notnull()].copy()

def convert_tz(row, tz_old = 'Asia/Tokyo', tz_new = 'UTC'):
    """
        Converts row['time'] from one time zone to another.

        Arguments:
        row - Series (or other mapping) with a 'time' entry; the entry is replaced in place
        tz_old - time zone a naive time stamp is assumed to be in
        tz_new - time zone the time stamp is converted to

        Returns:
        the same row object with the converted, tz-aware 'time'
    """
    # Timestamp() parses strings and passes Timestamps through; this replaces the
    # deprecated pd.to_datetime(..., errors='ignore') round trip.
    stamp = Timestamp(row['time'])
    # Only naive stamps need localizing; an already tz-aware stamp used to raise here.
    if stamp.tzinfo is None:
        stamp = stamp.tz_localize(tz_old)
    row['time'] = stamp.tz_convert(tz_new)
    return row

In [None]:
# cleaning time & converting TZ              
def clean_sondes(sondes_folder):
    """
        Cleans every ionosonde csv file of a folder in place: parses the time column,
        converts it to UTC, adds a 'date' column and drops the first two columns.

        Files whose name contains "SY951" are treated as 'Antarctica/Syowa' local
        time, all others as 'Asia/Tokyo'.

        Arguments:
        sondes_folder - directory with the files written by get_japan_data_csv

        Returns:
        None; each file is overwritten and an info message is printed
    """
    sondes_names = os.listdir(sondes_folder)
    print(sondes_names)
    for name_s in sondes_names:
        if not name_s.endswith('.csv'):
            continue
        path = os.path.join(sondes_folder, name_s)
        df_s = pd.read_csv(path, sep=',')
        try:
            df_s = clean_time(df_s)
        except KeyError:
            # The raw time column is removed by the first cleaning pass, so a file
            # without it has most likely been cleaned already; skip it instead of
            # crashing when the cell is run a second time.
            print('skipped ', path)
            continue
        # Vectorized tz conversion instead of a Python-level apply over every row.
        tz_old = 'Antarctica/Syowa' if "SY951" in name_s else 'Asia/Tokyo'
        utc_time = pd.to_datetime(df_s['time']).dt.tz_localize(tz_old).dt.tz_convert('UTC')
        df_s = df_s.assign(time=utc_time, date=utc_time.dt.date)
        # Positional drop kept as in the original workflow: the first column is the
        # index saved by to_csv; NOTE(review): confirm what the second column holds.
        df_s = df_s.drop(columns=df_s.columns[[0, 1]])
        df_s.to_csv(path, index=False)
        print('saved ', path)
        
# Clean every downloaded ionosonde file; the files are overwritten in place.
clean_sondes(sondes_folder='sondes_japan_data')

In [7]:
def filter_by_prep_zone(row, sonde, size = 1):
    """
        Computes the preparation-zone radius of one earthquake and its distance to a sonde.

        NOTE: a later cell defines filter_by_prep_zone again with (size1, size2);
        whichever definition was executed last is the one in effect.

        Arguments:
        row - earthquake record with 'mag', 'latitude', 'longitude' and 'id'
        sonde - key into the global `sondes` dictionary; its first element is used as coordinates
        size - multiplier applied to the radius

        Returns:
        pd.Series [id, radius, distance in km]
    """
    # presumably the Dobrovolsky preparation-zone radius in km - TODO confirm
    zone_radius = 10**(0.43*row['mag'])*size
    epicentre = (row['latitude'], row['longitude'])
    station = sondes[sonde][0]
    km_to_station = geopy.distance.geodesic(epicentre, station).km
    return pd.Series([row['id'], zone_radius, km_to_station])

In [8]:
# Ionosonde station code -> ((latitude, longitude), years with records).
# This rebinds `sondes`; element [0] of each value is what filter_by_prep_zone reads.
sondes = {
    'WK545': ((45.39, 141.68), list(range(1992, 2004))),
    'WK546': ((45.16, 141.75), list(range(2001, 2021))),
    'AK539': ((39.725, 140.053), list(range(1992, 1994))),
    'TO535': ((35.71, 139.45), list(range(1992, 2003))),
    'TO536': ((35.71, 139.45), list(range(2001, 2021))),
    'YG431': ((31.20, 130.62), list(range(1992, 2021))),
    'OK426': ((26.28, 127.81), list(range(1992, 2021))),
    'SY951': ((-69.00, 39.59), list(range(1996, 2006))),
}


# Column names of the result table.
columns_list = [
    'time', 'eq_lat', 'eq_lon', 'mag', 'earthquake_id', 'date',
    'prep_zone', 'eq_sonde_dist',
    'sonde', 'sonde_lat', 'sonde_lon',
]

# The same layout, using the column names of the earthquake catalogue.
temp_col_list = [
    'time', 'latitude', 'longitude', 'mag', 'id', 'date',
    'prep_zone', 'dist',
    'sonde', 'sonde_lat', 'sonde_lon',
]

In [64]:
# Column layout of the frames built by walk_sondes / walk_earthquakes.
col_list = [
    'time', 'latitude', 'longitude', 'mag', 'id', 'date',
    'prep_zone_in', 'prep_zone_out', 'dist',
    'sonde', 'sonde_lat', 'sonde_lon',
]
def filter_by_prep_zone(row, sonde, size1 = 1, size2 = 2):
    """
        Computes two preparation-zone radii of one earthquake and its distance to a sonde.

        Arguments:
        row - earthquake record with 'mag', 'latitude', 'longitude' and 'id'
        sonde - key into the global `sondes` dictionary; its first element is used as coordinates
        size1, size2 - multipliers applied to the base radius 10**(0.43*mag)

        Returns:
        pd.Series [id, radius * size1, radius * size2, distance in km]
    """
    base_radius = 10**(0.43*row['mag'])
    inner_radius = base_radius*size1
    outer_radius = base_radius*size2
    epicentre = (row['latitude'], row['longitude'])
    station = sondes[sonde][0]
    km_to_station = geopy.distance.geodesic(epicentre, station).km
    return pd.Series([row['id'], inner_radius, outer_radius, km_to_station])

def walk_sondes(sondes_folder, earthquakes_folder, res_path, zone='small'):
    """
        Collects, for every ionosonde file of a folder, the earthquakes whose
        preparation zone contains that ionosonde, and saves them to one csv file.

        Arguments:
        sondes_folder - directory with the cleaned ionosonde csv files (a 'date' column is required)
        earthquakes_folder - path of the joined earthquake csv FILE (the name is kept
                             for backward compatibility; it is passed on as a file path)
        res_path - file the result is written to
        zone - 'small' (default, former behaviour): distance <= inner radius;
               'big': inner radius <= distance <= outer radius

        Returns:
        None; writes the result csv file
    """
    frames = []
    # Sorted so that the row order of the result does not depend on the OS listing order.
    sondes_names = sorted(os.listdir(sondes_folder))
    print(sondes_names)
    for name_s in sondes_names:
        if not name_s.endswith('.csv'):
            continue
        path = os.path.join(sondes_folder, name_s)
        sonde_name = name_s.split('.')[0]
        print(path)
        df_s = pd.read_csv(path, sep=',')
        df_sonde = walk_earthquakes(df_s, sonde_name, earthquakes_folder, zone=zone)
        print(df_sonde.shape)
        frames.append(df_sonde)

    # One concat and one de-duplication at the end instead of per iteration.
    if frames:
        df_res = pd.concat(frames, ignore_index=True)
    else:
        df_res = pd.DataFrame(columns=col_list)
    df_res = df_res.drop_duplicates(subset=['id', 'sonde'])
    print(df_res.shape)
    df_res.to_csv(res_path, index=False)

def walk_earthquakes(df_s, sonde_name, earthquakes_file, zone='small'):
    """
        Selects the earthquakes whose preparation zone contains the given ionosonde
        and that happened on a day for which the ionosonde has records.

        Arguments:
        df_s - ionosonde data with a 'date' column
        sonde_name - key into the global `sondes` dictionary
        earthquakes_file - path of the joined earthquake csv file
        zone - 'small' (default): distance <= inner radius;
               'big': inner radius <= distance <= outer radius

        Returns:
        DataFrame with the columns of col_list, one row per selected earthquake

        Raises:
        ValueError - if zone is neither 'small' nor 'big'
    """
    if zone not in ('small', 'big'):
        raise ValueError("zone must be 'small' or 'big', got {zone!r}".format(zone=zone))
    print(sonde_name)
    df = pd.read_csv(earthquakes_file, sep=',')
    if df.empty:
        # apply(axis=1) on an empty frame does not yield the four expected columns.
        return pd.DataFrame(columns=col_list)

    # Radii and distance per earthquake; the result shares df's index, so the
    # columns can be attached directly (no merge on 'id', which would multiply
    # rows if an id ever occurred twice).
    zones = df.apply(filter_by_prep_zone, sonde=sonde_name, size1=1, size2=2, axis=1)
    zones.columns = ['id', 'prep_zone_in', 'prep_zone_out', 'dist']
    df = df.assign(
        prep_zone_in=pd.to_numeric(zones['prep_zone_in']),
        prep_zone_out=pd.to_numeric(zones['prep_zone_out']),
        dist=pd.to_numeric(zones['dist']),
    )

    if zone == 'small':
        # sondes in the small circle
        in_zone = df['dist'] <= df['prep_zone_in']
    else:
        # sondes between the small and the big circle
        in_zone = (df['prep_zone_in'] <= df['dist']) & (df['dist'] <= df['prep_zone_out'])

    # Keep only days on which the sonde was working. One vectorized test replaces
    # the per-earthquake loop that appended a day's rows once for every
    # earthquake of that day.
    sonde_active = df['date'].isin(df_s['date'].unique())

    sonde_lat, sonde_lon = sondes[sonde_name][0]
    df_res = (
        df.loc[in_zone & sonde_active]
        .assign(sonde=sonde_name, sonde_lat=sonde_lat, sonde_lon=sonde_lon)
        .reset_index(drop=True)
    )
    return df_res[col_list]


In [None]:
# NOTE(review): both calls use the same inputs and the same filter settings, so
# the "small" and the "big" file receive the same rows - confirm whether the
# second call is meant to use the outer zone.
walk_sondes(sondes_folder='sondes_japan_data',
            earthquakes_folder='earthquakes_data.csv',
            res_path='sondes_in_prep_zone_small.csv')
walk_sondes(sondes_folder='sondes_japan_data',
            earthquakes_folder='earthquakes_data.csv',
            res_path='sondes_in_prep_zone_big.csv')