In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import requests
import re
from datetime import datetime
import pandas as pd
import geopandas as gpd

In [3]:
pd.set_option('display.max_colwidth', 100)

In [4]:
## Get the Data

# The National Observatory of Athens has an earthquake catalog; for each year there is a txt file with all the earthquakes in Greece

years = [2021, 2022]
noa_url = [f'https://www.gein.noa.gr/HTML/Noa_cat/CAT{year}.TXT' for year in years]
noa_url

['https://www.gein.noa.gr/HTML/Noa_cat/CAT2021.TXT',
 'https://www.gein.noa.gr/HTML/Noa_cat/CAT2022.TXT']

In [6]:
for url in noa_url:
    filename = f"Data/{url.split('/')[-1]}"
    with open(filename, mode='w') as f:
        try:
            f.write((response := requests.get(url)).text)
            response.status_code
        except Exception as e:
            print(e)
            print('Website currently not available')

1503489

200

1235289

200

In [27]:
# examine the data
with open('Data/CAT2022.TXT', 'r') as data:
    ll = data.readlines()
    # print(ll[:50])
    print(''.join(ll[:50]))

 DATE         TIME     LAT.   LONG.  DEPTH    MAGNITUDE            
                   (GMT)    (N)    (E)    (km)       (Local)
 2022 JAN  1   00 44 59.3 35.1732 25.3047    6         1.0
 2022 JAN  1   00 45 57.8 35.1077 25.2090   12         1.2
 2022 JAN  1   01 10 41.6 38.3437 22.0345    9         0.9
 2022 JAN  1   02 42 36.9 37.8625 23.0164   12         1.9
 2022 JAN  1   02 49 01.2 36.6385 21.4590   12         2.2
 2022 JAN  1   03 28 57.7 36.4517 21.7960    6         1.9
 2022 JAN  1   03 42 03.9 37.3114 20.5389   12         2.0
 2022 JAN  1   03 47 45.9 35.1787 25.3276    5         0.9
 2022 JAN  1   03 52 10.4 39.4972 20.6703    9         2.2
 2022 JAN  1   03 52 47.6 35.6168 27.2562   14         2.8
 2022 JAN  1   04 14 17.4 35.1448 25.3157   13         1.6
 2022 JAN  1   04 30 14.9 35.1759 25.3139    7         2.4
 2022 JAN  1   04 54 21.8 38.3611 22.2972   12         1.7
 2022 JAN  1   05 22 49.7 38.2814 23.3693   14         1.1
 2022 JAN  1   05 22 56.3 35.1494 25.2534   1

In [None]:
# the text file of earthquakes contains 6 columns separated by space
# cleaning will require removing the empty rows and distinguishing the columns as not every space delinates one column

In [7]:
def to_date(line):
    date_re = re.compile(r'(\d{4}\s+[A-Z]{3}\s+\d{1,2})')
    dates =  date_re.findall(line)
    return [datetime.strptime(d, '%Y %b %d').strftime('%d/%m/%Y') for d in dates]

In [8]:
def to_time(line):
    time_re = re.compile(r'\d{2}\s{1}\d{2}\s{1}[\d.]+')
    times = time_re.findall(line)
    return [datetime.strptime(t, '%H %M %S.%f').strftime('%H:%M:%S.%f') for t in times]

In [9]:
def to_latitude(line):
    lat_re = re.compile(r'([\d.]+)(?=\s[\d.]+\s{2,4})', flags = re.MULTILINE)
    lats = lat_re.findall(line)
    return [float(lat) for lat in lats]

In [10]:
def to_longitude(line):
    lon_re = re.compile(r'([\d.]+)(?=\s{2,}\d+\s{2,}[\d.]+$)', flags = re.MULTILINE)
    lons = lon_re.findall(line)
    return [float(lon) for lon in lons]

In [11]:
def to_depth(line):
    depth_re = re.compile(r'(\d+)(?=\s{5,}[\d.])')
    depths = depth_re.findall(line)
    return [int(d) for d in depths]


In [12]:
def to_magnitude(line):
    magn_re = re.compile(r'([\d.]+)$', flags = re.MULTILINE)
    magns = magn_re.findall(line)
    return [float(m) for m in magns]

In [22]:
def text2df(filename):
    '''
    '''
    with open(filename, 'r', encoding='UTF-8') as f:
        lines = '\n'.join(f.readlines())
        return pd.DataFrame({'DATE': to_date(lines),
                               'TIME(GMT)': to_time(lines),
                               'LAT (N)': to_latitude(lines),
                               'LONG (E)': to_longitude(lines),
                               'DEPTH(km)': to_depth(lines),
                               'MAGNITUDE(Local)': to_magnitude(lines)})

In [28]:
# from text lines to pandas (Geo)DataFrame...
eques_df_2021 = text2df('Data/CAT2021.TXT')
eques_df_2022 = text2df('Data/CAT2022.TXT')

In [29]:
eques_df_2021.head()
eques_df_2021.tail()
print(f"rows:{eques_df_2021.shape[0]}, columns:{eques_df_2021.shape[1]}")

Unnamed: 0,DATE,TIME(GMT),LAT (N),LONG (E),DEPTH(km),MAGNITUDE(Local)
0,01/01/2021,00:38:24.300000,38.3894,21.9832,8,1.2
1,01/01/2021,00:57:47.900000,38.3748,22.029,8,0.8
2,01/01/2021,01:09:25.700000,38.3693,22.029,7,1.6
3,01/01/2021,01:28:31.000000,40.4311,21.1546,17,2.1
4,01/01/2021,01:53:23.600000,34.9503,24.3283,45,1.9


Unnamed: 0,DATE,TIME(GMT),LAT (N),LONG (E),DEPTH(km),MAGNITUDE(Local)
25051,31/12/2021,22:55:36.600000,36.4554,27.1024,16,2.8
25052,31/12/2021,23:03:08.800000,38.3492,21.8285,8,0.9
25053,31/12/2021,23:31:57.500000,38.7584,23.3235,11,1.7
25054,31/12/2021,23:36:13.500000,35.0807,25.1921,12,1.7
25055,31/12/2021,23:36:34.000000,35.088,25.2168,13,1.9


rows:25056, columns:6


In [32]:
eques_df_2022.head()
eques_df_2022.tail()
print(f"rows:{eques_df_2021.shape[0]}, columns:{eques_df_2022.shape[1]}")

Unnamed: 0,DATE,TIME(GMT),LAT (N),LONG (E),DEPTH(km),MAGNITUDE(Local)
0,01/01/2022,00:44:59.300000,35.1732,25.3047,6,1.0
1,01/01/2022,00:45:57.800000,35.1077,25.209,12,1.2
2,01/01/2022,01:10:41.600000,38.3437,22.0345,9,0.9
3,01/01/2022,02:42:36.900000,37.8625,23.0164,12,1.9
4,01/01/2022,02:49:01.200000,36.6385,21.459,12,2.2


Unnamed: 0,DATE,TIME(GMT),LAT (N),LONG (E),DEPTH(km),MAGNITUDE(Local)
20581,31/12/2022,23:14:29.100000,38.6339,23.5561,18,1.0
20582,31/12/2022,23:16:03.500000,41.2459,23.7309,12,1.4
20583,31/12/2022,23:16:06.100000,39.689,20.6149,7,1.5
20584,31/12/2022,23:54:03.500000,41.7549,23.4352,11,1.4
20585,31/12/2022,23:57:32.700000,38.2736,22.1365,13,0.7


rows:25056, columns:6


In [31]:
# combine earthquakes data frames
eques_df = pd.concat([eques_df_2021, eques_df_2022], ignore_index = True)
eques_df.shape

(45642, 6)