In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os

# Input and output folders, resolved against the directory the notebook is launched from.
DATA_DIR = os.path.join(os.getcwd(), 'data')
PREPROCESSED_DATA_DIR = os.path.join(os.getcwd(), 'preprocessed_data')

# The export cells write into PREPROCESSED_DATA_DIR, and pandas does not create
# missing parent folders, so make sure the output folder exists before any export.
os.makedirs(PREPROCESSED_DATA_DIR, exist_ok=True)

## Region

In [3]:
# Load the raw region table, then report its dimensions and preview the first rows.
region_csv_path = os.path.join(DATA_DIR, 'region.csv')
region_gdf = gpd.read_file(region_csv_path)
print(region_gdf.shape)
region_gdf.head()

(12, 9)


Unnamed: 0,Geo Point,Geo Shape,Year,Official Code Country,Official Name Country,Official Code Region,Official Name Region,Iso 3166-3 Area Code,Type
0,"56.83715295957758, -4.181586568158648","{""coordinates"": [[[[-5.113774648244052, 55.256...",2022,S92000003,Scotland,S92000003,Scotland,GBR,region
1,"51.28089219358066, -0.5340635597193183","{""coordinates"": [[[[-1.311992216637758, 50.767...",2022,E92000001,England,E12000008,South East,GBR,region
2,"54.60835687629359, -6.683128305224262","{""coordinates"": [[[[-5.534856755313333, 54.681...",2022,N92000002,Northern Ireland,N92000002,Northern Ireland,GBR,region
3,"51.00117456613325, -3.1306561466497556","{""coordinates"": [[[[-6.349065489364785, 49.898...",2022,E92000001,England,E12000009,South West,GBR,region
4,"52.92698963069219, -0.8056146554420468","{""coordinates"": [[[[0.267432090574174, 52.8131...",2022,E92000001,England,E12000004,East Midlands,GBR,region


In [4]:
# Keep only the columns needed downstream. Selecting them directly (instead of
# computing the complement and dropping it) raises a KeyError if the source
# file is missing one of the expected columns.
preserve_cols = ['Geo Point', 'Geo Shape', 'Official Code Country', 'Official Name Country', 'Official Code Region', 'Official Name Region']

# Rename to short snake_case names; rename() returns a new frame, so no
# in-place mutation is needed.
region_gdf = region_gdf[preserve_cols].rename(columns={
    "Geo Point": "point",
    "Geo Shape": "shape",
    "Official Code Country": "country_code",
    "Official Name Country": "country_name",
    "Official Code Region": "region_code",
    "Official Name Region": "region_name",
})
region_gdf.head()

Unnamed: 0,point,shape,country_code,country_name,region_code,region_name
0,"56.83715295957758, -4.181586568158648","{""coordinates"": [[[[-5.113774648244052, 55.256...",S92000003,Scotland,S92000003,Scotland
1,"51.28089219358066, -0.5340635597193183","{""coordinates"": [[[[-1.311992216637758, 50.767...",E92000001,England,E12000008,South East
2,"54.60835687629359, -6.683128305224262","{""coordinates"": [[[[-5.534856755313333, 54.681...",N92000002,Northern Ireland,N92000002,Northern Ireland
3,"51.00117456613325, -3.1306561466497556","{""coordinates"": [[[[-6.349065489364785, 49.898...",E92000001,England,E12000009,South West
4,"52.92698963069219, -0.8056146554420468","{""coordinates"": [[[[0.267432090574174, 52.8131...",E92000001,England,E12000004,East Midlands


In [5]:
# Cast every code/name column to pandas' dedicated string dtype, then show the result.
for text_col in ('country_code', 'country_name', 'region_code', 'region_name'):
    region_gdf[text_col] = region_gdf[text_col].astype('string')

region_gdf.dtypes

point                   object
shape                   object
country_code    string[python]
country_name    string[python]
region_code     string[python]
region_name     string[python]
dtype: object

In [6]:
# Write the cleaned region table twice: as CSV without the index column and
# as indented JSON in "records" orientation.
region_csv_out = os.path.join(PREPROCESSED_DATA_DIR, 'region.csv')
region_json_out = os.path.join(PREPROCESSED_DATA_DIR, "region.json")
region_gdf.to_csv(region_csv_out, index=False)
region_gdf.to_json(region_json_out, indent=2, orient="records")

## District

In [7]:
# Load the raw district table and preview the first rows.
district_csv_path = os.path.join(DATA_DIR, 'district.csv')
district_gdf = gpd.read_file(district_csv_path)
district_gdf.head()

Unnamed: 0,Geo Point,Geo Shape,Year,Official Code Country,Official Name Country,Official Code Region,Official Name Region,Official Code County/Unitary district,Official Name County/Unitary district,Official Code Local authority district,Official Name Local authority district,Iso 3166-3 Area Code,Type
0,"52.10369321305276, -2.745146096225513","{""coordinates"": [[[-2.618037526843188, 52.3069...",2022,E92000001,England,E12000005,West Midlands,E06000019,"Herefordshire,County of",E06000019,"Herefordshire,County of",GBR,unitary authority
1,"56.47799448020245, -2.970453045191277","{""coordinates"": [[[-2.838647896439604, 56.4740...",2022,S92000003,Scotland,S92000003,Scotland,S12000042,Dundee City,S12000042,Dundee City,GBR,council area
2,"52.62558440059879, -2.176661874919075","{""coordinates"": [[[-2.026807116830768, 52.7363...",2022,E92000001,England,E12000005,West Midlands,E10000028,Staffordshire,E07000196,South Staffordshire,GBR,non-metropolitan district
3,"51.77205779662589, -0.3424608100426959","{""coordinates"": [[[-0.273559738557496, 51.8353...",2022,E92000001,England,E12000006,East of England,E10000015,Hertfordshire,E07000240,St Albans,GBR,non-metropolitan district
4,"51.93468869093822, 0.27970400537559986","{""coordinates"": [[[0.389665590516818, 52.03645...",2022,E92000001,England,E12000006,East of England,E10000012,Essex,E07000077,Uttlesford,GBR,non-metropolitan district


In [None]:
# Keep only the columns needed downstream. Selecting them directly (instead of
# computing the complement and dropping it) raises a KeyError if the source
# file is missing one of the expected columns.
preserve_cols = ['Geo Point', 'Geo Shape', 'Official Code Region', 'Official Code Local authority district', 'Official Name Local authority district', 'Type']

# Rename to short snake_case names; rename() returns a new frame, so no
# in-place mutation is needed.
district_gdf = district_gdf[preserve_cols].rename(columns={
    "Geo Point": "point",
    "Geo Shape": "shape",
    "Official Code Region": "region_code",
    "Official Code Local authority district": "district_code",
    "Official Name Local authority district": "district_name",
    "Type": "district_type",
})
district_gdf.head()

Unnamed: 0,point,shape,region_code,district_code,district_name,district_type
0,"52.10369321305276, -2.745146096225513","{""coordinates"": [[[-2.618037526843188, 52.3069...",E12000005,E06000019,"Herefordshire,County of",unitary authority
1,"56.47799448020245, -2.970453045191277","{""coordinates"": [[[-2.838647896439604, 56.4740...",S92000003,S12000042,Dundee City,council area
2,"52.62558440059879, -2.176661874919075","{""coordinates"": [[[-2.026807116830768, 52.7363...",E12000005,E07000196,South Staffordshire,non-metropolitan district
3,"51.77205779662589, -0.3424608100426959","{""coordinates"": [[[-0.273559738557496, 51.8353...",E12000006,E07000240,St Albans,non-metropolitan district
4,"51.93468869093822, 0.27970400537559986","{""coordinates"": [[[0.389665590516818, 52.03645...",E12000006,E07000077,Uttlesford,non-metropolitan district


In [9]:
# Cast every code/name/type column to pandas' dedicated string dtype, then show the result.
for text_col in ('region_code', 'district_code', 'district_name', 'district_type'):
    district_gdf[text_col] = district_gdf[text_col].astype('string')

district_gdf.dtypes

point                    object
shape                    object
region_code      string[python]
district_code    string[python]
district_name    string[python]
district_type    string[python]
dtype: object

In [10]:
# Write the cleaned district table twice: as CSV without the index column and
# as indented JSON in "records" orientation.
district_csv_out = os.path.join(PREPROCESSED_DATA_DIR, 'district.csv')
district_json_out = os.path.join(PREPROCESSED_DATA_DIR, "district.json")
district_gdf.to_csv(district_csv_out, index=False)
district_gdf.to_json(district_json_out, indent=2, orient="records")

## Road

In [4]:
# Load the raw road table and preview the first rows.
road_csv_path = os.path.join(DATA_DIR, 'road.csv')
road_gdf = gpd.read_file(road_csv_path)
road_gdf.head()

Unnamed: 0,Geo Point,Geo Shape,FCsubtype,inspireId,beginLifes,F_CODE,ICC,EXS,LLE,LTN,...,NLN2,RST,RSU,RTE,RTN,RTT,TEN,TOL,TUC,Shape_Leng
0,"51.34516449999985, 1.315162999999799","{""coordinates"": [[1.314668499999811, 51.343289...",1,_EG.EGM.RoadL:a79578a6-daf4-4976-842a-fc91f51b...,2015-10-26,AP030,GB,28,1,-29999,...,N_A,1,1,N_A,B2048,15,2,1,7,0.003878223949999
1,"51.48927235975003, -0.105407526584104","{""coordinates"": [[-0.10995250000019, 51.485084...",1,_EG.EGM.RoadL:b12f2cc9-8717-49a9-bf72-27c09b3f...,2015-10-26,AP030,GB,28,1,-29999,...,N_A,1,1,N_A,A3,14,2,1,7,0.012361714849999
2,"51.62412912628692, -0.016065054325416002","{""coordinates"": [[-0.018420000000191002, 51.61...",1,_EG.EGM.RoadL:5b77c1d3-d155-42e1-85a8-57013269...,2015-10-26,AP030,GB,28,1,-29999,...,N_A,1,1,N_A,A112,14,2,1,7,0.01844667143
3,"51.71837558549718, -2.539171670812823","{""coordinates"": [[-2.5336670000002073, 51.7191...",1,_EG.EGM.RoadL:df95f691-553f-45b1-b289-29fad022...,2015-10-26,AP030,GB,28,1,-29999,...,N_A,1,1,N_A,A48(T),14,2,1,7,0.011130961869999
4,"53.42508095134617, -1.363785815449348","{""coordinates"": [[-1.364227000000199, 53.42567...",1,_EG.EGM.RoadL:7792b1bc-d4c0-43fa-ab69-bf620ada...,2015-10-26,AP030,GB,28,1,-29999,...,N_A,1,1,N_A,A630,14,2,1,7,0.001479113029999


In [11]:
# Keep only the columns needed downstream. Selecting them directly (instead of
# computing the complement and dropping it) raises a KeyError if the source
# file is missing one of the expected columns.
preserve_cols = ['Geo Point', 'Geo Shape', 'Shape_Leng']

# Rename to short snake_case names; rename() returns a new frame, so no
# in-place mutation is needed.
road_gdf = road_gdf[preserve_cols].rename(columns={
    "Geo Point": "point",
    "Geo Shape": "line",
    "Shape_Leng": "length",
})

# The selected columns carry no identifier, so assign a 1-based sequential id
# in the current row order.
road_gdf['id'] = range(1, len(road_gdf) + 1)
road_gdf.head()

Unnamed: 0,point,line,length,id
0,"51.34516449999985, 1.315162999999799","{""coordinates"": [[1.314668499999811, 51.343289...",0.003878223949999,1
1,"51.48927235975003, -0.105407526584104","{""coordinates"": [[-0.10995250000019, 51.485084...",0.012361714849999,2
2,"51.62412912628692, -0.016065054325416002","{""coordinates"": [[-0.018420000000191002, 51.61...",0.01844667143,3
3,"51.71837558549718, -2.539171670812823","{""coordinates"": [[-2.5336670000002073, 51.7191...",0.011130961869999,4
4,"53.42508095134617, -1.363785815449348","{""coordinates"": [[-1.364227000000199, 53.42567...",0.001479113029999,5


In [12]:
# Convert both columns in one call: length becomes a float, id an integer.
road_gdf = road_gdf.astype({'length': float, 'id': int})

road_gdf.dtypes

point      object
line       object
length    float64
id          int32
dtype: object

In [13]:
# Write the cleaned road table twice: as CSV without the index column and
# as indented JSON in "records" orientation.
road_csv_out = os.path.join(PREPROCESSED_DATA_DIR, 'road.csv')
road_json_out = os.path.join(PREPROCESSED_DATA_DIR, "road.json")
road_gdf.to_csv(road_csv_out, index=False)
road_gdf.to_json(road_json_out, indent=2, orient="records")

## Earthquake

In [111]:
# Load the raw earthquake table and preview the first rows.
earthquake_csv_path = os.path.join(DATA_DIR, 'earthquake.csv')
earthquake_gdf = gpd.read_file(earthquake_csv_path)
earthquake_gdf.head()

Unnamed: 0,yyyy-mm-dd,hh:mm:ss.ss,lat,lon,depth,ML,Nsta,RMS,intensity,induced,locality
0,2014-11-22,21:14:39.1,54.267,-2.486,7.1,0.9,5,0.5,,,SEDBERGH
1,2014-11-25,06:38:05.9,56.4,-4.817,7.8,0.8,5,0.2,,,INVERLOCHY
2,2014-11-25,06:38:32.2,56.411,-4.825,7.7,0.6,3,0.4,,,INVERLOCHY
3,2014-12-01,03:56:32.1,53.855,-3.764,4.3,0.9,7,0.1,,,IRISH SEA
4,2014-12-03,21:57:05.2,55.802,-3.187,6.4,2.0,12,0.2,3.0,,PENICUIK


In [112]:
# Drop unnecessary columns
cols_to_drop = ['intensity', 'induced']

# Rename the remaining raw headers to descriptive snake_case names. Only
# columns that exist in the earthquake table are listed: the previous
# "Shape_Leng" entry was a leftover from the road section and matched no
# column in the preview of this table (rename ignores unknown keys by default).
earthquake_gdf = earthquake_gdf.drop(columns=cols_to_drop).rename(columns={
    "yyyy-mm-dd": "date",
    "hh:mm:ss.ss": "time",
    'ML': 'magnitude_range',
    'Nsta': "n_station",
})

# Sequential 1-based identifier in the current row order.
earthquake_gdf['id'] = range(1, len(earthquake_gdf) + 1)
# Placeholder column; it is filled from lat/lon in the next cell.
earthquake_gdf['point'] = None
earthquake_gdf.head()

Unnamed: 0,date,time,lat,lon,depth,magnitude_range,n_station,RMS,locality,id,point
0,2014-11-22,21:14:39.1,54.267,-2.486,7.1,0.9,5,0.5,SEDBERGH,1,
1,2014-11-25,06:38:05.9,56.4,-4.817,7.8,0.8,5,0.2,INVERLOCHY,2,
2,2014-11-25,06:38:32.2,56.411,-4.825,7.7,0.6,3,0.4,INVERLOCHY,3,
3,2014-12-01,03:56:32.1,53.855,-3.764,4.3,0.9,7,0.1,IRISH SEA,4,
4,2014-12-03,21:57:05.2,55.802,-3.187,6.4,2.0,12,0.2,PENICUIK,5,


In [113]:
# Build the "lat, lon" text for every row at once with vectorised string
# operations instead of iterating row by row.
# NOTE(review): assumes lat/lon/time hold text values (the previous
# implementation called str.strip() on them) - confirm if the loader changes.
earthquake_gdf['point'] = (
    earthquake_gdf['lat'].str.strip() + ', ' + earthquake_gdf['lon'].str.strip()
)

# Keep only the part of the time before the first "." (drops fractional seconds).
earthquake_gdf['time'] = earthquake_gdf['time'].str.split('.').str[0].str.strip()

# The coordinates now live in "point", so the separate columns are redundant.
earthquake_gdf = earthquake_gdf.drop(columns=['lat', 'lon'])
earthquake_gdf['locality'] = earthquake_gdf['locality'].str.title()
earthquake_gdf.head()

Unnamed: 0,date,time,depth,magnitude_range,n_station,RMS,locality,id,point
0,2014-11-22,21:14:39,7.1,0.9,5,0.5,Sedbergh,1,"54.267, -2.486"
1,2014-11-25,06:38:05,7.8,0.8,5,0.2,Inverlochy,2,"56.400, -4.817"
2,2014-11-25,06:38:32,7.7,0.6,3,0.4,Inverlochy,3,"56.411, -4.825"
3,2014-12-01,03:56:32,4.3,0.9,7,0.1,Irish Sea,4,"53.855, -3.764"
4,2014-12-03,21:57:05,6.4,2.0,12,0.2,Penicuik,5,"55.802, -3.187"


In [116]:
# Drop rows that contain a blank value in any column.
# A value counts as blank when it is empty after stripping whitespace, so "",
# " " and longer runs of spaces are all caught. The earlier check compared
# against exactly one space, which missed the other forms - including "time"
# values that had already been stripped to "" in the previous cell.
blank_cells = earthquake_gdf.apply(lambda col: col.astype(str).str.strip().eq(""))

# .copy() gives an independent frame so later column assignments do not act on
# a slice of the unfiltered data.
earthquake_gdf = earthquake_gdf[~blank_cells.any(axis=1)].copy()

print(earthquake_gdf.shape)
earthquake_gdf.head()

(3003, 9)


Unnamed: 0,date,time,depth,magnitude_range,n_station,RMS,locality,id,point
0,2014-11-22,21:14:39,7.1,0.9,5,0.5,Sedbergh,1,"54.267, -2.486"
1,2014-11-25,06:38:05,7.8,0.8,5,0.2,Inverlochy,2,"56.400, -4.817"
2,2014-11-25,06:38:32,7.7,0.6,3,0.4,Inverlochy,3,"56.411, -4.825"
3,2014-12-01,03:56:32,4.3,0.9,7,0.1,Irish Sea,4,"53.855, -3.764"
4,2014-12-03,21:57:05,6.4,2.0,12,0.2,Penicuik,5,"55.802, -3.187"


In [117]:
# Apply the target dtype of every column in a single astype call.
earthquake_gdf = earthquake_gdf.astype({
    'date': 'string',
    'time': 'string',
    'depth': float,
    'magnitude_range': float,
    'n_station': int,
    'RMS': float,
    'locality': 'string',
    'id': int,
    'point': object,
})

# Trim surrounding whitespace from the text columns.
for text_col in ('date', 'time', 'locality'):
    earthquake_gdf[text_col] = earthquake_gdf[text_col].str.strip()

earthquake_gdf.dtypes

date               string[python]
time               string[python]
depth                     float64
magnitude_range           float64
n_station                   int32
RMS                       float64
locality           string[python]
id                          int32
point                      object
dtype: object

In [118]:
# Write the cleaned earthquake table twice: as CSV without the index column and
# as indented JSON in "records" orientation.
earthquake_csv_out = os.path.join(PREPROCESSED_DATA_DIR, 'earthquake.csv')
earthquake_json_out = os.path.join(PREPROCESSED_DATA_DIR, "earthquake.json")
earthquake_gdf.to_csv(earthquake_csv_out, index=False)
earthquake_gdf.to_json(earthquake_json_out, indent=2, orient="records")