-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_shp.py
executable file
·83 lines (73 loc) · 2.96 KB
/
import_shp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
import pandas
import re
from datetime import datetime
from geopandas import read_file
merge_datasets = [
['Dns', 'ftl'],
['NN_', 'dn_', 'SCDI']
]
df = read_file("dataset/SCDi.shp")
# reproject to EPSG:4326 for Kepler
df = df.to_crs("EPSG:4326")
# get years and dataset headlines from file
years = set()
datasets = set()
dataframes = {}
for col in df.columns:
data_year_pattern = r"([a-zA-Z_]{3,4})([0-9]{4})"
match = re.match(data_year_pattern, col)
if match:
datasets.add(match.group(1))
years.add(match.group(2))
# generate merged datasets, in addition to single datasets
datasets = list(datasets) + merge_datasets
# create geodataframes from all non-zero points, add timestamps
for dataset in datasets:
yearly_frames = []
for year in years:
print(dataset)
print(year)
columns = {dataset} if isinstance(dataset, str) else dataset
# pick selected columns from the whole set
cols = [column + year for column in columns]
# filter points that have at least one non-zero value in any of the datasets
not_null_query = ""
for col in cols:
# a single not null column is enough
if not_null_query:
not_null_query += "|"
# filter truthy points
if df[col].dtype == 'O':
not_null_query += f"(({col} != \'NA\') & ({col} != \'NANA\') & ({col}.notnull()))"
elif df[col].dtype == 'float64' or 'int64':
not_null_query += f"({col} > 0)"
data_to_add = df.query(not_null_query, engine='python')
yearly_data = data_to_add[[*cols, 'BorderCell', 'countries', 'geometry']]
# rename column to get rid of year suffix
for col, dataset_name in zip(cols, columns):
yearly_data = yearly_data.rename({col: dataset_name}, axis='columns')
# a bit stupid to have to have hours in yearly data, but here we go
timestamp = datetime(year=int(year), month=12, day=31, hour=12, minute=0)
yearly_data.insert(0, 'datetime', timestamp)
# BorderCell will have to be reformatted so Kepler detects bool values
yearly_data['BorderCell'] = yearly_data['BorderCell'].apply(
lambda value: True if value == 'Yes' else False)
yearly_data = yearly_data.set_index('datetime', append=True)
yearly_frames.append(yearly_data)
gdf = pandas.concat(yearly_frames).sort_index()
# Kepler also wants point data for heatmaps etc.
centroids = gdf['geometry'].apply(
lambda geom: geom.centroid
)
gdf['lon'] = centroids.apply(
lambda centroid: centroid.x
)
gdf['lat'] = centroids.apply(
lambda centroid: centroid.y
)
dataset_name = dataset if isinstance(dataset, str) else '_'.join(dataset)
gdf.to_file(f"frontend/public/data/{dataset_name}.json", driver="GeoJSON")
gdf.to_csv(f"frontend/public/data/{dataset_name}.csv")
dataframes[dataset_name] = gdf
print(dataframes)