# Overview
The main goal of this notebook is to expand the training dataset through data augmentation.

In [27]:
import geopandas as gpd
import pandas as pd
import numpy as np
import json
import h3
import folium
import osmnx as ox
from shapely import wkt
from folium.plugins import HeatMap
from shapely.geometry import Polygon
from shapely.geometry import Point
from shapely.geometry import LineString
from shapely.geometry import MultiPolygon
import os
from area import area
from pyproj import Geod

In [28]:
MOSCOW = "Moscow"  # place name passed to the OSM queries
RES1 = 8 # the H3 resolution of the hypothetical hexagons to build for each given point
RES2 = 9 # one resolution step finer than RES1: these hexagons serve as building blocks for the extrapolated hexagons

## General Functions

In [29]:
# columns that get_data returns for every sampled OSM feature
def_cols = ['h3', 'lat', 'lng']

# given a geometry, return its h3 index at resolution `res` together with lat and lng
def get_h3_geo(g, res):
    """Return ``(h3_index, lat, lng)`` for a geometry.

    A ``Point`` is used as-is; any other geometry is represented by its
    centroid. ``res`` is the H3 resolution passed to ``h3.geo_to_h3``.
    """
    anchor = g if isinstance(g, Point) else g.centroid
    lat, lng = anchor.y, anchor.x
    return h3.geo_to_h3(lat=lat, lng=lng, resolution=res), lat, lng

# function to apply row-wise on a dataframe
def get_h3(row, res):
    """Add 'h3', 'lat' and 'lng' entries to ``row`` from its 'geometry' entry.

    The row is modified in place and also returned, so the function can be
    used with ``DataFrame.apply(..., axis=1)``.
    """
    cell, lat, lng = get_h3_geo(row['geometry'], res)
    row['h3'] = cell
    row['lat'] = lat
    row['lng'] = lng
    return row

def osm_query(city, tag):
    """Fetch the OSM features matching ``tag`` inside the place ``city``.

    Parameters
    ----------
    city : place name understood by OSMnx.
    tag : dict of OSM tags used as the filter (e.g. ``{"amenity": "school"}``).

    Returns
    -------
    The OSMnx result with its index reset, so rows are labelled 0..n-1.
    The shape of the result is printed as a progress indication.
    """
    # `geometries_from_place` was renamed to `features_from_place` in newer
    # OSMnx releases and the old name is no longer available in OSMnx 2.x;
    # use the new name when present and fall back to the old one otherwise.
    query_place = getattr(ox, "features_from_place", None) or ox.geometries_from_place
    gdf = query_place(city, tag).reset_index()
    print(gdf.shape)
    return gdf

In [30]:
import random
# seed Python's `random` module, which this notebook uses for sampling
random.seed(0)
def get_data(tag, n, res):
    """Sample OSM features for ``tag`` in Moscow and index them with H3.

    Parameters
    ----------
    tag : dict of OSM tags forwarded to ``osm_query``.
    n : number of draws. Draws are made WITH replacement and then
        de-duplicated, so the result can contain fewer than ``n`` rows.
    res : H3 resolution used for the 'h3' column.

    Returns
    -------
    DataFrame with the columns listed in ``def_cols`` ('h3', 'lat', 'lng'),
    one row per distinct sampled feature. The number of distinct sampled
    rows is printed.
    """
    g = osm_query(MOSCOW, tag)

    size = len(g)
    # random.choices raises IndexError on an empty population, so only draw
    # when there is something to draw from
    indices = list(set(random.choices(range(size), k=n))) if size else []
    print(len(indices))

    if not indices:
        return pd.DataFrame(columns=def_cols)

    def get_data_get_h3(row):
        return get_h3(row, res)

    # Index only the sampled rows: a row-wise apply over the full query result
    # (tens of thousands of rows for some tags) is wasted work because every
    # row outside the sample is discarded right afterwards.
    return g.loc[indices].apply(get_data_get_h3, axis=1).loc[:, def_cols]

In [31]:
# OSM tag filters for the points we want to extract, grouped by theme;
# every filter is a one-entry {key: value} dict
tags_education = [{"amenity": kind} for kind in ("college", "school", "university")]
tags_parking = [{"amenity": "parking"}]
tags_bus = [{"highway": "bus_stop"}]
tags_financial = [{"amenity": kind} for kind in ("atm", "bank", "bureau_de_change")]
acc_tags = [{'building': kind} for kind in ('apartments', 'hotel', 'house')]
commercial_tags = [{'building': kind} for kind in ('commercial', 'retail', 'supermarket')]  # excludes kiosks
health_care_tags = [{"amenity": kind} for kind in ("hospital", "clinic", "pharmacy")]
enter_tags = [{"amenity": kind} for kind in ("nightclub", "cinema", "community_centre")]
sus_tags = [
    {"amenity": kind}
    for kind in ("bar", "cafe", "fast_food", "food_court", "pub", "restaurant")
]
highway_tags = [
    {"highway": kind}
    for kind in ("primary", "secondary", "tertiary", "residential", "pedestrian")
]

religious_tags = [{"building": "church"}]
parks_tags = [{"boundary": kind} for kind in ("national_park", "protected_area")]
air_tags = [{"aeroway": kind} for kind in ("aerodrome", "apron", "hangar")]

In [32]:
# Flat list of every tag filter used for augmentation, in query order.
# Each (key, values) group below expands to one {key: value} dict per value.
all_tags = [
    {key: value}
    for key, values in (
        ("amenity", ("college", "school", "university", "parking")),
        ("highway", ("bus_stop",)),
        ("amenity", ("atm", "bank", "bureau_de_change")),
        ("building", ("apartments", "hotel", "house",
                      "commercial", "retail", "supermarket")),
        ("amenity", ("hospital", "clinic", "pharmacy",
                     "nightclub", "cinema", "community_centre",
                     "bar", "cafe", "fast_food", "food_court", "pub", "restaurant")),
        ("highway", ("primary", "secondary", "tertiary", "residential", "pedestrian")),
        ("building", ("church",)),
        ("boundary", ("national_park", "protected_area")),
        ("aeroway", ("hangar",)),
        ("amenity", ("social_facility", "nursing_home", "grave_yard")),
    )
    for value in values
]

In [33]:
# add_tags = [{"amenity": "social_facility"}, {"amenity": "nursing_home"}, {"amenity": "grave_yard"}]

In [34]:
# d = get_data(tags_education[0], 500, RES2)
# d_h3_9 = set(d['h3'].values)

In [35]:
# data_9_h3 = set(data_9['h3'].values)

# print(len(data_9_h3.intersection(d_h3_9)))

In [36]:
# training features stored per H3 cell of resolution RES2, indexed by cell id
data_9 = (
    pd.read_excel(os.path.join("osm_features", f"training_data_{RES2}.xlsx"))
    .set_index('h3')
)
# the same cell ids as a set, for fast membership tests
h3_9_set = set(data_9.index)

In [37]:
def get_hexagone_datapoint(row):
    """Build one augmented feature row centred on the cell ``row['h3']``.

    The features of the cell and of its immediate neighbours
    (``h3.k_ring`` with distance 1) are looked up in ``data_9`` and summed
    column by column; together these cells form the extrapolated hexagon.

    Parameters
    ----------
    row : mapping with the keys 'h3', 'lat' and 'lng'.

    Returns
    -------
    A one-row DataFrame (index 0) holding the summed feature columns plus the
    'lat' and 'lng' of ``row``, or ``None`` when none of the cells is present
    in ``data_9``.
    """
    # the cell itself plus every cell at grid distance 1
    neighbors_h3s = h3.k_ring(row['h3'], 1)
    common_indices = set(neighbors_h3s).intersection(h3_9_set)
    if not common_indices:
        return None

    # Select the handful of matching rows first and only then drop the
    # coordinate columns; dropping them from the whole of data_9 on every
    # call copies the full frame each time.
    row_data = data_9.loc[list(common_indices), :].drop(columns=['lat', 'lng'])
    col_dict = {col: row_data[col].sum() for col in row_data.columns}

    df = pd.DataFrame(col_dict, index=[0])
    # add the coordinates of the sampled point, not of the neighbouring cells
    df['lat'] = row['lat']
    df['lng'] = row['lng']
    return df

In [38]:
# test = get_data(tags_education[0], 10, RES2)

In [39]:
# test_feat = pd.DataFrame(data=[], columns=data_9.columns)
# for id, row in test.iterrows():
#     new_row = get_hexagone_datapoint(row)
#     test_feat = pd.concat([test_feat, new_row])


In [40]:
def osm_augmented_tag(tag, n=300):
    """Create augmented feature rows for the OSM features matching ``tag``.

    Parameters
    ----------
    tag : dict of OSM tags forwarded to ``get_data``.
    n : number of sampling draws forwarded to ``get_data`` (default 300,
        the value that used to be hard-coded).

    Returns
    -------
    DataFrame with one row per sampled point for which
    ``get_hexagone_datapoint`` produced data. The columns of ``data_9`` come
    first; the frame is empty when no point produced data.
    """
    tag_data = get_data(tag, n, RES2)

    # The empty template goes first so the result always carries the columns
    # of data_9, even when no row is produced.
    frames = [pd.DataFrame(data=[], columns=data_9.columns)]
    for _, row in tag_data.iterrows():
        new_row = get_hexagone_datapoint(row)
        if new_row is not None:
            frames.append(new_row)

    # a single concat instead of growing the frame inside the loop (quadratic)
    return pd.concat(frames)

def osm_augmented_all(all_tags):
    """Create augmented feature rows for every tag filter in ``all_tags``.

    Each tag is printed before it is processed, as a progress indication.

    Parameters
    ----------
    all_tags : iterable of OSM tag dicts, each forwarded to
        ``osm_augmented_tag``.

    Returns
    -------
    DataFrame stacking the per-tag results in order, with a fresh 0..n-1
    index. The columns of ``data_9`` come first.
    """
    # The empty template goes first so the result always carries the columns
    # of data_9, even for an empty tag list.
    frames = [pd.DataFrame(data=[], columns=data_9.columns)]
    for tag in all_tags:
        print(tag)
        frames.append(osm_augmented_tag(tag))

    # a single concat instead of re-copying the accumulated frame per tag
    return pd.concat(frames, ignore_index=True)

In [41]:
# additional_aug_data = osm_augmented_all(add_tags)

In [43]:
# build the augmented feature table for every tag in all_tags (one OSM query per tag)
augmented_osm_data = osm_augmented_all(all_tags)

{'amenity': 'college'}
(281, 88)
183
{'amenity': 'school'}
(1949, 152)
281
{'amenity': 'university'}
(380, 194)
216
{'amenity': 'parking'}
(15069, 139)
299
{'highway': 'bus_stop'}
(9539, 115)
292
{'amenity': 'atm'}
(2144, 91)
283
{'amenity': 'bank'}
(2047, 118)
284
{'amenity': 'bureau_de_change'}
(129, 39)
113
{'building': 'apartments'}
(30283, 168)
297
{'building': 'hotel'}
(121, 108)
110
{'building': 'house'}
(3040, 71)
284
{'building': 'commercial'}
(1864, 179)
278
{'building': 'retail'}
(2072, 205)
271
{'building': 'supermarket'}
(19, 36)
19
{'amenity': 'hospital'}
(264, 100)
185
{'amenity': 'clinic'}
(1361, 158)
269
{'amenity': 'pharmacy'}
(3483, 116)
287
{'amenity': 'nightclub'}
(90, 43)
86
{'amenity': 'cinema'}
(128, 76)
111
{'amenity': 'community_centre'}
(323, 123)
198
{'amenity': 'bar'}
(674, 93)
259
{'amenity': 'cafe'}
(3743, 189)
286
{'amenity': 'fast_food'}
(3222, 186)
290
{'amenity': 'food_court'}
(59, 43)
59
{'amenity': 'pub'}
(329, 81)
199
{'amenity': 'restaurant'}
(246

In [44]:
# persist the augmented OSM features to disk
augmented_osm_data.to_excel(os.path.join('osm_features', 'augmented_data_8.xlsx'))

## Add Population and Metro stations data

In [45]:
# row-wise helper used to set the h3 column of augmented_osm_data to resolution 8 (RES1)
def set_h3_to_8(row):
    """Overwrite ``row['h3']`` with the H3 index of (lat, lng) at resolution RES1."""
    lat, lng = row['lat'], row['lng']
    row['h3'] = h3.geo_to_h3(lat=lat, lng=lng, resolution=RES1)
    return row

def set_h3_to_9(row):
    """Overwrite ``row['h3']`` with the H3 index of (lat, lng) at resolution RES2."""
    # RES2 (not RES1) is the resolution wanted here
    cell = h3.geo_to_h3(lat=row['lat'], lng=row['lng'], resolution=RES2)
    row['h3'] = cell
    return row
    

# (re)compute the h3 column at resolution RES1 for every augmented row
augmented_osm_data = augmented_osm_data.apply(set_h3_to_8, axis=1)

In [46]:
# load the population and metro-passenger figures; this file is keyed by h3 cells of resolution 8
pop_metro = pd.read_excel("training_data_y.xlsx")[['h3', 'TotalPassengers', 'population']]

In [47]:
# The h3 column of augmented_osm_data and the h3 column of pop_metro are both
# at resolution 8, so they can be joined directly. Rows without a match keep
# their OSM features and get 0 for the population / passenger columns.
augmented_data = (
    augmented_osm_data
    .merge(pop_metro, on='h3', how='left')
    # inspect .isna().sum() at this point to see how many rows found no match
    .drop(columns='h3')
    .fillna(0)
)

In [48]:
# keep a single row (the first one) for every (lat, lng) pair
augmented_data = augmented_data.drop_duplicates(subset=['lat', 'lng'])

In [49]:
# save the augmented features; the target column y is added further below
augmented_data.to_excel("augmented_no_y.xlsx")

## Adding the target variable

In [50]:
# first import the file with the count of postamats in each hexagon of resolution 9
post_count_9 = pd.read_excel('postmats_count_9.xlsx').set_index('h3')

# the h3 column was dropped after the population/metro merge: recompute it, this time at resolution RES2
augmented_data = augmented_data.apply(set_h3_to_9, axis=1)

In [51]:
# h3 cells present in the augmented data and in the postamat counts
aug_h3_9 = set(augmented_data['h3'])
pos_h3_9 = set(post_count_9.index)

# number of cells the two sets have in common
print(len(pos_h3_9 & aug_h3_9))
def posts_count_datapoint(row):
    """Set ``row['y']`` to the postamat count around the cell ``row['h3']``.

    The count is the sum of the 'y' values stored in ``post_count_9`` for the
    cell and its immediate neighbours (``h3.k_ring`` with distance 1). Cells
    that are absent from ``post_count_9`` contribute nothing.

    Parameters
    ----------
    row : mapping with an 'h3' key; it is modified in place.

    Returns
    -------
    The same ``row`` with the additional 'y' entry.

    Raises
    ------
    KeyError
        If ``post_count_9`` has no 'y' column and a neighbouring cell is
        present in its index.
    """
    neighbors_h3 = h3.k_ring(row['h3'], 1)
    known_cells = post_count_9.index
    # Test membership explicitly instead of catching KeyError: a blanket
    # `except KeyError` also hides a missing 'y' column and would silently
    # produce a target of 0 for every row.
    row['y'] = sum(
        post_count_9.loc[n, 'y'] for n in neighbors_h3 if n in known_cells
    )
    return row

1246


In [52]:
# attach the target column y to every augmented row
augmented_data = augmented_data.apply(posts_count_datapoint, axis=1)
# number of augmented rows with a positive target
print((augmented_data['y'] > 0).sum())
augmented_data.to_excel('augmented_data_y.xlsx')

6226


In [62]:
# augmented_data['y'].value_counts()
# load the original (non-augmented) training data, replacing missing values with 0
original_training_data = pd.read_excel('training_data_y.xlsx').fillna(0)

In [63]:
# rows with a positive target (y > 0) in the original training data
print((original_training_data['y'] > 0).sum())

955


In [64]:
# rows with a non-positive target (y <= 0) in the original training data
print((original_training_data['y'] <= 0).sum())

1120


In [74]:
# stack the original and the augmented rows into one frame with a fresh 0..n-1 index
f_t_d = pd.concat([original_training_data, augmented_data], ignore_index=True)  
# pd.read_excel('final_training_data_y')

In [75]:
# size of the combined frame and number of rows with a positive target
print(f_t_d.shape)
print((f_t_d['y'] > 0).sum())

(9635, 38)
7181


In [76]:
# rows with a non-positive target (y <= 0)
df_neg = f_t_d[f_t_d['y'] <= 0]
print(df_neg.shape)

(2454, 38)


In [77]:
# Down-sample the positive class (y > 0) to at most 3600 rows.
# The sample must be drawn from the index of the positive rows only: drawing
# from the index of the whole frame puts negative rows into df_pos, and those
# rows are then duplicated when df_pos is concatenated with df_neg.
df_pos = f_t_d[f_t_d['y'] > 0]
# random.sample raises ValueError when asked for more items than exist, so cap the size
random_indices = random.sample(list(df_pos.index), min(3600, len(df_pos)))
df_pos = df_pos.loc[random_indices, :]

In [78]:
# final dataset: the sampled df_pos rows followed by the df_neg rows, re-indexed 0..n-1
f_t_d = pd.concat([df_pos, df_neg], ignore_index=True)#.drop(columns=['h3'])

In [79]:
# write the final training set to disk
f_t_d.to_excel('final_training_dataset.xlsx')