# POI data processing
1. Create a PostgreSQL database called `osm_de` with the `postgis` and `hstore` extensions installed.
2. Run the command below to import the OSM extract into that database.
`osm2pgsql -d osm_de -U postgres -W -H localhost -P 5433 -S D:\nine-euro-ticket-de\src\osm2pgsql-1.9.1-x64\osm2pgsql-bin\flex-config\pois.lua -O flex D:\nine-euro-ticket-de\dbs\geo\germany-latest.osm.pbf`

In [None]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Switch the working directory; the relative 'dbs/...' paths used below are resolved from here.
# NOTE(review): machine-specific absolute path — confirm it points at the repository root.
%cd D:\d-ticket-de

In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import openai
import workers
import geopandas as gpd
from tqdm.notebook import tqdm
import sqlalchemy
import time
import numpy as np

In [5]:
# Scratch check: draw three values (with replacement) from the candidate list.
ylist, mlist = [2022, 2023], [2, 3, 4, 5]
ty = np.random.choice(np.asarray(mlist), 3)
print(ty)

[2 5 3]


In [3]:
# Data location: connection to the project's main PostgreSQL database.
# Credentials and connection settings are read from workers.keys_manager,
# so nothing secret is hard-coded in the notebook.
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
# NOTE(review): user/password are interpolated into the URL unescaped; a password containing
# characters such as '@' or '/' would need URL-encoding — confirm this is not an issue here.
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

In [4]:
# Data location for the OSM database (reuses user/password/port from the cell above).
# NOTE(review): the original comment said "OSM data of Sweden (April 18, 2024)", but the import
# command at the top of this notebook loads germany-latest.osm.pbf — confirm region and date.
db_name_osm = workers.keys_manager['osmdb']['name']
engine_osm = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name_osm}?gssencmode=disable')

## 1. Load POI data
'historic' 'tourism' 'leisure' 'sport' 'shop' 'office' 'craft' - include all.

'amenity' - exclude subclasses where people are unlikely to stay for long, e.g., recycling points.

'emergency'- exclude all.

In [46]:
# Pull every POI (id, class/subclass, name, geometry) from the OSM database.
gdf_pois = gpd.GeoDataFrame.from_postgis(
    sql="""SELECT osm_id, "class", subclass, name, geom FROM pois;""",
    con=engine_osm,
    geom_col='geom',  # same as the default; spelled out for readability
)
gdf_pois.head()

Unnamed: 0,osm_id,class,subclass,name,geom
0,9451269159,amenity,ferry_terminal,RORO5,POINT (2554819.461 8359838.918)
1,1416066420,amenity,ferry_terminal,Skandinavienkai,POINT (2337267.391 7664175.572)
2,412686570,amenity,ferry_terminal,,POINT (2353159.305 7495854.037)
3,324043489,historic,boundary_stone,25,POINT (1540117.623 6234635.781)
4,897267627,historic,boundary_stone,24,POINT (1540142.692 6234642.874)


In [47]:
# Reproject to WGS84 (EPSG:4326) and expose longitude (x) / latitude (y) as plain columns.
gdf_pois = gdf_pois.to_crs(4326)
gdf_pois.loc[:, 'x'] = gdf_pois.loc[:, 'geom'].x
gdf_pois.loc[:, 'y'] = gdf_pois.loc[:, 'geom'].y
# "lat,lon" string per POI. Zipping the two columns produces the same strings as the
# former row-wise DataFrame.apply(axis=1), without building a Series for every row.
gdf_pois.loc[:, 'coords'] = [f'{lat},{lon}' for lat, lon in zip(gdf_pois['y'], gdf_pois['x'])]

In [7]:
# Number of POIs for every (class, subclass) combination.
df_pois_tp = (
    gdf_pois
    .groupby(['class', 'subclass'])
    .size()
    .reset_index(name='count')
)
print(len(df_pois_tp))

6177


In [8]:
# List the distinct top-level POI classes present in the data.
print(gdf_pois['class'].unique())

['amenity' 'historic' 'tourism' 'leisure' 'emergency' 'sport' 'shop'
 'office' 'craft']


In [13]:
# Export the class/subclass counts, most frequent first, for the manual keep/exclude
# marking described in section 1.2.
df_pois_tp.sort_values(by=['count'], ascending=False).to_csv('dbs/poi/class_subclass.csv', index=False)

### 1.1 Check some classes

In [37]:
# Spot-check one class/subclass combination: show id, name and "lat,lon" of the matching POIs.
gdf_pois.loc[(gdf_pois['class'] == 'amenity') & (gdf_pois['subclass'] == 'proposed'), ['osm_id', 'name', 'coords']]

Unnamed: 0,osm_id,name,coords
1443840,3264405239,,"51.48904219917399,9.1474892"
1443849,3923231120,,"51.48885519917401,9.147852199999999"
3215165,3145342124,,"51.57029369916337,9.895554999999998"


In [None]:
# Scratch cell: a bare coordinate pair in the same "lat,lon" layout as the `coords` column.
# NOTE(review): not referenced elsewhere in the visible notebook — presumably kept for a
# manual map lookup; confirm or delete.
53.82281899915785,9.2869184

### 1.2 Mark each combination
keep = 1, include

keep = 0, exclude

## 2. Clean up class-subclass combinations

In [5]:
# Load the manually marked class/subclass table and keep only combinations flagged keep == 1.
df_pois_tp = pd.read_csv('dbs/poi/class_subclass_marked.csv')
# .copy() detaches the filtered frame from the full table: later cells assign new columns
# into it via .loc, which on a filtered slice triggers SettingWithCopyWarning.
df_pois_tp = df_pois_tp.loc[df_pois_tp['keep'] == 1, :].copy()
# (number of kept combinations, total POI count they cover)
len(df_pois_tp), df_pois_tp['count'].sum()

(5471, 2127451)

### 2.1 Subclass = yes
Make the subclass same as the class.

In [6]:
# A subclass of 'yes' carries no information, so fall back to the class name there.
df_pois_tp.loc[:, 'subclass'] = df_pois_tp['subclass'].where(
    df_pois_tp['subclass'] != 'yes',
    df_pois_tp['class'],
)

### 2.2 Create preliminary tags

In [7]:
# Preliminary tag: 'office' and 'craft' are collapsed to the class name,
# every other class keeps its subclass as the tag.
df_pois_tp.loc[:, 'tag'] = df_pois_tp['subclass'].where(
    ~df_pois_tp['class'].isin(['office', 'craft']),
    df_pois_tp['class'],
)

In [48]:
# Total POI count per (class, tag), ordered by tag and then count, both descending.
df_tags = (
    df_pois_tp
    .groupby(['class', 'tag'])['count']
    .sum()
    .reset_index()
    .sort_values(by=['tag', 'count'], ascending=False)
)

In [50]:
# Export the class/tag counts for the manual merge/clean/label step (section 2.3).
df_tags.to_csv('dbs/poi/class_tag.csv', index=False)

### 2.3 Merge, clean, and label tags
This is done manually outside this notebook.

In [8]:
# Load the manually edited tag table; the next cell reads its class_f, tag_f and remove columns.
df_tags = pd.read_csv('dbs/poi/class_tag_marked.csv')

In [9]:
# Attach the cleaned class/tag names (class_f, tag_f), keep only rows whose `remove`
# marker is empty, then discard the marker column.
df_pois_tp = (
    pd.merge(
        df_pois_tp,
        df_tags[['class', 'tag', 'class_f', 'tag_f', 'remove']],
        on=['class', 'tag'],
        how='left',
    )
    .loc[lambda merged: merged['remove'].isna()]
    .drop(columns=['remove'])
)
print(len(df_pois_tp), df_pois_tp.tag_f.nunique())

5347 1560


In [10]:
# Manual corrections for two class/subclass combinations whose final
# class/tag came out wrong from the marked table.
df_pois_tp.loc[
    df_pois_tp['class'].eq('leisure') & df_pois_tp['subclass'].eq('sports_centre'),
    ['class_f', 'tag_f'],
] = ('leisure', 'sports_centre')
df_pois_tp.loc[
    df_pois_tp['class'].eq('shop') & df_pois_tp['subclass'].eq('till'),
    ['class_f', 'tag_f'],
] = ('shop', 'shop')

In [94]:
# List the distinct final tags (tag_f) that fall under the final class 'shop'.
# Filtering directly avoids looping over every class_f group, and avoids rebinding `_`,
# which IPython uses for the previous cell's output.
shop_tags = df_pois_tp.loc[df_pois_tp['class_f'] == 'shop', 'tag_f'].unique()
if len(shop_tags) > 0:  # print nothing when there is no 'shop' class at all
    print(', '.join(shop_tags))

shop, hairdresser, bakery, clothes, supermarket, car_repair, car, ice_cream, bicycle_rental, bicycle_repair_station, car_rental, boat_rental, car_pooling, internet_cafe, healthcare, ski_rental, fixme, boat_sharing, foot_care, charity, freeshop, insurance, printshop, brewery, motorcycle_repair, rental, tools, craft, art, kick-scooter_rental, cargobike_rental, segway, caravan, boutique, bookmaker, beverages, scooter, cosmetics, cabinet_maker, beekeeping, bed, beauty, retail, boat, ticket, copyshop, construction, translation_service, trolley_rental, carpentry, cleaning, tailor, machinery, agency, kitchenware, optician, juice, joinery, toys, towing, travel_agency, tv, winery, wine, telecommunication, repair_shop, sun_studio, stall, shoe_repair, shisha, souvenir, car_service, car_sharing, call_shop, buggy_rental, boot_rental, bobbycar_rental, boat_repair, butcher, data, coffee, computer, crane, 3d_printing, cart_rental, baby_goods, dog, nail_salon, moving, lettershop, interior_decoration, l

In [100]:
# Total POI count per final (class_f, tag_f), ordered by class_f and count, both descending.
df_tags_f = (
    df_pois_tp
    .groupby(['class_f', 'tag_f'])['count']
    .sum()
    .reset_index()
    .sort_values(by=['class_f', 'count'], ascending=False)
)

### 2.4 Use GPT-4 learned categories for labeling

In [13]:
# Load the category table and build the label presented to GPT-4:
# the bare subcategory when type == 0, otherwise "<category>-<subcategory>".
df_cat = pd.read_excel('dbs/poi/categories.xlsx')
df_cat.loc[:, 'label'] = [
    sub if kind == 0 else '-'.join([cat, sub])
    for cat, sub, kind in zip(df_cat['category'], df_cat['subcategory'], df_cat['type'])
]
df_cat.iloc[0]

class_f                     amenity; shop
category                  Food & Beverage
subcategory                    Restaurant
type                                    1
label          Food & Beverage-Restaurant
Name: 0, dtype: object

In [16]:
# Part 1 - office, historic, craft
# These three classes are labelled directly with their capitalised class name;
# everything else goes to df_pois_tp2 for GPT-based labelling.
df_pois_tp1 = df_pois_tp[df_pois_tp['class_f'].isin(['historic', 'office', 'craft'])].copy()
df_pois_tp2 = df_pois_tp[~df_pois_tp['class_f'].isin(['historic', 'office', 'craft'])].copy()
df_pois_tp1['label'] = df_pois_tp1['class_f'].str.capitalize()
df_pois_tp1.head()

Unnamed: 0,class,subclass,count,keep,tag,class_f,tag_f,label
0,craft,hvac,2007,1.0,craft,craft,craft,Craft
1,historic,historic,13691,1.0,historic,historic,historic,Historic
2,office,office,4437,1.0,office,office,office,Office
18,historic,memorial,70555,1.0,memorial,historic,memorial,Historic
19,historic,wayside_cross,47444,1.0,wayside_cross,historic,wayside_cross,Historic


In [33]:
# Part 2 - amenity, leisure, tourism, shop
def poi_category(x, categories_str=None, max_retries=None, retry_delay=1):
    """Ask GPT-4 to pick the most suitable label for an OpenStreetMap tag.

    Parameters
    ----------
    x : str
        The tag to classify; sent as the user message.
    categories_str : str, optional
        Text listing the allowed labels; embedded in the system prompt.
    max_retries : int, optional
        Number of failed attempts after which the call gives up.
        ``None`` (default) keeps retrying indefinitely, as before.
    retry_delay : float, optional
        Seconds to sleep between attempts (default 1).

    Returns
    -------
    str
        The message content of the first completion choice.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and that many attempts have failed.
    """
    failures = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
              model="gpt-4",
              messages=[
                {
                  "role": "system",
                  "content": f"You will be presented with points of interest tags from OpenStreetMap and your job is to provide the most suitable tag from the following list. Choose ONLY from the list of tags provided here:\n\n{categories_str}"
                },
                {
                  "role": "user",
                  "content": x
                }
              ],
              temperature=0,
              max_tokens=1024,
              top_p=1,
              frequency_penalty=0,
              presence_penalty=0
            )
            # Return only after the content has been read successfully, so a malformed
            # response is retried instead of leaving the result undefined.
            return response.choices[0].message.content
        except Exception as err:
            # `Exception` (not a bare except) lets KeyboardInterrupt stop a long labelling run.
            failures += 1
            if max_retries is not None and failures >= max_retries:
                raise RuntimeError(
                    f'poi_category failed {failures} time(s) for tag {x!r}'
                ) from err
            time.sleep(retry_delay)

In [34]:
# Cache of GPT labels, filled by the labelling loop as {class_f: {tag_f: label}}.
label_dict = {}

In [35]:
# Label every distinct tag_f of each class with GPT-4, caching results in label_dict.
# The cell is resumable: only tags without a cached label are sent, so re-running after an
# interruption continues where it stopped instead of reporting the class as finished.
for c in ('amenity', 'leisure', 'tourism', 'shop', 'sport'):
    labels_c = label_dict.setdefault(c, dict())
    tags_c = df_pois_tp2.loc[df_pois_tp2['class_f'] == c, 'tag_f'].unique()
    pending = [tag for tag in tags_c if tag not in labels_c]
    if not pending:
        print(f'Finished labeling {c}.')
        continue
    print(f'Working on {c}...')
    # Candidate labels are the category rows whose class_f string mentions this class.
    categories_str = ', '.join(df_cat.loc[df_cat.class_f.str.contains(c), 'label'].values)
    for tag in tqdm(pending, desc=f'Labeling {c}'):
        labels_c[tag] = poi_category(tag, categories_str=categories_str)

Working on amenity...


Labeling amenity: 0it [00:00, ?it/s]

Working on leisure...


Labeling leisure: 0it [00:00, ?it/s]

Working on tourism...


Labeling tourism: 0it [00:00, ?it/s]

Working on shop...


Labeling shop: 0it [00:00, ?it/s]

Working on sport...


Labeling sport: 0it [00:00, ?it/s]

In [36]:
# Look up the cached GPT label for every (class_f, tag_f) row.
df_pois_tp2.loc[:, 'label'] = [
    label_dict[cls][tag]
    for cls, tag in zip(df_pois_tp2['class_f'], df_pois_tp2['tag_f'])
]
df_pois_tp2.head()

Unnamed: 0,class,subclass,count,keep,tag,class_f,tag_f,label
3,shop,shop,2862,1.0,shop,shop,shop,Retail & Fashion-Retail stores
4,tourism,tourism,1047,1.0,tourism,tourism,tourism,Outdoor & Recreational areas-Tourist attractions
5,amenity,restaurant,101330,1.0,restaurant,amenity,restaurant,Food & Beverage-Restaurant
6,amenity,place_of_worship,69258,1.0,place_of_worship,amenity,place_of_worship,Community & Social Services-Place of worship
7,amenity,fast_food,39610,1.0,fast_food,amenity,fast_food,Food & Beverage-Fast food


In [37]:
# Stack the directly labelled part (df_pois_tp1) and the GPT-labelled part (df_pois_tp2)
# and export the combined table.
df_pois_tp_p = pd.concat([df_pois_tp1, df_pois_tp2])
df_pois_tp_p.to_csv('dbs/poi/class_f_tag_f_categories.csv', index=False)

## 3. Enrich POI data

In [48]:
# Load the marked category table; where `label` is empty, use the row's `theme` instead.
df_pois_tp_p = pd.read_csv('dbs/poi/class_f_tag_f_categories_marked.csv')
label_is_missing = df_pois_tp_p['label'].isna()
df_pois_tp_p.loc[label_is_missing, 'label'] = df_pois_tp_p.loc[label_is_missing, 'theme']
df_pois_tp_p.head()

Unnamed: 0,class,subclass,count,keep,tag,class_f,tag_f,theme,label
0,amenity,game_feeding,683,1,game_feeding,amenity,game_feeding,Community & Social Services,Animal
1,amenity,stables,663,1,stables,amenity,stables,Community & Social Services,Animal
2,amenity,feeding_place,532,1,feeding_place,amenity,feeding_place,Community & Social Services,Animal
3,amenity,animal_shelter,398,1,animal_shelter,amenity,animal_shelter,Community & Social Services,Animal
4,amenity,animal_breeding,330,1,animal_breeding,amenity,animal,Community & Social Services,Animal


In [49]:
# Attach theme/label to every POI. The inner join drops POIs whose (class, subclass)
# combination is not present in the marked category table.
# NOTE(review): section 2.1 rewrote subclass 'yes' to the class name in the lookup table; if
# the marked CSV kept that rewrite, POIs whose raw subclass is still 'yes' will not match
# here and are dropped — confirm this is intended.
# NOTE(review): a (class, subclass) pair appearing twice in df_pois_tp_p would duplicate POIs.
print('Before cleaning', len(gdf_pois))
gdf_pois = pd.merge(gdf_pois, 
                    df_pois_tp_p[['class', 'subclass', 'theme', 'label']], 
                    on=['class', 'subclass'],
                    how='inner')
print('After cleaning', len(gdf_pois))

Before cleaning 4609785
After cleaning 2086633


In [53]:
# Preview the enriched POIs (now carrying theme and label).
gdf_pois.head()

Unnamed: 0,osm_id,class,subclass,name,geom,x,y,coords,theme,label
0,324043489,historic,boundary_stone,25,POINT (13.83511 48.76237),13.835112,48.762367,"48.76236679975128,13.835111999999999",Historic,Historic
1,897267627,historic,boundary_stone,24,POINT (13.83534 48.76241),13.835337,48.762409,"48.76240879975127,13.8353372",Historic,Historic
2,323299682,historic,boundary_stone,22,POINT (13.83580 48.76255),13.835799,48.762546,"48.762545699751215,13.8357987",Historic,Historic
3,897267707,historic,boundary_stone,23,POINT (13.83551 48.76253),13.835508,48.762527,"48.76252669975122,13.8355078",Historic,Historic
4,323777930,historic,boundary_stone,21,POINT (13.83614 48.76274),13.836139,48.762737,"48.76273659975117,13.8361394",Historic,Historic


In [54]:
# Write the labelled POIs to the `poi` table of the main database.
# if_exists='replace' overwrites any existing `poi` table on every run.
gdf_pois[['osm_id', 'class', 'subclass', 'name', 'theme', 'label', 'geom']].\
    to_postgis('poi', engine, if_exists='replace')  # Options: 'replace', 'append', 'fail'

## 4. Check unique locations in detected stops

In [56]:
# Collect the distinct (latitude, longitude) pairs across all stop batches.
batch_up = 300
coord_cols = ['latitude', 'longitude']
df_u_list = []
for batch in tqdm(range(0, batch_up), desc='Loading stops'):
    # Read only the two coordinate columns instead of loading every column and
    # discarding the rest; de-duplicate per batch to keep memory down.
    df_u_list.append(
        pd.read_parquet(f'dbs/stops_p/stops_p_{batch}.parquet', columns=coord_cols)[coord_cols]
        .drop_duplicates(subset=coord_cols)
        .reset_index(drop=True)
    )
# The same location can occur in several batches, so de-duplicate again globally.
df_u = pd.concat(df_u_list)
df_u.drop_duplicates(subset=coord_cols, inplace=True)
len(df_u)

Loading stops:   0%|          | 0/300 [00:00<?, ?it/s]

129624268

In [57]:
# Give every unique location a running integer id, then append the table to stops.unique_stops.
df_u['loc_id'] = range(0, len(df_u))
df_u.to_sql(
    name='unique_stops',
    con=engine,
    schema='stops',
    if_exists='append',
    index=False,
    chunksize=10000,
    method='multi',
)

129624268

In [51]:
# Load a single stop batch for inspection.
batch = 9
df = pd.read_parquet(f'dbs/stops_p/stops_p_{batch}.parquet')
df.head()

Unnamed: 0,device_aid,loc,latitude,longitude,size,batch,dur,localtime,l_localtime,date,h_s,year,weekday,week,seq
4734889,00003c01-688e-4993-8fd7-6b37ec224a13,29,52.2946,8.8976,15,9,198.933333,2023-05-26 22:45:08+02:00,2023-05-27 02:04:04+02:00,2023-05-26,22,2023,4,21,1
4738591,00003c01-688e-4993-8fd7-6b37ec224a13,26,52.2833,8.9167,122,9,566.733333,2023-06-02 09:18:52+02:00,2023-06-02 18:45:36+02:00,2023-06-02,9,2023,4,22,2
2308571,00003c01-688e-4993-8fd7-6b37ec224a13,26,52.2833,8.9167,44,9,293.9,2023-06-03 00:33:09+02:00,2023-06-03 05:27:03+02:00,2023-06-03,0,2023,5,22,3
573889,00003c01-688e-4993-8fd7-6b37ec224a13,26,52.2833,8.9167,110,9,368.983333,2023-06-03 11:06:33+02:00,2023-06-03 17:15:32+02:00,2023-06-03,11,2023,5,22,4
1271468,00003c01-688e-4993-8fd7-6b37ec224a13,26,52.2833,8.9167,2,9,179.983333,2023-06-04 21:55:45+02:00,2023-06-05 00:55:44+02:00,2023-06-04,21,2023,6,22,5


In [57]:
# Assign every device to one of 20 groups (labels 1..20) uniformly at random.
# A seeded generator makes the assignment reproducible across kernel restarts, and a
# single vectorised draw replaces one np.random.randint call per device.
rng = np.random.default_rng(42)
devices = df['device_aid'].unique()
device2group = dict(zip(devices, rng.integers(1, 21, size=len(devices)).tolist()))
df.loc[:, 'home2grp'] = df['device_aid'].map(device2group)
df.iloc[0]

device_aid     00003c01-688e-4993-8fd7-6b37ec224a13
loc                                              29
latitude                                    52.2946
longitude                                    8.8976
size                                             15
batch                                             9
dur                                      198.933333
localtime                 2023-05-26 22:45:08+02:00
l_localtime               2023-05-27 02:04:04+02:00
date                                     2023-05-26
h_s                                              22
year                                           2023
weekday                                           4
week                                             21
seq                                               1
home2grp                                         17
Name: 4734889, dtype: object