In [1]:
import geopandas as gpd
import osmium as osm
import shapely as shp

import pandas as pd

import numpy as np


In [2]:
class OSMHandler(osm.SimpleHandler):
    def __init__(self):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []
        self.coords = 0
        self.tag_list = []
        
    def tag_inventory(self, elem, elem_type):
        self.tag_list = []
        for tag in elem.tags:
            self.tag_list.append((tag.k, tag.v))
        
        if (elem_type == "node"):
            self.osm_data.append([elem_type, 
                                   self.coords,
                                   len(self.tag_list),
                                   self.tag_list])
        elif (elem_type == "way"):
            self.osm_data.append([elem_type, 
                                   0,
                                   len(self.tag_list),
                                   self.tag_list])
        else:
            self.osm_data.append([elem_type, 
                                  self.coords,
                                  len(self.tag_list),
                                  self.tag_list])
            
    
    def node(self, n):    
        self.coords = shp.Point((n.location.x, n.location.y))
        self.tag_inventory(n, "node")
        
        del n
        
        
    def way(self, w):
        test = []
        for i in range(len(w.nodes)):
            test.append((w.nodes[i].location.x, w.nodes[i].location.y))
        
        # this only works if we have more than 4 coordinates if not then throw this data entry away
        if (4 <= len(test)):
            self.coords = shp.Polygon(test.copy())
            self.tag_inventory(w, "way")
        
        # if you dont do this the libary sometimes, throws erros, maybe only jupyter issues needs furtehr testing
        del w
        del test

    # its really hard to get the location out of relations
    def relation(self, r):   
        self.tag_inventory(r, "relation")
        
        del r

In [20]:
class LessDataOSMHandler(osm.SimpleHandler):
    def __init__(self, category_whitelist):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []
        self.tag_list = []
        
        self.category_whitelist = category_whitelist
        
        self.wkbfab = osm.geom.WKBFactory()
        
        self.shapely_location = 0
        
        self.categories_of_osm_element = []
        
    def tag_inventory(self, elem, elem_type):
        self.tag_list = []
        for tag in elem.tags:
            self.tag_list.append((tag.k, tag.v))
        
        self.osm_data.append([elem_type,
                            #self.shapely_location,
                            len(self.tag_list),
                            np.asarray(self.tag_list, dtype=str),
                            np.asarray(self.categories_of_osm_element, dtype=np.uint16)])
            
    
    def node(self, n):   
        self.categories_of_osm_element = []
        
        # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if n.tags.get(whitelist[0]): #== whitelist[1]:
                self.categories_of_osm_element.append(name)
        
        if self.categories_of_osm_element:
            self.shapely_location = shp.Point((n.location.x, n.location.y))
            self.tag_inventory(n, "node")
        
        del n

#     def way(self, w):
#         self.categories_of_osm_element = []
            
#          # check if the osm_element applies to a category
#         for category in self.category_whitelist:
#             name = category[0]
#             whitelist= category[1]
            
#             if w.tags.get(whitelist[0]) == whitelist[1]:
#                     self.categories_of_osm_element.append(name)
        
#         if self.categories_of_osm_element:
#             self.tag_inventory(w, "way")

#     # its really hard to get the location out of relations
#     def relation(self, r):   
#         self.categories_of_osm_element = []
            
#           # check if the osm_element applies to a category
#         for category in self.category_whitelist:
#             name = category[0]
#             whitelist= category[1]
            
#             if r.tags.get(whitelist[0]) == whitelist[1]:
#                     self.categories_of_osm_element.append(name)
                
#         if self.categories_of_osm_element:   
#             self.tag_inventory(r, "relation")
    
    def area(self, a): 
        self.categories_of_osm_element = []
            
          # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if a.tags.get(whitelist[0]): #== whitelist[1]:
                self.categories_of_osm_element.append(name)
                
        if self.categories_of_osm_element:
            # create location/multipolygon
            wkbshape = self.wkbfab.create_multipolygon(a)
            self.shapely_location = shp.wkb.loads(wkbshape, hex=True)
            
            if a.from_way:
                self.tag_inventory(a, "area-way")
            else:
                self.tag_inventory(a, "area-relation")

In [21]:
# defien a whitelist
category_1_whitelist = [1, ("building", "*")]
category_2_whitelist = [2, ("landuse", "forest")]
category_3_whitelist = [3, ("natural", "water")]
category_4_whitelist = [4, ("type", "multipolygon")]
category_5_whitelist = [5, ("building", "false")]
category_6_whitelist = [6, ("water", "river")]

category_list = [category_1_whitelist, category_2_whitelist, 
                 category_3_whitelist, category_4_whitelist, 
                 category_5_whitelist, category_6_whitelist]

In [22]:
osmhandler = LessDataOSMHandler(category_list)
# scan the input file and fills the handler list accordingly
osmhandler.apply_file("../../data/andorra-latest.osm.pbf", locations=True)

In [23]:
# transform the list into a pandas DataFrame
data_colnames = ['osm_type', 'ntags', 'tags', 'categories']
df_osm = gpd.GeoDataFrame(osmhandler.osm_data, columns=data_colnames)
# df_osm = tag_genome.sort_values(by=['type', 'id', 'ts'])

In [24]:
len(df_osm)

9545

In [30]:
df_osm["tags"][100]

array([['ele', '2450'],
       ['mountain_pass', 'yes'],
       ['name', 'Collada de Juclar'],
       ['natural', 'saddle'],
       ['wikidata', 'Q21329828']], dtype='<U17')

In [11]:
df_osm.count()

osm_type      5671
ntags         5671
tags          5671
categories    5671
dtype: int64

In [51]:
df_osm

Unnamed: 0,osm_type,ntags,tags,categories
0,way,1,"[(landuse, forest)]",[category2]
1,way,1,"[(landuse, forest)]",[category2]
2,way,1,"[(landuse, forest)]",[category2]
3,way,1,"[(landuse, forest)]",[category2]
4,way,2,"[(natural, water), (water, reservoir)]",[category3]
...,...,...,...,...
666,relation,2,"[(landuse, forest), (type, multipolygon)]","[category2, category4]"
667,relation,2,"[(landuse, forest), (type, multipolygon)]","[category2, category4]"
668,relation,2,"[(landuse, forest), (type, multipolygon)]","[category2, category4]"
669,relation,2,"[(landuse, forest), (type, multipolygon)]","[category2, category4]"


In [52]:
df_osm.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   osm_type    671 non-null    object
 1   ntags       671 non-null    int64 
 2   tags        671 non-null    object
 3   categories  671 non-null    object
dtypes: int64(1), object(3)
memory usage: 21.1+ KB


## Test Tag filtering

Teste how blacklist and whitelist work in a geodataframe

In [10]:
df_osm["tags"][368940]

[('natural', 'water'), ('type', 'multipolygon'), ('water', 'river')]

In [11]:
# get the tag entry, from the last row
df_osm["tags"][368940]

[('natural', 'water'), ('type', 'multipolygon'), ('water', 'river')]

In [12]:
# get a single tag fro mthe tag entry
df_osm["tags"][368940][0]

('natural', 'water')

In [22]:
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.34 ms (started: 2023-01-20 23:04:59 +01:00)


In [27]:
df1=pd.DataFrame({'building1':['true', 'none', 'false', 'true', 'false', 'none'], 'building2':['true', 'none', 'false', 'true', 'false', 'none']})

time: 1.56 ms (started: 2023-01-20 23:07:35 +01:00)


In [28]:
df1

Unnamed: 0,building1,building2
0,true,true
1,none,none
2,false,false
3,true,true
4,false,false
5,none,none


time: 13.8 ms (started: 2023-01-20 23:07:37 +01:00)


In [10]:
# defien a whitelist
category_1_whitelist = [1, ("building", "true")]
category_2_whitelist = [2, ("landuse", "forest")]
category_3_whitelist = [3, ("natural", "water")]

category_list = [category_1_whitelist, category_2_whitelist, category_3_whitelist]

In [None]:
osmhandler = LessDataOSMHandler(category_list)
# scan the input file and fills the handler list accordingly
osmhandler.apply_file("../../data/andorra-latest.osm.pbf", locations=True)

In [None]:
# transform the list into a pandas DataFrame
data_colnames = ['osm_type', 'ntags', 'tags', 'categories']
df_osm = gpd.GeoDataFrame(osmhandler.osm_data, columns=data_colnames)
# df_osm = tag_genome.sort_values(by=['type', 'id', 'ts'])

TypeError: object of type 'int' has no len()

In [None]:
df_osm

In [8]:
df_osm.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 557 entries, 0 to 556
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   osm_type    557 non-null    object
 1   ntags       557 non-null    int64 
 2   tags        557 non-null    object
 3   categories  557 non-null    object
dtypes: int64(1), object(3)
memory usage: 17.5+ KB


Osmhandler has the following attributes:
- osm_type_list
- ntags_list
- tag_list_finish
- category_list

In [27]:
print(len(np.asarray(osmhandler.osm_type_list, dtype=str)))
print(len(osmhandler.ntags_list))
print(len(osmhandler.tag_list_finish))
print(len(osmhandler.category_list))

671
671
671
671


In [28]:
data_colnames = ['osm_type', 'ntags', 'tags', 'categories']
df_osm = gpd.GeoDataFrame([np.asarray(osmhandler.osm_type_list, dtype=str), 
                          osmhandler.ntags_list,
                          osmhandler.tag_list_finish,
                          osmhandler.category_list], columns=data_colnames)

ValueError: 4 columns passed, passed data had 671 columns

In [88]:
osmhandler.osm_data

[['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  2,
  array([['natural', 'water'],
         ['water', 'reservoir']], dtype='<U9'),
  array([3], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  1,
  array([['landuse', 'forest']], dtype='<U7'),
  array([2], dtype=uint16)],
 ['way',
  4,
  array([['layer', '2'],
         ['name', "Llac d'Eng