In [8]:
import geopandas as gpd
import osmium as osm
import shapely as shp

import pandas as pd

import numpy as np

import pathlib

In [9]:
class LessDataOSMHandler(osm.SimpleHandler):
    def __init__(self, category_whitelist):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []
        self.tag_list = []
        
        self.category_whitelist = category_whitelist
        
        self.wkbfab = osm.geom.WKBFactory()
        
        self.shapely_location = 0
        
        self.categories_of_osm_element = []
        
    def tag_inventory(self, elem, elem_type):
        self.tag_list = []
        for tag in elem.tags:
            self.tag_list.append((tag.k, tag.v))
        
        self.osm_data.append([elem_type,
                            self.shapely_location,
                            np.asarray(self.tag_list, dtype=str),
                            np.asarray(self.categories_of_osm_element, dtype=np.uint16)])
            
    
    def node(self, n):   
        self.categories_of_osm_element = []
        
        # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if n.tags.get(whitelist[0]): #== whitelist[1]:
                self.categories_of_osm_element.append(name)
        
        if self.categories_of_osm_element:
            self.shapely_location = shp.Point((n.location.x, n.location.y))
            self.tag_inventory(n, "node")
        
        del n
    
    def area(self, a): 
        self.categories_of_osm_element = []
            
          # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if a.tags.get(whitelist[0]): #== whitelist[1]:
                self.categories_of_osm_element.append(name)
                
        if self.categories_of_osm_element:
            # create location/multipolygon
            wkbshape = self.wkbfab.create_multipolygon(a)
            self.shapely_location = shp.wkb.loads(wkbshape, hex=True)
            
            if a.from_way:
                self.tag_inventory(a, "area-way")
            else:
                self.tag_inventory(a, "area-relation")

In [19]:
# defien a whitelist
category_1_whitelist = [1, ("building", "*")]
category_2_whitelist = [2, ("landuse", "forest")]
category_3_whitelist = [3, ("natural", "water")]
category_4_whitelist = [4, ("type", "multipolygon")]
category_5_whitelist = [5, ("building", "false")]
category_6_whitelist = [6, ("water", "river")]

category_list = [category_1_whitelist, category_2_whitelist, 
                 category_3_whitelist, category_4_whitelist, 
                 category_5_whitelist, category_6_whitelist]

In [25]:
osmhandler = LessDataOSMHandler(category_list)
# scan the input file and fills the handler list accordingly
osmhandler.apply_file("../data/andorra-latest.osm.pbf", locations=True)

In [26]:
# transform the list into a pandas DataFrame
data_colnames = ['osm_type', 'location', 'tags', 'categories']
df_osm = gpd.GeoDataFrame(osmhandler.osm_data, columns=data_colnames)
# df_osm = tag_genome.sort_values(by=['type', 'id', 'ts'])

In [27]:
df_osm.tail(5)

Unnamed: 0,osm_type,location,tags,categories
9540,area-way,"MULTIPOLYGON (((1.6698645 42.5922718, 1.669969...","[[building, yes]]","[1, 5]"
9541,area-way,"MULTIPOLYGON (((1.5257934 42.5651596, 1.525963...","[[building, yes]]","[1, 5]"
9542,area-way,"MULTIPOLYGON (((1.5249968 42.5656255, 1.525063...","[[building, yes]]","[1, 5]"
9543,area-way,"MULTIPOLYGON (((1.5244791 42.5656187, 1.524512...","[[building, yes]]","[1, 5]"
9544,area-way,"MULTIPOLYGON (((1.494446 42.491348, 1.4945119 ...","[[building, yes]]","[1, 5]"


In [28]:
df_osm.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 9545 entries, 0 to 9544
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   osm_type    9545 non-null   object
 1   location    9545 non-null   object
 2   tags        9545 non-null   object
 3   categories  9545 non-null   object
dtypes: object(4)
memory usage: 298.4+ KB


Osmhandler has the following attributes:
- osm_type_list
- ntags_list
- tag_list_finish
- category_list

## Backup Handler

In [7]:
class OSMHandler(osm.SimpleHandler):
    def __init__(self):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []
        self.coords = 0
        self.tag_list = []
        
    def tag_inventory(self, elem, elem_type):
        self.tag_list = []
        for tag in elem.tags:
            self.tag_list.append((tag.k, tag.v))
        
        if (elem_type == "node"):
            self.osm_data.append([elem_type, 
                                   self.coords,
                                   len(self.tag_list),
                                   self.tag_list])
        elif (elem_type == "way"):
            self.osm_data.append([elem_type, 
                                   0,
                                   len(self.tag_list),
                                   self.tag_list])
        else:
            self.osm_data.append([elem_type, 
                                  self.coords,
                                  len(self.tag_list),
                                  self.tag_list])
            
    
    def node(self, n):    
        self.coords = shp.Point((n.location.x, n.location.y))
        self.tag_inventory(n, "node")
        
        del n
        
        
    def way(self, w):
        test = []
        for i in range(len(w.nodes)):
            test.append((w.nodes[i].location.x, w.nodes[i].location.y))
        
        # this only works if we have more than 4 coordinates if not then throw this data entry away
        if (4 <= len(test)):
            self.coords = shp.Polygon(test.copy())
            self.tag_inventory(w, "way")
        
        # if you dont do this the libary sometimes, throws erros, maybe only jupyter issues needs furtehr testing
        del w
        del test

    # its really hard to get the location out of relations
    def relation(self, r):   
        self.tag_inventory(r, "relation")
        
        del r

## Backup Handler 2

In [9]:
class LessDataOSMHandler(osm.SimpleHandler):
    def __init__(self, category_whitelist):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []
        self.tag_list = []
        
        self.category_whitelist = category_whitelist
        
        self.wkbfab = osm.geom.WKBFactory()
        
        self.shapely_location = 0
        
        self.categories_of_osm_element = []
        
    def tag_inventory(self, elem, elem_type):
        self.tag_list = []
        for tag in elem.tags:
            self.tag_list.append((tag.k, tag.v))
        
        self.osm_data.append([elem_type,
                            #self.shapely_location,
                            len(self.tag_list),
                            np.asarray(self.tag_list, dtype=str),
                            np.asarray(self.categories_of_osm_element, dtype=np.uint16)])
            
    
    def node(self, n):   
        self.categories_of_osm_element = []
        
        # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if n.tags.get(whitelist[0]): #== whitelist[1]:
                self.categories_of_osm_element.append(name)
        
        if self.categories_of_osm_element:
            self.shapely_location = shp.Point((n.location.x, n.location.y))
            self.tag_inventory(n, "node")
        
        del n

    def way(self, w):
        self.categories_of_osm_element = []
            
         # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if w.tags.get(whitelist[0]) == whitelist[1]:
                    self.categories_of_osm_element.append(name)
        
        if self.categories_of_osm_element:
            self.tag_inventory(w, "way")

    # its really hard to get the location out of relations
    def relation(self, r):   
        self.categories_of_osm_element = []
            
          # check if the osm_element applies to a category
        for category in self.category_whitelist:
            name = category[0]
            whitelist= category[1]
            
            if r.tags.get(whitelist[0]) == whitelist[1]:
                    self.categories_of_osm_element.append(name)
                
        if self.categories_of_osm_element:   
            self.tag_inventory(r, "relation")