In [1]:
# Import libraries
import pandas as pd
import xml.etree.cElementTree as et
import sys
import re
# NOTE(review): Python 2-only idiom. reload(sys) is presumably here so that
# sys.setdefaultencoding is available for the call below - confirm before
# porting, since the builtin reload() is not available on Python 3.
reload(sys)

# Set default encoding for strings to 'utf-8'
# (must run after the reload(sys) call above)
sys.setdefaultencoding('utf-8')

# Set OSM path (relative path; this value is passed to process_file further down)
osm = "./singapore-sample.osm"

# Function to extract elements from xml file based on tags
def get_elem(osm, tags = ('node', 'way', 'relation')):
    '''
        Lazily yield every element of an OSM/XML document whose tag is wanted.

        Args:
            osm: OSM/XML file (anything accepted by et.iterparse)
            tags: tag names to keep; elements with other tags are skipped

        Returns:
            Generator of matching elements, yielded once each element is
            completely parsed (on its 'end' event)
    '''
    # Stream the document and grab the root from the very first event
    events = et.iterparse(osm, events = ('start', 'end'))
    root = next(events)[1]

    for kind, node in events:
        # Skip 'start' events and any element we were not asked for
        if kind != 'end' or node.tag not in tags:
            continue

        yield node

        # Drop everything parsed so far from the root to keep memory flat
        root.clear()

# Helper to turn one <tag> sub-element into a row for the tags tables
def _tag_record(element_id, tag, match):
    '''
        Args:
            element_id: 'id' attribute of the parent node/way
            tag: <tag> sub-element carrying 'k' and 'v' attributes
            match: compiled regular expression recognising 'type:key' keys

        Returns:
            Dictionary with 'id', 'key', 'type' and 'value' entries
    '''
    k = tag.attrib['k']
    if match.search(k):
        # Split on the first colon only, so keys with several colons
        # (e.g. 'addr:street:name') keep everything after the type prefix
        # ('street:name') instead of silently losing the trailing parts
        tag_type, key = k.split(':', 1)
    else:
        tag_type, key = 'default', k

    return {
        'id': element_id,
        'key': key,
        'type': tag_type,
        'value': tag.attrib['v']
    }

# Function to process OSM file and return lists of extracted values
def process_file(osm):
    '''
        Args:
            osm: OSM/XML file

        Returns:
            Five lists of dictionaries extracted from 'osm', in this order:
            nodes, node_tags, ways, way_nodes, way_tags

        Raises:
            KeyError: if a node/way or one of its <tag>/<nd> children lacks
                      an attribute that is read below
    '''
    # Lists to store values
    nodes = []
    node_tags = []
    ways = []
    way_nodes = []
    way_tags = []

    # Attributes copied verbatim from each element into its row
    node_fields = ('id', 'lat', 'lon', 'user', 'uid', 'version',
                   'changeset', 'timestamp')
    way_fields = ('id', 'user', 'uid', 'version', 'changeset', 'timestamp')

    # Regular expression to match for string (a-z with : followed by a-z)
    match = re.compile(r'^[a-z]+:[a-z]')

    # Iterate through values from get_elem function
    for element in get_elem(osm, tags = ('node', 'way')):
        attrib = element.attrib

        if element.tag == 'node':
            nodes.append({field: attrib[field] for field in node_fields})

            # Only <tag> children carry 'k'/'v'; same filter as the way branch
            for sub_attr in element:
                if sub_attr.tag == 'tag':
                    node_tags.append(_tag_record(attrib['id'], sub_attr, match))

        elif element.tag == 'way':
            ways.append({field: attrib[field] for field in way_fields})

            # 'position' counts <nd> children only, in document order
            position = 0
            for sub_attr in element:
                if sub_attr.tag == 'nd':
                    way_nodes.append({
                        'id': attrib['id'],
                        'node_id': sub_attr.attrib['ref'],
                        'position': position
                    })
                    position += 1

                elif sub_attr.tag == 'tag':
                    way_tags.append(_tag_record(attrib['id'], sub_attr, match))

    # Return lists
    return nodes, node_tags, ways, way_nodes, way_tags

# Run the extraction once and unpack the five record lists
nodes, nodes_tags, ways, way_nodes, way_tags = process_file(osm)

# Wrap each record list in a pandas dataframe (same order as the lists above)
nodes_pd, nodes_tags_pd, ways_pd, ways_nodes_pd, ways_tags_pd = map(
    pd.DataFrame,
    (nodes, nodes_tags, ways, way_nodes, way_tags)
)

# Write every dataframe to its own csv file in the working directory
nodes_pd.to_csv('./nodes_pd.csv')
nodes_tags_pd.to_csv('./nodes_tags_pd.csv')
ways_pd.to_csv('./ways_pd.csv')
ways_nodes_pd.to_csv('./ways_nodes_pd.csv')
ways_tags_pd.to_csv('./ways_tags_pd.csv')