In [8]:
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "shanghai_china.osm"  # 可根据地区更换osm文件
SAMPLE_FILE = "shanghai4.osm"

k = 70 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    The document is parsed incrementally; after every yielded element the
    root element is cleared so already-processed content can be released.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event carries the document root.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Keep only the top level elements whose position is a multiple of k
    for index, element in enumerate(get_element(OSM_FILE)):
        if index % k:
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [9]:
# 查看osm文件的大小
ls -l shanghai4.osm

-rw-r--r--  1 lihan  staff  11131809  8 15 16:26 shanghai4.osm


In [10]:
# 查看详细文件
less shanghai4.osm

In [15]:
# 迭代解析处理地图文件，并找出有什么样的标记，以及有多少个，以便了解预计在地图中的每个类别有多少数据
import xml.etree.cElementTree as ET
import pprint
SAMPLE_FILE = "shanghai4.osm"
def count_tags(filename):
    """Count how many times each element tag opens in the XML file.

    Returns a dict mapping tag name -> number of "start" events seen for it.
    """
    counts = {}
    for _event, node in ET.iterparse(filename, events=("start",)):
        counts[node.tag] = counts.get(node.tag, 0) + 1
    return counts

In [19]:
def test():
    """Pretty-print the per-tag element counts of the sampled extract."""
    pprint.pprint(count_tags('shanghai4.osm'))


if __name__ == "__main__":
    test()

{'member': 1331,
 'nd': 60639,
 'node': 51893,
 'osm': 1,
 'relation': 47,
 'tag': 17990,
 'way': 6481}


In [20]:
# 看看未取样前文件的标记及数据
def test():
    """Pretty-print the per-tag element counts of the full, unsampled file."""
    pprint.pprint(count_tags('shanghai_china.osm'))


if __name__ == "__main__":
    test()

{'bounds': 1,
 'member': 56053,
 'nd': 4298484,
 'node': 3632460,
 'osm': 1,
 'relation': 3330,
 'tag': 1249009,
 'way': 453680}


In [21]:
# 检查每个“<标记>”的“k”值，看看是否存在潜在问题
import re
# Keys made only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one colon separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Keys containing at least one problematic character.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    `keys` must hold the counters 'lower', 'lower_colon', 'problemchars' and
    'other'; it is updated in place and returned. Elements that are not
    <tag> leave the counters untouched.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    # Checked in this order; the first probe that succeeds decides the bucket.
    probes = (
        ('lower', lower.match),
        ('lower_colon', lower_colon.search),
        ('problemchars', problemchars.search),
    )
    bucket = 'other'
    for label, probe in probes:
        if probe(key):
            bucket = label
            break
    keys[bucket] += 1
    return keys

In [30]:
def process_map(filename):
    """Tally the key categories of every element in the file.

    Returns the dict of counters produced by feeding each parsed element to
    key_type().
    """
    counters = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, node in ET.iterparse(filename):
        counters = key_type(node, counters)
    return counters

In [31]:
def test():
    """Pretty-print the key-category counters for the sampled extract."""
    pprint.pprint(process_map('shanghai4.osm'))


if __name__ == "__main__":
    test()

{'lower': 16207, 'lower_colon': 1749, 'other': 34, 'problemchars': 0}


In [32]:
def test():
    """Pretty-print the key-category counters for the full, unsampled file."""
    pprint.pprint(process_map('shanghai_china.osm'))


if __name__ == "__main__":
    test()

{'lower': 1126241, 'lower_colon': 120051, 'other': 2715, 'problemchars': 2}


In [41]:
# 查看为地图做贡献的用户
def get_user(element):
    """Return the element's "uid" attribute, or None when it has none."""
    return element.attrib.get('uid')
    
def process_map(filename):
    """Collect the distinct, non-empty uids found on the file's elements.

    Returns a set of uid strings.
    """
    uids = set()
    for _event, node in ET.iterparse(filename):
        uid = get_user(node)
        if uid:
            uids.add(uid)
    return uids


def test():
    """Pretty-print the set of contributor uids found in the sampled extract."""
    pprint.pprint(process_map('shanghai4.osm'))


if __name__ == "__main__":
    test()

set(['1001766',
     '101611',
     '1016648',
     '102398',
     '1045590',
     '104808',
     '1051550',
     '105260',
     '1056443',
     '1056611',
     '1068692',
     '106914',
     '1069176',
     '1073950',
     '110639',
     '1114156',
     '1115268',
     '1122708',
     '1123861',
     '1144604',
     '11463',
     '1164',
     '116515',
     '1167947',
     '1179821',
     '118021',
     '1185631',
     '1187510',
     '1189602',
     '1190212',
     '119281',
     '119348',
     '1195808',
     '1200724',
     '1207628',
     '121036',
     '1213049',
     '121406',
     '1215325',
     '1219875',
     '1227959',
     '1232293',
     '1236135',
     '1238487',
     '1238664',
     '1240849',
     '1245973',
     '124788',
     '1260280',
     '1263561',
     '1293194',
     '1294163',
     '131048',
     '1311287',
     '1314388',
     '131670',
     '131968',
     '13203',
     '13257',
     '1328943',
     '136645',
     '136807',
     '1370163',
     '13721',
     

In [56]:
# 更改地图中名字是英文的问题，替换为中文

# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Chinese replacements for the English names listed in `mapping` below.
a = u'\u5b81\u6ce2\u5e02\u5987\u5973\u513f\u7ae5\u533b\u9662'  # Ningbo Women & Children Hospital
b = u'\u4e0a\u6d77\u5e02\u4e16\u754c\u5916\u56fd\u8bed\u4e2d\u5b66'  # World Foreign Language Middle School
c = u'\u4e2d\u56fd\u5efa\u8bbe\u94f6\u884c'  # China Construction Bank
d = u'\u661f\u5df4\u514b'  # Starbucks
OSMFILE = "shanghai4.osm"
# Captures the trailing run of non-space characters of a name.
name_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# NOTE(review): these look like placeholder letters rather than real name
# endings; confirm what audit_name_type() is supposed to skip.
expected = ["a", "b"]

# English name -> Chinese replacement. The values must be the variables
# above; the quoted letters "a".."d" would be substituted into the data
# verbatim instead of the Chinese names.
mapping = { "Ningbo Women & Children Hospital": a,
            "World Foreign Language Middle School": b,
            "China Construction Bank": c,
            "Starbucks": d
            }

def audit_name_type(name_types, tag_name):
    """Record `tag_name` under the trailing token matched by name_type_re.

    Nothing is recorded when the pattern does not match or when the matched
    token is listed in `expected`.
    """
    found = name_type_re.search(tag_name)
    if not found:
        return
    token = found.group()
    if token in expected:
        return
    name_types[token].add(tag_name)


def is_tag_name(elem):
    """Tell whether the element's "k" attribute is exactly "name"."""
    key = elem.attrib['k']
    return key == "name"


def audit(osmfile):
    """Group the "name" values of all nodes and ways by their trailing token.

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) filled by audit_name_type(): trailing token ->
        set of full names ending with it.
    """
    name_types = defaultdict(set)
    # `with` closes the file even if parsing raises; binary mode lets the XML
    # parser apply the encoding declared in the document itself.
    with open(osmfile, "rb") as osm_file:
        # Children of an element are only guaranteed to be present on its
        # "end" event, so the <tag> children are inspected there rather than
        # on "start".
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_tag_name(tag):
                        audit_name_type(name_types, tag.attrib['v'])
    return name_types


def update_name(name, mapping):
    changeword=mapping.keys()
    for word in mapping:
        if name(word): 
            return name.replace(word, mapping[word])

        
def test():
    n_types = audit(OSMFILE)
    pprint.pprint(dict(n_types))
    for n_type, ways in n_types.iteritems():
        for name in ways:
            better = update_name
            print name, "=>", better
            


if __name__ =='__main__':
    test()

{u'03\u7701\u9053': set([u'03\u7701\u9053']),
 u'04\u680b': set([u'04\u680b']),
 u'1#\u6559\u5b66\u697c\uff08\u5de5\u5546\u5b66\u9662\uff09': set([u'1#\u6559\u5b66\u697c\uff08\u5de5\u5546\u5b66\u9662\uff09']),
 '1(1)': set(['1(1)']),
 '10': set(['10']),
 u'104\u56fd\u9053': set([u'104\u56fd\u9053']),
 '107': set(['107']),
 u'10\u533a': set([u'\u7d2b\u7af9\u82d110\u533a']),
 u'11': set([u'\u672c\u79d1\u751f\u516c\u5bd311']),
 u'11#\u5b66\u751f\u5bbf\u820d': set([u'11#\u5b66\u751f\u5bbf\u820d']),
 u'1168\u5f047\u652f\u5f04': set([u'1168\u5f047\u652f\u5f04']),
 '1188': set(['Am Tor des Compounds Jia Hua Riviera auf der Puming Rd. 1188']),
 '12': set(['12']),
 u'12-14\u53f7\u697c': set([u'12-14\u53f7\u697c']),
 '123': set(['123']),
 '13': set(['13']),
 u'13\u53f7\u81ea\u884c\u8f66\u79df\u8d41\u70b9': set([u'\u6587\u91ce\u5df713\u53f7\u81ea\u884c\u8f66\u79df\u8d41\u70b9']),
 u'13\u680b': set([u'\u9999\u6a1f\u56ed13\u680b']),
 '14': set(['14']),
 u'149\u53f7\u81ea\u884c\u8f66\u79df\u8d41\u70

In [54]:
# 修正文中部分有问题的拼音

# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
OSMFILE = "shanghai4.osm"
# Captures the trailing run of non-space characters of a name.
name_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Tone-marked pinyin replacements for the romanizations listed in `mapping`.
a = u'liǎng àn kā fēi'
b = u'yín zhōu qū'
c = u'xīn qiáo zhèn'
d = u'dōng bà zhèn'
# NOTE(review): these look like placeholder letters rather than real name
# endings; confirm what audit_name_type() is supposed to skip.
expected = ["a", "b","c","d"]

# Plain romanization -> tone-marked pinyin. The values must be the variables
# above; the quoted letters "a".."d" would be substituted into the data
# verbatim instead of the pinyin.
mapping = { "Liang'an Kafei": a,
            "Yinzhou Qu": b,
            "Xinqiao Zhen": c,
            "Dongba Zhen": d
            }

def audit_name_type(name_types, tag_name):
    """Record `tag_name` under the trailing token matched by name_type_re.

    Nothing is recorded when the pattern does not match or when the matched
    token is listed in `expected`.
    """
    found = name_type_re.search(tag_name)
    if not found:
        return
    token = found.group()
    if token in expected:
        return
    name_types[token].add(tag_name)

def is_tag_name(elem):
    """Tell whether the element's "k" attribute is exactly "name:zh_pinyin"."""
    key = elem.attrib['k']
    return key == "name:zh_pinyin"

def audit(osmfile):
    """Group the "name:zh_pinyin" values of nodes and ways by trailing token.

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) filled by audit_name_type(): trailing token ->
        set of full values ending with it.
    """
    name_types = defaultdict(set)
    # `with` closes the file even if parsing raises; binary mode lets the XML
    # parser apply the encoding declared in the document itself.
    with open(osmfile, "rb") as osm_file:
        # Children of an element are only guaranteed to be present on its
        # "end" event, so the <tag> children are inspected there rather than
        # on "start".
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_tag_name(tag):
                        audit_name_type(name_types, tag.attrib['v'])
    return name_types

def update_name(name, mapping):
    changeword=mapping.keys()
    for word in mapping:
        if name(word): 
            return name.replace(word, mapping[word])
def test():
    n_types = audit(OSMFILE)
    pprint.pprint(dict(n_types))
    
    for n_type, ways in n_types.iteritems():
        for name in ways:
            better = update_name1
            print name, "=>", better
            


if __name__ == '__main__':
    test()

{u'D\xe0d\xe0o': set([u'Li\u01cengg\u01ceng D\xe0d\xe0o',
                      u'L\xf3ngd\u014dng D\xe0d\xe0o']),
 u'G\u0101oji\xe0l\xf9': set([u'Lu\xf3sh\u0101n G\u0101oji\xe0l\xf9']),
 u'G\u0101os\xf9': set([u'H\xe1ngch\xe1ng G\u0101os\xf9',
                        u'H\xf9h\xe1ng G\u0101os\xf9',
                        u'H\xf9h\xe1ngy\u01d2ng G\u0101os\xf9',
                        u'R\xe0och\xe9ng G\u0101os\xf9']),
 u'G\u014dngl\xf9': set([u'Chu\u0101nb\u011bi G\u014dngl\xf9',
                         u'J\u016bnm\xedn G\u014dngl\xf9']),
 u'Ji\u0113': set([u'B\u011bish\u0101n Ji\u0113']),
 'Kafei': set(["Liang'an Kafei"]),
 'Lu': set(['Tanghuang Lu']),
 u'L\xf2ng': set([u'J\u016bnm\xednl\xf9 232 L\xf2ng',
                  u'Zh\u0113nd\xe0l\xf9 1 L\xf2ng']),
 u'L\xf9': set([u'B\u01ceosh\xed Y\u012b L\xf9',
                u'Ch\xe9nhu\u012b L\xf9',
                u'D\xe9sh\xe8ng L\xf9',
                u'F\u0101ngch\u016bn L\xf9',
                u'Hu\xe1nk\u0113 L\xf9',
           

In [58]:
# 导出为csv格式

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema1

OSM_PATH = "shanghai4.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema1.schema




NODE_FIELDS = ['changeset','id', 'lat', 'lon','timestamp', 'uid','user','version']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['changeset','id','timestamp','uid','user', 'version']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']



def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    NOTE: this definition is an unfinished draft. It only creates four empty
    containers and then falls off the end, so it returns None. A second
    ``def shape_element`` later in this cell rebinds the name, so that later
    definition is the one process_map() ends up calling.
    """

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements
def shape_tag(el, tag):
    """Build the row dict for one <tag> child of `el`.

    The row carries the parent's id plus the tag's key and value. When the
    key matches LOWER_COLON, the text before the first colon becomes the
    row's 'type' and the remainder becomes its 'key'; otherwise the type is
    'regular'.
    """
    parent_id = el.attrib['id']
    key = tag.attrib['k']
    value = tag.attrib['v']
    kind = 'regular'

    if LOWER_COLON.match(key):
        kind, _, key = key.partition(':')

    return {'id': parent_id, 'key': key, 'value': value, 'type': kind}
    
def shape_way_node(el, i, nd):
    """Build the row dict linking way `el` to the node referenced by `nd`.

    `i` is stored as the row's 'position'.
    """
    way_id = el.attrib['id']
    node_ref = nd.attrib['ref']
    return dict(id=way_id, node_id=node_ref, position=i)


def shape_element(el, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS):
    """Turn a node or way XML element into the dict of rows to be written.

    Returns {'node', 'node_tags'} for a node, {'way', 'way_nodes',
    'way_tags'} for a way, and None for any other element.
    """
    # Secondary tags are shaped identically for nodes and ways.
    secondary = []
    for child in el.iter('tag'):
        secondary.append(shape_tag(el, child))

    kind = el.tag
    if kind == 'node':
        attribs = dict((field, el.attrib[field]) for field in node_attr_fields)
        return {'node': attribs, 'node_tags': secondary}

    if kind == 'way':
        attribs = dict((field, el.attrib[field]) for field in way_attr_fields)
        refs = []
        for position, nd in enumerate(el.iter('nd')):
            refs.append(shape_way_node(el, position, nd))
        return {'way': attribs, 'way_nodes': refs, 'way_tags': secondary}

    return None

    



# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    After every yielded element the root element is cleared so that
    already-processed content can be released.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The very first event carries the document root.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise Exception naming the first reported error if validation fails.

    Returns None when `validator.validate(element, schema)` is True.
    """
    if validator.validate(element, schema) is True:
        return

    field, errors = next(validator.errors.iteritems())
    raise Exception(
        "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        # Build the encoded copy first, then hand it to the base writer.
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Route every row through writerow() so each one gets encoded.
        for record in rows:
            self.writerow(record)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream the OSM file and write its nodes and ways to the five csv files.

    Each node/way element is shaped with shape_element(); when `validate` is
    True the shaped dict is checked with validate_element() before its rows
    are written.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # Every csv starts with its header row.
        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            kind = element.tag
            if kind == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif kind == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)