# In [None]:  (Jupyter notebook cell prompt left over from the export)
# -*- coding: utf-8 -*-

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
from pypinyin import pinyin, lazy_pinyin
#import cerberus

#import schema
# Lookup table used to normalise individual words of a tag value:
# each key is a spelling seen in the data, each value its replacement.
mapping = {
    # Street (English abbreviations and pinyin "jie")
    "St": "Street",
    "St.": "Street",
    "ST": "Street",
    "street": "Street",
    "st": "Street",
    "Jie": "Street",
    "jie": "Street",
    # Road (including the "raod" misspelling and pinyin "lu")
    "Rd": "Road",
    "raod": "Road",
    "road": "Road",
    "Lu": "Road",
    # Lane
    "Ln": "Lane",
    # Boulevard
    "BLVD": "Boulevard",
    # Avenue (including the "Acenue" misspelling)
    "Acenue": "Avenue",
    "Ave": "Avenue",
    "avenue": "Avenue",
    "Av": "Avenue",
    # Highway
    "Hwy": "Highway",
    # Boulevard (mixed-case spelling)
    "Blvd": "Boulevard",
    # Court
    "Ct": "Court",
    # Cardinal directions
    "E": "East",
    "S": "South",
    "W": "West",
    "N": "North",
    "S.": "South",
    # Intercardinal directions
    "NE": "Northeast",
    "NW": "Northwest",
    "SE": "Southeast",
    "SW": "Southwest",
    # Pinyin capitalisation
    "Dadao": "DaDao",
}


# --- Input -----------------------------------------------------------
# OpenStreetMap XML file that process_map() reads.
OSM_PATH = "/Users/chebyshev/Downloads/THmap"

# --- Output ----------------------------------------------------------
# One CSV file per target table.
NODES_PATH = "/Users/chebyshev/Desktop/DA/P3/nodes.csv"
NODE_TAGS_PATH = "/Users/chebyshev/Desktop/DA/P3/nodes_tags.csv"
WAYS_PATH = "/Users/chebyshev/Desktop/DA/P3/ways.csv"
WAY_NODES_PATH = "/Users/chebyshev/Desktop/DA/P3/ways_nodes.csv"
WAY_TAGS_PATH = "/Users/chebyshev/Desktop/DA/P3/ways_tags.csv"

# --- Patterns --------------------------------------------------------
# Keys of the form "<lowercase/underscore>:<lowercase/underscore>...".
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character that is considered problematic inside a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# One or more characters in the range U+4E00..U+9FA5.
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
#SCHEMA = schema.schema

# --- CSV columns -----------------------------------------------------
# Make sure the fields order in the csvs matches the column order in the
# sql table schema.
NODE_FIELDS = [
    'id', 'lat', 'lon',
    'user', 'uid', 'version', 'changeset', 'timestamp',
]
NODE_TAGS_FIELDS = [
    'id', 'key', 'value', 'type',
]
WAY_FIELDS = [
    'id',
    'user', 'uid', 'version', 'changeset', 'timestamp',
]
WAY_TAGS_FIELDS = [
    'id', 'key', 'value', 'type',
]
WAY_NODES_FIELDS = [
    'id', 'node_id', 'position',
]


def _shape_tag(tag, element_id, problem_chars, default_tag_type):
    """Build the row dict for one <tag> child, or return None to skip it.

    Args:
        tag: the <tag> element; its 'k' and 'v' attributes are read.
        element_id: id of the enclosing node/way, stored under 'id'.
        problem_chars: compiled regex; a key it matches anywhere is rejected.
        default_tag_type: 'type' value for keys that contain no colon.

    Returns:
        A dict with the keys 'id', 'key', 'value' and 'type', or None when
        the tag key contains a problematic character.
    """
    raw_key = tag.attrib['k']

    # search() rather than match(): match() only looks at the start of the
    # string, so a problematic character later in the key would slip through.
    if problem_chars.search(raw_key) is not None:
        return None

    if ':' in raw_key:
        # Split on the first colon only: the part before it is the tag type,
        # everything after it (further colons included) stays in the key.
        tag_type, _, key = raw_key.partition(':')
    else:
        tag_type, key = default_tag_type, raw_key

    return {
        'id': element_id,
        'key': key,
        # update_name() is applied to every tag value, not only street names.
        'value': update_name(tag.attrib['v']),
        'type': tag_type,
    }


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict.

    Args:
        element: an ElementTree element; only 'node' and 'way' are handled.
        node_attr_fields: attribute names copied from a node element.
        way_attr_fields: attribute names copied from a way element.
        problem_chars: compiled regex; tags whose key contains a match are
            dropped.
        default_tag_type: 'type' assigned to tags whose key has no colon.

    Returns:
        For a node: {'node': attribs, 'node_tags': [tag rows]}.
        For a way:  {'way': attribs, 'way_nodes': [nd rows],
                     'way_tags': [tag rows]}.
        For any other element: None.

    Raises:
        KeyError: if the element lacks one of the requested attributes, or a
            <tag>/<nd> child lacks 'k', 'v' or 'ref'.
    """
    if element.tag == 'node':
        attr_fields = node_attr_fields
    elif element.tag == 'way':
        attr_fields = way_attr_fields
    else:
        return None

    element_id = element.attrib['id']

    attribs = dict((field, element.attrib[field]) for field in attr_fields)
    if 'user' in attribs:
        # User names may contain Chinese characters; transliterate them.
        attribs['user'] = Chinese_English(attribs['user'])

    # Secondary tags are handled the same way for both node and way elements.
    tag_rows = []
    for tag in element.iter('tag'):
        row = _shape_tag(tag, element_id, problem_chars, default_tag_type)
        if row is not None:
            tag_rows.append(row)

    if element.tag == 'node':
        return {'node': attribs, 'node_tags': tag_rows}

    # 'position' is the 0-based index of each <nd> reference within the way.
    way_nodes = [
        {'id': element_id, 'node_id': nd.attrib['ref'], 'position': position}
        for position, nd in enumerate(element.iter('nd'))
    ]
    return {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tag_rows}

def update_name(name):
    """Return *name* with Chinese transliterated and known words replaced.

    The text is first passed through Chinese_English(), then split on
    whitespace; every word that is a key of ``mapping`` is swapped for its
    mapped value and the words are joined back with single spaces.
    """
    words = Chinese_English(name).split()
    # join idiom: http://blog.csdn.net/chixujohnny/article/details/53301995
    return " ".join(mapping.get(word, word) for word in words)
def Chinese_English(name):
    """Return *name* as concatenated pinyin if it contains Chinese text.

    Strings in which ``zhPattern`` finds nothing are returned unchanged;
    otherwise the pieces produced by ``lazy_pinyin`` are joined with no
    separator.

    Reference: https://zhidao.baidu.com/question/1958984257226168500.html
    """
    if zhPattern.search(name) is None:
        return name
    return ''.join(lazy_pinyin(name))
def Traditional2Simplified(sentence):
    """Return *sentence* converted with ``Converter('zh-hans')``.

    Reference: http://blog.csdn.net/wds2006sdo/article/details/53583367

    NOTE(review): ``Converter`` is neither imported nor defined in the
    visible part of this file, so calling this function raises NameError
    unless the name is provided elsewhere - confirm before use.
    """
    return Converter('zh-hans').convert(sentence)

   
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in *tags*.

    The file is parsed incrementally. After the consumer has finished with a
    yielded element, the root element is cleared before parsing continues.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event delivers the root element, which is kept for clearing.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


#def validate_element(element, validator, schema=SCHEMA):
#    """Raise ValidationError if element does not match schema"""
#    if validator.validate(element, schema) is not True:
#        field, errors = next(validator.errors.iteritems())
 #       message_string = "\nElement of type '{0}' has the following errors:\n{1}"
 #       error_string = pprint.pformat(errors)
 #       
 #       raise Exception(message_string.format(field, error_string))


#class UnicodeDictWriter(csv.DictWriter, object):
#    """Extend csv.DictWriter to handle Unicode input"""

#    def writerow(self, row):
#        super(UnicodeDictWriter, self).writerow({
#            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
#        })

#    def writerows(self, rows):
#        for row in rows:
  #          self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
class _Utf8DictWriter(csv.DictWriter, object):
    """csv.DictWriter that also accepts unicode text on Python 2.

    On Python 2 the csv module writes byte strings, so unicode cells are
    UTF-8 encoded before being handed to it. On Python 3 rows are passed
    through unchanged.
    """

    def writerow(self, row):
        if str is bytes:  # True only on Python 2
            text_type = type(u'')
            row = dict(
                (key, value.encode('utf-8') if isinstance(value, text_type) else value)
                for key, value in row.items()
            )
        return super(_Utf8DictWriter, self).writerow(row)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


def _open_csv(path):
    """Open *path* for csv output: bytes on Python 2, UTF-8 text on Python 3."""
    if str is bytes:  # True only on Python 2
        return open(path, 'wb')
    # newline='' lets the csv module control line endings itself.
    return open(path, 'w', newline='', encoding='utf-8')


def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: path (or file object) of the OSM XML, passed to get_element().
        validate: accepted for interface compatibility but not used; this
            function performs no schema validation.

    Side effects:
        Overwrites NODES_PATH, NODE_TAGS_PATH, WAYS_PATH, WAY_NODES_PATH and
        WAY_TAGS_PATH, each with a header row followed by the shaped rows.
    """
    with _open_csv(NODES_PATH) as nodes_file, \
         _open_csv(NODE_TAGS_PATH) as nodes_tags_file, \
         _open_csv(WAYS_PATH) as ways_file, \
         _open_csv(WAY_NODES_PATH) as way_nodes_file, \
         _open_csv(WAY_TAGS_PATH) as way_tags_file:

        nodes_writer = _Utf8DictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = _Utf8DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = _Utf8DictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = _Utf8DictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = _Utf8DictWriter(way_tags_file, WAY_TAGS_FIELDS)

        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if not el:
                continue

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Script entry point: convert the OSM file at OSM_PATH into the CSV files.
    # NOTE(review): validate=True is passed, but process_map() never reads
    # the flag and the validate_element() helper in this file is commented
    # out, so no validation (and none of its slowdown) takes place here.
    process_map(OSM_PATH, validate=True)