In [2]:
import xml.etree.ElementTree as ET
import codecs
import json
import re

In [2]:
def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

input_ankara = "ankara_turkey.osm"
root = get_root(input_ankara)

In [3]:
def count_tags(root,tag):
    tag_count = 0
    for i in root.findall(tag):
        tag_count += 1
    return tag_count


def count_users(root):
    user = set()
    for i in root.findall('node') :
        try:
            user.add(i.attrib['user'])
        except:
            pass
    for i in root.findall('way'):
        try:
            user.add(i.attrib['user'])
        except:
            pass
    return  len(user) 
 

In [4]:
counts = {}
for i in ['node', 'way', 'relation' ]:
    counts[i] = count_tags(root,i)
counts['user'] = count_users(root)
print counts

{'node': 638467, 'relation': 220, 'user': 500, 'way': 92056}


In [7]:
def extend_abbrevations(address):
    extended_address = ''
    abbrevations = { 'Cd.': 'Caddesi', 'Cad.':'Caddesi', 'Cd':'Caddesi', 'Sk.':'Sokak', 
                    'Mah.':'Mahallesi', 'Blv.':u'Bulvar\u0131', 'Bul.':u'Bulvar\u0131'}
    for word in address.split():
        
        try:
            extended_address += abbrevations[word] + ' ' #check whether this word is an abbrevation 
            # print "before cleaning: ", address, "\t after cleaning: ", extended_address
        except:
            extended_address += word+ ' '
    return extended_address[:-1] #discard the last character, whitespace

In [21]:
def extend_abbrevations_split_by_multiple(address):
    extended_address = ''
    abbrevations = { 'Cd.': 'Caddesi', 'Cad.':'Caddesi', 'Cd':'Caddesi', 'Sk.':'Sokak', 
                    'Mah.':'Mahallesi', 'Blv.':u'Bulvar\u0131', 'Bul.':u'Bulvar\u0131'}
    splitted_address = re.findall(r'[^,;\s]+', address)
    for word in splitted_address:
        print word
        try:
            extended_address += abbrevations[word] + ' ' #check whether this word is an abbrevation 
            # print "before cleaning: ", address, "\t after cleaning: ", extended_address
        except:
            extended_address += word+ ' '
    return extended_address[:-1] #discard the last character, whitespace


In [9]:
def capitalize_turkish_title(address):
     
    capitalized_address = ''
    connectors = ['ve', 'de', 'ile', 'ki', 'ya', 'ya da'] #this words should not be capitalized
    for word in address.split(' ',';'):
         
        capitalized_address += word + ' ' if word in connectors else word.title()+' '
        
     
    if address.islower():
        print 'before capitalization: ', address, '\t after capitalization: ', capitalized_address

    return capitalized_address[:-1]  #discard the last character, whitespace

In [5]:
def correct_postcode(postcode):  #postcodes must start with 06, and must be 5 digits long
    if len(postcode) != 5: # found short postcodes like 0600, 65 in the data
        print 'postcode length is wrong', postcode
        return None
    elif re.match(r'(06\d\d\d)',postcode)  == None: #check if all digits (i.e. not Esma 3 Sokak) and starts with 06
        print 'invalid postcode', postcode
        return None
    return postcode
    

## Problems ##
1) Abbreviated street, avenue names (e.g. 'Cad.', 'Cd', 'Sk.')

2) Capitalization problems (e.g. 'cad.' , u'kalekap\u0131s\u0131' , 'ANKARA') 

3) Postcodes should consist of exactly 5 digits, and should start with 06.

In [86]:
global k 
k=0

In [1]:
us_ct = 0
uniq_users = set()
def get_value(element, attribute): #inspired from http://goo.gl/1n3xO5, assigns value if the field exists
    try:
        return element.attrib[attribute]
    except:
        return None
    
def shape_element(element):
    global uniq_users
    node = {}
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag
        node['id'] =  element.attrib['id']
        
        node['created'] = { 'timestamp' : get_value(element,'timestamp'),
                            'version': get_value(element,'version'),
                            'changeset': get_value(element,'changeset'), 
                            'user':get_value(element,'user'), 
                            'uid': get_value(element,'uid')}
        node['visible'] =  get_value(element,'visible')
        
        try:    
            node['pos'] = [float(get_value(element,'lat')), float(get_value(element,'lon'))]
        except:
            pass
        
        try:
            uniq_users.add(get_value(element,'user'))
        except:
            pass
        
        
        for i in element:
        
            if element.tag=="way":
                try:
                    node.setdefault('node_refs',[]).append(i.attrib['ref'])        #http://stackoverflow.com/questions/12905999/python-dict-how-to-create-key-or-append-an-element-to-key
                except:
                    pass
            
            try:
                if "addr" in i.attrib['k']:
                    addr, field = lower_colon.match(i.attrib['k']).group().split(':') 
                    length = len(lower_colon.match(i.attrib['k']).group().split(':'))
                    v = i.attrib['v']

                    if  length <= 2 and addr=='addr' and problemchars.match(v) == None:   
                        if field == 'postcode':
                            v = correct_postcode(v)
                                                    
                        #fix capitalizations, and abbrevations
                        
                        node.setdefault('address',{}).update({ field: capitalize_turkish_title(extend_abbrevations(v))  }) 
                else:
                     
                    if i.attrib['k'] != 'type': #some entries have <tag k = "type" v=...>
                        node[i.attrib['k']] = i.attrib['v']
                        #node[i.attrib['k']] =  capitalize_turkish_title(extend_abbrevations(i.attrib['v']))
                    
            except:
                pass
            
        return node
    else:
  
        return None


In [6]:

def process_map(file_in, pretty = False):
    k = 0
 
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el  = shape_element(element)
            if k > 122150:
                pass
            if el:
                k += 1
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    
    return data

In [None]:
clean_ankara = process_map("ankara_turkey.osm")

before capitalization:  city 	 after capitalization:  City 
before capitalization:  yes 	 after capitalization:  Yes 
before capitalization:  country 	 after capitalization:  Country 
before capitalization:  fuel 	 after capitalization:  Fuel 
before capitalization:  wikipedia 	 after capitalization:  Wikipedia 
before capitalization:  aerodrome 	 after capitalization:  Aerodrome 
before capitalization:  village 	 after capitalization:  Village 
before capitalization:  town 	 after capitalization:  Town 
before capitalization:  village 	 after capitalization:  Village 
before capitalization:  town 	 after capitalization:  Town 
before capitalization:  motorway_junction 	 after capitalization:  Motorway_Junction 
before capitalization:  motorway_junction 	 after capitalization:  Motorway_Junction 
before capitalization:  toll_booth 	 after capitalization:  Toll_Booth 
before capitalization:  motorway_junction 	 after capitalization:  Motorway_Junction 
before capitalization:  motorway_j

In [12]:
ct=0
for i in clean_ankara:
    if ct >310:
        break
    
 
    try:
        #if 'ABD' in i['name']:
        #    print i['name'].encode('utf-8')
        # i['address']
        ct += 1
        if i['address']['city'].islower():
            print i['address']['city']
        #    ct +=1 
        #if i['type']=='node' or i['type']=='way':
                 
    except:
        pass
print ct


311


In [45]:
def capitalize_turkish_title(word):
    capitalized_word = ''
    connectors = ['ve', 'de', 'ile', 'ki', 'ya', 'ya da']
    for w in word.split():
        capitalized_word += w + ' ' if w in connectors else w.title()+' '
    
    return capitalized_word[:-1]  #discard the last space

In [46]:
capitalize_turkish_title('sda saf Esam ve')

'Sda Saf Esam ve'

In [11]:
extend_abbrevations("Hipodrom Cad.;Talat Paşa Bulvarı")

Hipodrom
Cad.;Talat
Paşa
Bulvarı


'Hipodrom Cad.;Talat Pa\xc5\x9fa Bulvar\xc4\xb1'

In [17]:
"Hipodrom Cad.;Talat Paşa Bulvarı".replace(";"split("\s;")

['Hipodrom Cad.;Talat Pa\xc5\x9fa Bulvar\xc4\xb1']

In [19]:
str = 'a,,b,c,'
re.findall(r'[^,;\s]+', str)

['Hipodrom', 'Cad.', 'Talat', 'Pa\xc5\x9fa', 'Bulvar\xc4\xb1']

In [22]:
extend_abbrevations_split_by_multiple("Hipodrom Cad.;Talat Paşa Bulvarı")

Hipodrom
Cad.
Talat
Paşa
Bulvarı


'Hipodrom Caddesi Talat Pa\xc5\x9fa Bulvar\xc4\xb1'