In this notebook we reuse some of the functions implemented in the Constructing the Mappings notebook in order to process the "cities15000.txt" file provided by the GeoNames database. This text file lists the largest cities in the world, which are therefore the most likely matches for a given city name. That is why we replaced our intermediate city mapping with this one.

In [6]:
import os
import numpy as np
import pandas as pd
import geocoder, geopy
import time
import unicodedata
import pickle
import contextlib
from tqdm import tqdm

Taking the helper functions from the Constructing the Mappings Notebook

In [35]:
#https://stackoverflow.com/questions/8694815/removing-accent-and-special-characters
def remove_accents(data):
    """Reduce a string to its lowercase, accent-free letters.

    The text is NFKD-decomposed so that accents become separate combining
    marks, then every character whose Unicode category is not a letter
    ("L*") is discarded. This removes the accents, but also spaces, digits
    and punctuation.

    Parameters
    ----------
    data : str or None
        Text to normalise.

    Returns
    -------
    str or None
        The lowercase letters of ``data`` joined together, or ``None`` when
        ``data`` is ``None``.
    """
    if data is None:
        return None

    decomposed = unicodedata.normalize('NFKD', data)
    letters = [
        char.lower().strip()
        for char in decomposed
        if unicodedata.category(char).startswith('L')
    ]
    return ''.join(letters).lower()

def string_formatting(string):
    """Tokenise a string and return the raw tokens next to their cleaned form.

    Hyphens, spaces and commas are all treated as separators. Consecutive
    separators produce empty tokens, which are kept.

    Parameters
    ----------
    string : str
        Text to split.

    Returns
    -------
    tuple of (list of str, list of str)
        The tokens as they appear in ``string``, and the same tokens
        lowercased and passed through ``remove_accents``.
    """
    tokens = string.replace("-", " ").replace(" ", ",").split(",")
    cleaned = list(map(lambda token: remove_accents(token.lower()), tokens))
    return tokens, cleaned

def clean_sublist(x):
    """Flatten nested lists into the distinct, non-empty values they contain.

    Parameters
    ----------
    x : sequence
        Lists (or arrays) to be stacked with ``np.hstack``.

    Returns
    -------
    list
        Each truthy value once; the order is unspecified because the values
        go through a set.
    """
    flattened = np.hstack(x)
    distinct = {value for value in flattened if value}
    return list(distinct)

def remove_accents_in_sublist(l):
    """Return the lowercase, accent-free form of every string in ``l``.

    Each element is lowercased and passed through ``remove_accents``; the
    result has the same length and order as ``l``.
    """
    return [remove_accents(name.lower()) for name in l]
    
def remove_accents_in_list(lists):
    """Apply ``remove_accents_in_sublist`` to every sub-list of ``lists``.

    Returns a new list of lists with the same nesting and order.
    """
    return [remove_accents_in_sublist(sublist) for sublist in lists]

def clean_and_remove_accents_in_list(lists):
    """Strip accents from every sub-list, then de-duplicate it.

    For each sub-list the strings are normalised with
    ``remove_accents_in_sublist`` and the result is reduced to its distinct,
    non-empty values with ``clean_sublist``.
    """
    cleaned = []
    for sublist in lists:
        without_accents = remove_accents_in_sublist(sublist)
        cleaned.append(clean_sublist(without_accents))
    return cleaned

def convert_df_to_dict(df, do_prints = False):
    """Explode a frame of name lists into one row per spelling.

    Every row of ``df`` holds lists of names. Each distinct name found in a
    row, together with its accent-stripped variant, becomes one row of the
    result: the spelling is the index and the source row's index label is
    stored in column ``0``.

    Despite its name, this function returns a DataFrame rather than a dict,
    so a spelling shared by several source rows appears once per source row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are lists of strings.
    do_prints : bool, default False
        When true, print the wall-clock time spent in each stage.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column ``0``) indexed by spelling.
    """
    # Flatten each row into its distinct, non-empty names
    t = time.time()
    df_list = [clean_sublist(row) for row in df.values.tolist()]
    if do_prints : print("Converting to list :", time.time()-t)

    # Accent-free variant of every name
    t = time.time()
    df_variants = clean_and_remove_accents_in_list(df_list)
    if do_prints : print("Getting variants :", time.time()-t)

    # Original spellings and accent-free variants, without duplicates
    t = time.time()
    df_all = [
        list(set(originals + variants))
        for originals, variants in zip(df_list, df_variants)
    ]
    if do_prints : print("Combining Lists :", time.time()-t)

    # Repeat each row's index label once per spelling of that row
    t = time.time()
    keys = [[label] * len(names) for label, names in zip(df.index, df_all)]
    if do_prints : print("Getting all keys :", time.time()-t)

    # Flatten with comprehensions: sum(list_of_lists, []) copies the growing
    # accumulator at every step and is quadratic in the number of spellings.
    t = time.time()
    flat_names = [name for names in df_all for name in names]
    flat_keys = [key for row_keys in keys for key in row_keys]
    mapping = pd.DataFrame(index = flat_names, data = flat_keys)
    if do_prints : print("Converting to dict :", time.time()-t)

    return mapping

def extract_alternate_names(x):
    """Split a comma-separated string of names into a list.

    Parameters
    ----------
    x : str or any
        Comma-separated names. Missing values (``NaN``, ``None``) and other
        objects that cannot be split on a ``","`` string are tolerated.

    Returns
    -------
    list of str
        The names, or an empty list when ``x`` cannot be split.
    """
    try:
        return x.split(",")
    except (AttributeError, TypeError):
        # Only "not a splittable string" means "no names"; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate.
        return []

Taking a variant of the `process_dataframe` function so that it better corresponds to the new text file

In [11]:
def process_dataframe(full_filename, do_prints = False):
    """Load a GeoNames cities dump and build the two name frames.

    The tab-separated file is read with the 19 GeoNames columns, reduced to
    the name columns, and each name column is turned into a list of strings
    with ``extract_alternate_names``.

    Parameters
    ----------
    full_filename : str or path-like
        Location of the tab-separated file (no header row).
    do_prints : bool, default False
        When true, print a short message after each stage.

    Returns
    -------
    cities : pandas.DataFrame
        Columns ``name``, ``asciiname`` and ``alternatenames`` (lists of
        strings), indexed by ``country code``.
    pop : pandas.DataFrame
        The same three columns, indexed by ``population``.
    """
    dtypes = [int,str, str,str,float,float,str,str,\
             str,str,str,str, str,str,int,str,\
             str,str,str]

    columns = ["geonameid","name", "asciiname","alternatenames",\
               "latitude","longitude","feature class","feature code",\
               "country code","cc2","admin1 code","admin2 code",\
               "admin3 code","admin4 code","population","elevation",\
               "dem","timezone","modification date"]

    # Load the text file as a csv.
    # Only empty fields count as missing: pandas' default NA list contains
    # "NA", which would turn Namibia's ISO country code into NaN.
    cities = pd.read_csv(
        full_filename,
        sep = "\t",
        header = None,
        names = columns,
        dtype = dict(zip(columns, dtypes)),
        keep_default_na = False,
        na_values = [""],
    )

    if do_prints: print("Loaded")

    # Keep only the relevant columns; copy so that the column assignments
    # below act on an independent frame rather than on a slice of the load.
    cities = cities[["name","asciiname", "alternatenames", "country code","population"]].copy()

    # Turn every name column into a list of names (missing values become [])
    for column in ("name", "asciiname", "alternatenames"):
        cities[column] = cities[column].astype("object").apply(extract_alternate_names)

    if do_prints: print("Processed")

    # Same name columns, indexed once by population and once by country code
    pop = cities.drop(columns = ["country code"]).set_index("population")
    cities = cities.drop(columns = ["population"]).set_index("country code")

    if do_prints: print("Indexed")

    return cities, pop

In [20]:
# Build the two name frames from the GeoNames dump stored under
# "Mapping Files" in the current working directory.
interm_city, interm_pop = process_dataframe(os.path.join(os.getcwd(), "Mapping Files","cities15000.txt"), do_prints = False)

In [14]:
# Preview the first rows: name lists indexed by country code
interm_city.head(20)

Unnamed: 0_level_0,name,asciiname,alternatenames
country code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AD,[les Escaldes],[les Escaldes],"[Ehskal'des-Ehndzhordani, Escaldes, Escaldes-E..."
AD,[Andorra la Vella],[Andorra la Vella],"[ALV, Ando-la-Vyey, Andora, Andora la Vela, An..."
AE,[Umm al Qaywayn],[Umm al Qaywayn],"[Oumm al Qaiwain, Oumm al Qaïwaïn, Um al Kawai..."
AE,[Ras al-Khaimah],[Ras al-Khaimah],"[Julfa, Khaimah, RKT, Ra's al Khaymah, Ra's al..."
AE,[Khawr Fakkān],[Khawr Fakkan],"[Fakkan, Fakkān, Khawr Fakkan, Khawr Fakkān, K..."
AE,[Dubai],[Dubai],"[DXB, Dabei, Dibai, Dibay, Doubayi, Dubae, Dub..."
AE,[Dibba Al-Fujairah],[Dibba Al-Fujairah],"[Al-Fujairah, BYB, Dibba Al-Fujairah, dba alfj..."
AE,[Dibba Al-Hisn],[Dibba Al-Hisn],"[BYB, Daba, Daba al-Hisn, Dabā, Dabā al-Ḥiṣn, ..."
AE,[Sharjah],[Sharjah],"[Al Sharjah, Ash 'Mariqah, Ash Shariqa, Ash Sh..."
AE,[Ar Ruways],[Ar Ruways],"[Ar Ru'ays, Ar Ruways, Ar Ru’ays, Ar-Ruvais, R..."


In [36]:
# Despite the helper's name the result is a DataFrame: one row per spelling
# (the index), with the source row's country code in column 0.
interm_city_df = convert_df_to_dict(interm_city, do_prints = False)

In [37]:
# Same conversion for the population frame: one row per spelling (the index),
# with the source row's population in column 0.
interm_pop_df = convert_df_to_dict(interm_pop, do_prints = False)

In [17]:
# Persist the spelling -> country-code mapping.
# `interm_city_dict` was never assigned in the notebook, so it is derived here
# from the frame built by convert_df_to_dict (spellings as index, code in
# column 0). A dict keeps only the last row of a spelling shared by several
# cities.
# NOTE(review): assumes the pickle is meant to hold a plain dict, as the
# variable name suggests -- confirm against the consumers of this file.
interm_city_dict = dict(zip(interm_city_df.index, interm_city_df[0]))

# The context manager closes the file even if pickling fails
with open(os.path.join(os.getcwd(), "cities15000.pickle"), 'wb') as pickle_file:
    pickle.dump(interm_city_dict, pickle_file, protocol=4)

In [22]:
# Persist the spelling -> population mapping.
# `interm_pop_dict` was never assigned in the notebook, so it is derived here
# from the frame built by convert_df_to_dict (spellings as index, population
# in column 0). A dict keeps only the last row of a spelling shared by several
# cities.
# NOTE(review): assumes the pickle is meant to hold a plain dict, as the
# variable name suggests -- confirm against the consumers of this file.
interm_pop_dict = dict(zip(interm_pop_df.index, interm_pop_df[0]))

# The context manager closes the file even if pickling fails
with open(os.path.join(os.getcwd(), "cities15000_pop.pickle"), 'wb') as pickle_file:
    pickle.dump(interm_pop_dict, pickle_file, protocol=4)

In [24]:
# NOTE(review): `city_pkl` and `pop_pkl` are not defined in the visible part
# of the notebook -- presumably the pickled mappings loaded back; confirm.
empty_pop_dict = {}
full_city_mappings = {}
full_pop_mappings = {}
n_empty_pop = 0
n_cities_tot = 0


# Conditionally add the pickle content: for each city name keep the country
# code that comes with the largest population.
for city_name, country_code in city_pkl.items():
    n_cities_tot += 1

    # Names without a population count as 0 and are recorded separately
    if pop_pkl.get(city_name) is None:
        pop_pkl[city_name] = 0
        empty_pop_dict[city_name] = 'Pas de data'
        n_empty_pop += 1

    population = pop_pkl[city_name]
    is_new_name = full_city_mappings.get(city_name) is None

    # First occurrence, or a more populous city with the same name
    if is_new_name or population > full_pop_mappings[city_name]:
        full_city_mappings[city_name] = country_code
        full_pop_mappings[city_name] = population

In [38]:
# Spot-check a well-known name. The recorded output is 'ZA'.
# NOTE(review): presumably 'FR' was expected here -- confirm.
full_city_mappings['Paris']

'ZA'

ValueError: can not merge DataFrame with instance of type <class 'dict'>

In [68]:
empty_pop_dict = {}
full_city_mappings = {}
full_pop_mappings = {}
n_empty_pop = 0
n_cities_tot = 0

citylist = list(interm_city_df.index)

# The loop pairs row i of the country-code frame with row i of the population
# frame, so both must list the same spellings in the same order.
assert interm_city_df.index.equals(interm_pop_df.index), \
    "city and population mappings are not row-aligned"

# Conditionally add every spelling: when several cities share a name, keep
# the country code of the most populous one. The columns are pulled out once
# instead of being looked up with .iloc on every iteration.
for city_name, country_code, population in zip(citylist, interm_city_df[0], interm_pop_df[0]):
    n_cities_tot += 1

    is_new_name = full_city_mappings.get(city_name) is None

    # First occurrence, or a more populous city with the same name
    if is_new_name or population > full_pop_mappings[city_name]:
        full_city_mappings[city_name] = country_code
        full_pop_mappings[city_name] = population

In [79]:
import json

# Write the final spelling -> country-code mapping. The context manager
# guarantees the file is flushed and closed once the dump is done.
with open('countries15000_with_pop.json', 'w') as json_file:
    json.dump(full_city_mappings, json_file)