In [None]:
import pandas as pd
import recordlinkage as rl
from   recordlinkage.preprocessing import clean


In [1]:
# --------------------------------------------------------------------
#      cities - GeoNames
# --------------------------------------------------------------------
# geonameid         : integer id of record in geonames database
# name              : name of geographical point (utf8) varchar(200)
# asciiname         : name of geographical point in plain ascii characters, varchar(200)
# alternatenames    : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)
# latitude          : latitude in decimal degrees (wgs84)
# longitude         : longitude in decimal degrees (wgs84)
# feature class     : see http://www.geonames.org/export/codes.html, char(1)
# feature code      : see http://www.geonames.org/export/codes.html, varchar(10)
# country code      : ISO-3166 2-letter country code, 2 characters
# cc2               : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters
# admin1 code       : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
# admin2 code       : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) 
# admin3 code       : code for third level administrative division, varchar(20)
# admin4 code       : code for fourth level administrative division, varchar(20)
# population        : bigint (8 byte int) 
# elevation         : in meters, integer
# dem               : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.
# timezone          : the iana timezone id (see file timeZone.txt) varchar(40)
# modification date : date of last modification in yyyy-MM-dd format



In [None]:
def df_import_geo_cities(citiesFile):
    """Load a GeoNames cities dump and keep only the naming/location columns.

    The input is read as tab-separated text without a header row; the GeoNames
    column names are assigned explicitly.

    Parameters
    ----------
    citiesFile : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input record with the columns ``geonameid``, ``name``,
        ``asciiname``, ``alternatenames``, ``latitude``, ``longitude``,
        ``feature_code``, ``country_code`` and ``cc2``.
    """
    geocols = [
        'geonameid', 'name', 'asciiname', 'alternatenames',
        'latitude', 'longitude', 'feature_class', 'feature_code',
        'country_code', 'cc2',
        'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
        'population', 'elevation', 'dem', 'timezone', 'modification date',
    ]
    # Columns that are parsed but discarded before returning.
    unused_cols = [
        'feature_class',
        'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
        'population', 'elevation', 'dem', 'timezone', 'modification date',
    ]

    # ``infer_datetime_format`` was removed from this call: no ``parse_dates``
    # is requested, so it never had any effect, and the argument is deprecated
    # since pandas 2.0.
    # NOTE(review): with pandas' default NA handling a literal "NA" field is
    # read as NaN -- confirm this is acceptable for ``country_code``.
    cities = pd.read_csv(
        citiesFile,
        sep='\t',
        header=None,
        low_memory=False,
        names=geocols,
    )

    # Return a new frame instead of mutating in place.
    return cities.drop(columns=unused_cols)


In [None]:
# ----------------------------------------------
#   Country - GeoNames
# ----------------------------------------------

def df_import_geo_country(countryFile):
    """Load the GeoNames country table from a tab-separated file.

    The first line of the file is used as the header row.

    Parameters
    ----------
    countryFile : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with column names taken from the file header.
    """
    # ``infer_datetime_format`` was removed from this call: no ``parse_dates``
    # is requested, so it never had any effect, and the argument is deprecated
    # since pandas 2.0.
    # NOTE(review): with pandas' default NA handling a literal "NA" field
    # (possibly a country or continent code) is read as NaN -- confirm
    # whether ``keep_default_na=False`` is needed here.
    return pd.read_csv(
        countryFile,
        sep='\t',
        low_memory=False,
    )

       
# country[country.Country == 'Argentina'].head(10)
# countrygid.head(10)


In [None]:
# ----------------------------------------------
# alternate names - GeoNames
# ----------------------------------------------
# alternateNameId   : the id of this alternate name, int
# geonameid         : geonameId referring to id in table 'geoname', int
# isolanguage       : iso 639 language code 2- or 3-characters; 4-characters 'post' for postal codes and 'iata','icao' and faac for airport codes, fr_1793 for French Revolution names,  abbr for abbreviation, link to a website (mostly to wikipedia), wkdt for the wikidataid, varchar(7)
# alternate name    : alternate name or name variant, varchar(400)
# isPreferredName   : '1', if this alternate name is an official/preferred name
# isShortName       : '1', if this is a short name like 'California' for 'State of California'
# isColloquial      : '1', if this alternate name is a colloquial or slang term. Example: 'Big Apple' for 'New York'.
# isHistoric        : '1', if this alternate name is historic and was used in the past. Example 'Bombay' for 'Mumbai'.
# from              : from period when the name was used
# to                : to period when the name was used



In [None]:
def df_import_geo_alternate_names(alternateFile):
    """Load the GeoNames alternate-names table from a tab-separated file.

    The file is read without a header row; the column names are assigned
    explicitly.

    Parameters
    ----------
    alternateFile : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input record with the columns ``alternateNameId``,
        ``geonameid``, ``isolanguage``, ``alternate``, ``isPreferredName``,
        ``isShortName``, ``isColloquial``, ``isHistoric``, ``from``, ``to``.
    """
    alternatecols = [
        'alternateNameId', 'geonameid', 'isolanguage', 'alternate',
        'isPreferredName', 'isShortName', 'isColloquial', 'isHistoric',
        'from', 'to',
    ]

    # ``infer_datetime_format`` was removed from this call: no ``parse_dates``
    # is requested, so it never had any effect, and the argument is deprecated
    # since pandas 2.0.
    return pd.read_csv(
        alternateFile,
        sep='\t',
        header=None,
        low_memory=False,
        names=alternatecols,
    )

# alternaNames.head(10)

# The only name that appears more than once is "San Martín".
# CountryEs.CountryES.value_counts()
# alternateES[alternateES.alternate == 'San Martín' ]

# Example of the problem that produces duplicates:
# CountryEs[CountryEs.CountryES == 'San Martín' ]


In [None]:
def genera_country(countryFile=None, alternateFile=None, outputFile=None):
    """Build the country-name conversion table and write it to a CSV file.

    The country table is joined on ``geonameid`` with the alternate names
    whose language is one of ``es``, ``en``, ``de``, ``it`` or ``fr``. The
    result keeps the columns ``Country`` (renamed ``country_corregido``),
    ``alternate`` (renamed ``country_alt``), ``ISO``, ``ISO3``, ``fips`` and
    ``Phone`` and is written with ``DataFrame.to_csv`` (index included).

    Parameters
    ----------
    countryFile : str, optional
        Path of the country table. Defaults to
        ``DataSets/GeoNames/CountryInfo.csv`` under the current directory.
    alternateFile : str, optional
        Path of the alternate-names table. Defaults to
        ``DataSets/GeoNames/alternateNamesV2.txt`` under the current directory.
    outputFile : str, optional
        Path of the CSV to write. Defaults to
        ``DataSets/GeoNames/country_alt.csv`` under the current directory.

    Returns
    -------
    None
    """
    import os  # local import so the block does not rely on a file-level import

    # Build the default paths with the platform separator; on Windows this
    # yields exactly the previous '.\\DataSets\\GeoNames\\...' strings, while
    # remaining usable on other platforms.
    geonames_dir = os.path.join('.', 'DataSets', 'GeoNames')
    if countryFile is None:
        countryFile = os.path.join(geonames_dir, 'CountryInfo.csv')
    if alternateFile is None:
        alternateFile = os.path.join(geonames_dir, 'alternateNamesV2.txt')
    if outputFile is None:
        outputFile = os.path.join(geonames_dir, 'country_alt.csv')

    # Load the countries.
    country = df_import_geo_country(countryFile)
    country_ids = country.geonameid

    # Load the names in other languages.
    alt_names = df_import_geo_alternate_names(alternateFile)

    # Keep only alternate names that belong to a country and are in one of
    # the most used languages.
    is_country = alt_names.geonameid.isin(country_ids)
    is_wanted_language = alt_names.isolanguage.isin(['es', 'en', 'de', 'it', 'fr'])
    alternate_x = alt_names.loc[is_country & is_wanted_language, ['geonameid', 'alternate']]

    # Assemble the conversion table.
    merged = pd.merge(country, alternate_x, on='geonameid')
    country_alt = merged[['Country', 'alternate', 'ISO', 'ISO3', 'fips', 'Phone']].rename(
        index=str,
        columns={"Country": "country_corregido", "alternate": "country_alt"},
    )

    # Write the file.
    country_alt.to_csv(outputFile)
    return



In [None]:
def df_import_country(countryFile):
    """Load the pre-built country conversion table and normalise its names.

    Reads the CSV, keeps the ``country_corregido`` and ``country_alt``
    columns, cleans both with ``recordlinkage``'s ``clean`` (brackets kept,
    accents stripped with the ``'unicode'`` method), removes duplicate rows
    and finally drops a fixed set of rows by index label.

    Parameters
    ----------
    countryFile : str, path object or file-like object
        CSV containing at least the columns ``country_corregido`` and
        ``country_alt``.

    Returns
    -------
    pandas.DataFrame
        The cleaned, de-duplicated two-column table.

    Raises
    ------
    KeyError
        If any of the hard-coded index labels is not present in the table.
    """
    # Load from the previously processed file (faster). Malformed lines are
    # skipped with a warning. ``error_bad_lines=False`` was removed in
    # pandas 2.0; ``on_bad_lines='warn'`` is its replacement (pandas >= 1.3).
    # Older pandas rejects the new keyword with a TypeError, in which case the
    # legacy argument is used.
    try:
        raw = pd.read_csv(countryFile, on_bad_lines='warn', low_memory=False)
    except TypeError:
        raw = pd.read_csv(countryFile, error_bad_lines=False, low_memory=False)

    country_alt = raw[['country_corregido', 'country_alt']].drop_duplicates()

    # Clean both name columns of the received file, just in case.
    for col in ('country_corregido', 'country_alt'):
        country_alt[col] = rl.preprocessing.cleaning.clean(
            country_alt[col],
            remove_brackets=False,
            strip_accents='unicode'
        )

    # Cleaning can make previously distinct rows identical.
    country_alt = country_alt.drop_duplicates()

    # Rows removed by index label, as annotated in the original code:
    #   280  -> Congo
    #   287  -> Kongo
    #   1576 -> Isla de San Martín
    #   1573 -> Saint-Martin
    #   1577 -> San Martín
    # NOTE(review): these labels depend on the exact row order of the input
    # file; regenerating that file may shift them -- confirm after any update.
    country_alt = country_alt.drop([280, 287, 1576, 1573, 1577])

    return country_alt
