In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
df = pd.read_csv('../../data/dataset-string-similarity.txt', sep='\t', header=None)

In [3]:
df.columns = [
    'name1',
    'name2',
    'target',
    'geonameid_1',
    'geonameid_2',
    'lang1',
    'lang2',
    'cc1',
    'cc2'
]

In [4]:
indexes = set(df['geonameid_1']) | set(df['geonameid_2'])

In [5]:
print(f'Number of Indexes: {len(indexes)}')

Number of Indexes: 2360743


- geonameid         : integer id of record in geonames database
- name              : name of geographical point (utf8) varchar(200)
- asciiname         : name of geographical point in plain ascii characters, varchar(200)
- alternatenames    : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute - - from alternatename table, varchar(10000)
- latitude          : latitude in decimal degrees (wgs84)
- longitude         : longitude in decimal degrees (wgs84)
- feature class     : see http://www.geonames.org/export/codes.html, char(1)
- feature code      : see http://www.geonames.org/export/codes.html, varchar(10)
- country code      : ISO-3166 2-letter country code, 2 characters
- cc2               : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters
- admin1 code       : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
- admin2 code       : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) 
- admin3 code       : code for third level administrative division, varchar(20)
- admin4 code       : code for fourth level administrative division, varchar(20)
- population        : bigint (8 byte int) 
- elevation         : in meters, integer
- dem               : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.
- timezone          : the iana timezone id (see file timeZone.txt) varchar(40)
- modification date : date of last modification in yyyy-MM-dd format


In [6]:
cols = ['geonameid', 'name', 'ascii_name', 'alternate_names',
        'latitude', 'longitude', 'feature_class', 'feature_code',
        'country_code', 'cc2', 'admin1_code', 'admin2_code',
        'admin3_code', 'admin4_code', 'population', 'elevation',
        'dem', 'timezone', 'modification_date']

In [7]:
all_countries = pd.read_csv('../../data/allCountries.txt',
                            sep='\t',
                            header=None,
                            names=cols) 

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
all_countries = all_countries.where((pd.notnull(all_countries)), None)

In [9]:
drop_cols = ['latitude', 'longitude', 'feature_class', 'feature_code',
             'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code', 
             'population', 'elevation', 'dem', 'timezone', 'modification_date']

In [10]:
all_countries.drop(columns=drop_cols, axis=1, inplace=True)

In [11]:
all_countries.to_csv('../../data/all_countries_filtered_columns.csv', index=False, encoding='utf-8')

In [12]:
all_countries.set_index('geonameid', inplace=True)

In [13]:
all_countries_filtered = all_countries.loc[indexes]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [14]:
all_countries_filtered.sample(10)

Unnamed: 0_level_0,name,ascii_name,alternate_names,country_code,cc2
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8242218,Ban Bang Wang,Ban Bang Wang,"Ban Bang Wang,ban bang wang,บ้านบางวัง",TH,
9532754,Rāh Jūyān,Rah Juyan,"Mazra`eh-ye Rah Juyan,Mazra‘eh-ye Rāh Jūyān,Ra...",IR,
7537207,Helong Huanggou Linchang,Helong Huanggou Linchang,"Helong Huanggou Linchang,he long huang gou lin...",CN,
1810510,Gaojiazhai,Gaojiazhai,"Gaojiazhai,Kao-chia-chai,gao jia zhai,高家寨",CN,
10427219,Ryōsei,Ryosei,"Ryosei,Ryōsei,ling xi,綾西",JP,
490247,Sosnovka,Sosnovka,"Kashlyach'ye,Kashlyach’ye,Koshlyach'ye,Koshlya...",RU,
258461,Leventiés,Leventies,"Levendies,Levendiés,Leventies,Leventiés,Λεβεντιές",GR,
2684859,Övre Bondestad,Ovre Bondestad,"Bondestad,OEvre Bondestad,Ovre Bondestad,Övre ...",SE,SE
11190734,Abuli P’oshut,Abuli P'oshut,"Abuli P'oshut,Abuli P’oshut,Աբուլի փոշուտ",AM,
8057771,Tlyavgulovo,Tlyavgulovo,"Tljavgulovo,Tlyavgulovo,Тлявгулово",RU,


In [15]:
all_countries_filtered.to_csv('../../data/all_countries_filtered_rows_&_columns.csv', index=True, encoding='utf-8')

In [23]:
all_countries_filtered.sample(10)

Unnamed: 0_level_0,name,ascii_name,alternate_names,country_code,cc2
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2204091,Madre,Madre,"Madre,Mandre,Mbualu Peak,Nabouwalu,Nambouwalu",FJ,
728915,Murgash,Murgash,"Murgasch,Murgash,Мургаш",BG,
10188458,Lugouxian,Lugouxian,"Lugouxian,lu gou xian,路沟岘",CN,
9629741,Dongfangshun,Dongfangshun,"Dongfangshun,dong fang shun,东方顺",CN,
1699090,Science City of Muñoz,Science City of Munoz,"Lungsod ng Munoz,Lungsod ng Muñoz,Munoz,Munoz ...",PH,
8570416,Hejiaxinji,Hejiaxinji,"Hejiaxinji,he jia xin ji,何家新集",CN,
1803422,Linhai,Linhai,"LHC,Lin-hai-hsien,Linhai,T'ai-chou,Taichow,Tai...",CN,
7676365,Huai Sing,Huai Sing,"Huai Sing,hwy sing,ห้วยซิง",TH,
940433,Wespark,Wespark,"Wespark,West Fort,West Park",ZA,
11236341,Dujiabo Shequ,Dujiabo Shequ,"Dujiabo Shequ,Dujiabocun,du jia po cun,du jia ...",CN,
