In [1]:
import pandas as pd
import editdistance
import geopy.distance as dst

## Read new and old pandas dataframe

In [2]:
df = pd.read_csv('/code/gocode/src/geosure/mdf20180806.tsv', encoding='utf-16',sep='\t')
df_na = pd.read_csv('/Users/jorge/Desktop/mdf_na_new.csv', encoding='latin1')
df_na.columns = [r.lower() for r in df_na.columns]
# df_tx = pd.read_csv('/Users/jorge/Desktop/mdf_texas.csv')

print(df.shape)
print(df_na.shape)
# df_na = pd.read_csv('/Users/jorge/Desktop/na_mdf.csv')

(23471, 25)
(4159, 23)


## Organized needed columns

In [9]:
old_columns = df.columns[:-7].tolist()
new_columns = df.columns[-7:].tolist()

print(old_columns)
print('----------')
print(new_columns)


['geosure_id', 'record', 'continent', 'wiki_continent', 'country', 'wiki_country', 'country_code', 'province', 'wiki_city', 'city', 'city_original', 'city_type', 'district', 'language', 'language_secondary', 'population', 'latitude', 'longitude']
----------
['composite', 'physical', 'women', 'theft', 'political', 'health', 'lgbtq']


In [11]:
df_na_old = df.loc[(df['continent'] == 'North America')].copy().reset_index(drop=True)
print(df_na_old.shape)
df_na_old.head()

(1020, 25)


Unnamed: 0,geosure_id,record,continent,wiki_continent,country,wiki_country,country_code,province,wiki_city,city,...,population,latitude,longitude,composite,physical,women,theft,political,health,lgbtq
0,G2000000,1,North America,Q49,Bermuda,Q23635,BM,Ontario,Q30985,Hamilton,...,1010.0,32.294816,-64.781375,22,18,17,23,25,30,50
1,G2000001,2,North America,Q49,Canada,Q16,CA,Alberta,Q2096,Edmonton,...,812201.0,53.544389,-113.490927,24,29,25,27,19,25,50
2,G2000002,3,North America,Q49,Canada,Q16,CA,Alberta,Q36312,Calgary,...,1096833.0,51.045325,-114.058101,24,27,23,26,19,25,50
3,G2000003,4,North America,Q49,Canada,Q16,CA,British Columbia,Q2132,Victoria,...,80017.0,48.428421,-123.365644,29,36,34,38,19,28,50
4,G2000004,5,North America,Q49,Canada,Q16,CA,British Columbia,Q24639,Vancouver,...,603502.0,49.261226,-123.113927,24,27,24,27,19,25,50


### Use coordinates

In [12]:
df_na_old['coords'] = list(zip(df_na_old['latitude'], df_na_old['longitude']))
df_na['coords'] = list(zip(df_na['latitude'], df_na['longitude']))

In [13]:
# For each new item, find in the bigger dataset of North America the id whose distance is the minimum.
def get_id(df_old, row):
    min_id = df_old['coords'].apply(lambda x: dst.vincenty(row['coords'], x).km).idxmin()
    return {'old_df_id': min_id, 'new_df_id': row.name}

min_dict = [get_id(df_na_old, row) for _, row in df_na.iterrows()]

## Find by distance, name

In [16]:
def compare_df(ids, df_old, df_new):
    row = df_old.iloc[ids['old_df_id']]
    row_new = df_new.iloc[ids['new_df_id']]
    res_dict = dict(geosure_id=row['geosure_id'],old_record=row['record'],new_record=row_new['record'], 
                    latlng_dist=dst.vincenty((row['latitude'], row['longitude']),
                                             (row_new['latitude'], row_new['longitude'])).km)
    res_dict.update({
        "old_city": row['city'],
        "new_city": row_new['city'],
        "old_lat": row['latitude'],
        "old_lng": row['longitude'],
        "new_lat": row_new['latitude'],
        "new_lng": row_new['longitude'],
        "new_district": row_new['district'],
        "old_district": row['district'],
        "old_ix": row.name,
        "new_ix": row_new.name,
    })
    
    for item in ['city', 'province', 'country', 'district']:
        res_dict.update({'{0}_dst'.format(item): editdistance.eval(row[item], row_new[item])})
    
    return res_dict

dicts = [compare_df(x, df_na_old, df_na) for x in min_dict]
df_comp = pd.DataFrame(dicts)

print(df_comp.shape)
df_comp.head()

Exception ignored in: 'editdistance.bycython.eval'
TypeError: object of type 'float' has no len()


(4159, 18)


Unnamed: 0,city_dst,country_dst,district_dst,geosure_id,latlng_dist,new_city,new_district,new_ix,new_lat,new_lng,new_record,old_city,old_district,old_ix,old_lat,old_lng,old_record,province_dst
0,0,5,0,G2000000,0.0,Hamilton,Overall,0,32.294816,-64.781375,1,Hamilton,Overall,0,32.294816,-64.781375,1,0
1,0,0,0,G2000002,0.0,Calgary,Overall,1,51.045325,-114.058101,2,Calgary,Overall,2,51.045325,-114.058101,3,0
2,0,0,0,G2000001,0.0,Edmonton,Overall,2,53.544389,-113.490927,3,Edmonton,Overall,1,53.544389,-113.490927,2,0
3,0,0,0,G2000008,0.0,Abbotsford,Overall,3,49.054587,-122.328026,4,Abbotsford,Overall,8,49.054587,-122.328026,9,0
4,0,0,0,G2000006,0.0,Burnaby,Overall,4,49.229491,-123.002575,5,Burnaby,Overall,6,49.229491,-123.002575,7,0


### Get all geosure_ids whose difference in district distance slower than two. It means we let go subtle difference between district names

In [17]:
comp_good = df_comp.loc[(df_comp['latlng_dist'] == 0) &
                        (df_comp['district_dst'] <= 1)]
ids_to_replace = list(comp_good[['old_ix', 'new_ix']].T.to_dict().values())
print(len(ids_to_replace))

902


### Construct dataframe

In [25]:
def create_df_dict(item, df_new, df_old):
    new_dict = df_old.iloc[item['old_ix']][old_columns].to_dict()
    new_dict.update(df_new.iloc[item['new_ix']][new_columns].to_dict())
    return new_dict

dicts = [create_df_dict(i, df_na, df) for i in ids_to_replace]
df_replace = pd.DataFrame(dicts)
df_replace = df_replace[df.columns]
df_replace.head()

Unnamed: 0,geosure_id,record,continent,wiki_continent,country,wiki_country,country_code,province,wiki_city,city,...,population,latitude,longitude,composite,physical,women,theft,political,health,lgbtq
0,G1000000,1,Central America,Q27611,Saint-Pierre & Miquelon,Q142,PM,,Q185678,Saint-Pierre,...,3000.0,46.781013,-56.177646,22,18,17,23,25,30,24
1,G1000002,3,Central America,Q27611,Antigua and Barbuda,Q781,AG,,Q36262,St. John's,...,51247.0,17.117528,-61.845557,26,27,23,26,19,25,33
2,G1000001,2,Central America,Q27611,Anguilla,Q145,AI,,Q30994,The Valley,...,1169.0,18.220554,-63.068615,27,29,25,27,19,25,35
3,G1000008,9,Central America,Q27611,Bahamas,Q778,BS,,Q2467,Nassau,...,,25.082661,-77.360437,26,26,23,27,19,27,32
4,G1000006,7,Central America,Q27611,Bahamas,Q778,BS,,Q2467,Nassau,...,246329.0,25.06,-77.345,27,27,24,27,19,27,33


## Drop items from dataframe. 

## Include new scores from removed

In [26]:
geosure_ids = df_replace['geosure_id'].tolist()
print("Len geosure ids: {}".format(len(geosure_ids)))

df_new = df.loc[~df['geosure_id'].isin(geosure_ids)].copy()

print('New shape {0}'.format(df_new.shape))

df_new = pd.concat([df_new, df_replace], axis=0)

df_new = df_new.reset_index()
df_new.sort_values(by='geosure_id', inplace=True)

print('New shape after concat {0}'.format(df_new.shape))

Len geosure ids: 902
New shape (22569, 25)
New shape after concat (23471, 26)


### Compute difference between scores

In [44]:
def check_diff(df, df_new):
    diffs = (df[new_columns].as_matrix().sum(axis=1) - df_new[new_columns].as_matrix().sum(axis=1))
    return (diffs != 0).sum()
    
print("Number of rows changed: {0}".format(check_diff(df, df_new)))

Number of rows changed: 901


## Include new items from NA

In [49]:
comp_bad = df_comp.loc[(df_comp['latlng_dist'] != 0)]
records_to_add = comp_bad['new_record']
print(len(records_to_add))

df_add = df_na.loc[df_na['record'].isin(records_to_add)]
print(df_add.shape)

3252
(3252, 24)


## Get geosure id counter for North America

In [50]:
gs_id = df.loc[df['continent'] == 'North America'].iloc[-1]['geosure_id']
int_id = int(gs_id.replace('G', '')) + 1

df_add['geosure_id'] = pd.Series(['G' + str(x) for x in range(int_id, int_id + df_add.shape[0])],
                                  index=df_add.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## Get missing items

### Get country codes for all

In [51]:
countries = df.groupby(['country_code', 'country']).size().reset_index(name='count')
countries.drop(labels=['count'], inplace=True, axis=1)
countries.head()

Unnamed: 0,country_code,country
0,AD,Andorra
1,AE,United Arab Emirates
2,AF,Afghanistan
3,AG,Antigua and Barbuda
4,AI,Anguilla


In [52]:
df_add = df_add.merge(countries, how='inner', on='country')
# df_add.columns = [r.lower() for r in df_na_rem.columns]
df_add = df_add[df_na_old.columns.tolist()]

### Concatenate with last dataframe.

In [53]:
df_new = pd.concat([df_new, df_add], axis=0)

df_new.sort_values(by='geosure_id', inplace=True)

# Update: Fix houston and Dallas

In [59]:
old_df = df.copy()
df = df_new.copy()
df_tx = pd.read_csv('/Users/jorge/Desktop/mdf_texas.csv')
print(df.shape)
print(df_tx.shape)

(26723, 27)
(151, 23)


In [61]:
# df_tx_old = df.loc[(df['city'].isin(['Houston', 'Dallas'])) & (df['country'] == 'United States')].copy().reset_index(drop=True)
df_tx_old = df.loc[(df['city'].isin(['Houston', 'Dallas']))].copy().reset_index(drop=True)
print(df_tx_old.shape)

(144, 27)


In [62]:
df_tx_old['coords'] = list(zip(df_tx_old['latitude'], df_tx_old['longitude']))
df_tx['coords'] = list(zip(df_tx['latitude'], df_tx['longitude']))

In [63]:
min_dict = [get_id(df_tx_old, row) for _, row in df_tx.iterrows()]

In [65]:
dicts = [compare_df(x, df_tx_old, df_tx) for x in min_dict]
df_comp = pd.DataFrame(dicts)

print(df_comp.shape)
df_comp.head()

(151, 18)


Unnamed: 0,city_dst,country_dst,district_dst,geosure_id,latlng_dist,new_city,new_district,new_ix,new_lat,new_lng,new_record,old_city,old_district,old_ix,old_lat,old_lng,old_record,province_dst
0,0,0,0,G2000341,0.0,Dallas,Overall,0,32.78014,-96.800451,3610,Dallas,Overall,1,32.78014,-96.800451,342,0
1,0,0,0,G2003866,0.0,Dallas,Arts District,1,32.789731,-96.798262,3611,Dallas,Arts District,91,32.789731,-96.798262,3611,0
2,0,0,0,G2003867,0.0,Dallas,Bishop Arts,2,32.747256,-96.83041,3612,Dallas,Bishop Arts,92,32.747256,-96.83041,3612,0
3,0,0,0,G2003868,0.0,Dallas,Bluff View,3,32.855947,-96.83242,3613,Dallas,Bluff View,93,32.855947,-96.83242,3613,0
4,0,0,0,G2003869,0.0,Dallas,Bryan Place,4,32.792382,-96.788218,3614,Dallas,Bryan Place,94,32.792382,-96.788218,3614,0


In [66]:
comp_good = df_comp.loc[(df_comp['latlng_dist'] == 0) &
                        (df_comp['district_dst'] <= 1)]
ids_to_replace = list(comp_good[['old_ix', 'new_ix']].T.to_dict().values())
print(len(ids_to_replace))

140


In [102]:
dicts = [create_df_dict(i, df_tx, df_tx_old) for i in ids_to_replace]
df_replace = pd.DataFrame(dicts)
df_replace.head()

Unnamed: 0,city,city_original,city_type,composite,continent,country,country_code,district,geosure_id,health,...,physical,political,population,province,record,theft,wiki_city,wiki_continent,wiki_country,women
0,Dallas,Dallas,,45,North America,United States,US,Overall,G2000341,45,...,50,37,1320939.0,Texas,342,50,Q16557,Q49,Q30,36
1,Dallas,Dallas,,51,North America,United States,US,Arts District,G2003866,52,...,66,41,,Texas,3611,63,Q16557,Q49,Q30,42
2,Dallas,Dallas,,42,North America,United States,US,Bishop Arts,G2003867,45,...,45,35,,Texas,3612,54,Q16557,Q49,Q30,35
3,Dallas,Dallas,,25,North America,United States,US,Bluff View,G2003868,31,...,20,27,,Texas,3613,24,Q16557,Q49,Q30,24
4,Dallas,Dallas,,56,North America,United States,US,Bryan Place,G2003869,52,...,66,41,,Texas,3614,63,Q16557,Q49,Q30,42


## Drop items from dataframe. 

## Include new scores from removed

In [103]:
geosure_ids = df_replace['geosure_id'].tolist()
print("Len geosure ids: {}".format(len(geosure_ids)))

df_new = df.loc[~df['geosure_id'].isin(geosure_ids)].copy()

print('New shape {0}'.format(df_new.shape))

df_new = pd.concat([df_new, df_replace], axis=0)

df_new = df_new.reset_index()
df_new.sort_values(by='geosure_id', inplace=True)

print('New shape after concat {0}'.format(df_new.shape))

Len geosure ids: 140
New shape (26583, 27)
New shape after concat (26723, 28)


### Compute difference between scores

In [104]:
print("Number of rows changed: {0}".format(check_diff(df, df_new)))

Number of rows changed: 117


## Include new items from Texas

In [105]:
comp_bad = df_comp.loc[(df_comp['latlng_dist'] != 0)]
records_to_add = comp_bad['new_record']
print(len(records_to_add))

df_add = df_na.loc[df_na['record'].isin(records_to_add)]
print(df_add.shape)

9
(9, 24)


## Get geosure id counter for North America

In [106]:
gs_id = df.loc[df['continent'] == 'North America'].iloc[-1]['geosure_id']
int_id = int(gs_id.replace('G', '')) + 1

df_add['geosure_id'] = pd.Series(['G' + str(x) for x in range(int_id, int_id + df_add.shape[0])],
                                  index=df_add.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## Get missing items

### Get country codes for all

In [107]:
df_add = df_add.merge(countries, how='inner', on='country')
# df_add.columns = [r.lower() for r in df_na_rem.columns]
df_add = df_add[df_na_old.columns.tolist()]

### Concatenate with last dataframe.

In [108]:
df_new = pd.concat([df_new, df_add], axis=0)

df_new.sort_values(by='geosure_id', inplace=True)

# Saving results

In [116]:
df_new.to_csv('/code/gocode/src/geosure/mdf20180825.tsv', index=False, sep='\t', encoding='utf-16')