# Using KD-tree to query wikidata

In [1]:
import glob
import pandas as pd
import numpy as np
import os
import pandas as pd
import pyproj
from scipy import spatial

## Read Wikidata and create KD-tree

In [2]:
# Read Wikidata, distributed over several CSV files
df_wiki = pd.concat(map(pd.read_csv, 
                   glob.glob(os.path.join('', "/home/mcollardanuy/PlaceLinking/wikidata/extracted/*csv"))))
df_wiki.head()

Unnamed: 0.1,Unnamed: 0,wikidata_id,english_label,instance_of,description_set,alias_dict,nativelabel,population_dict,area,hcounties,...,vchIDs,vob_placeIDs,vob_unitIDs,epns,os_grid_ref,connectswith,street_address,adjacent_stations,ukrailcode,connectline
0,0,Q56240745,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,,,
1,1,Q56240764,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,,,
2,2,Q56240768,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,,,
3,3,Q56241069,,['Q23397'],{'lake in Canada'},{},,{},,[],...,,,,,,,,,,
4,4,Q56241445,fountain playground Rossfeld street,['Q43483'],set(),"{'en': ['fountain playground Rossfeldstrasse',...",,{},,[],...,,,,,,,"Rossfeldstrasse 21, 3004 Bern",,,


In [3]:
print(len(df_wiki))

1235000


In [4]:
# transform lat/lon ---> x,y,z
# x,y,z will be used to create the KD-tree
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x_wiki, y_wiki, z_wiki = pyproj.transform(lla, ecef,
                                          df_wiki["longitude"].to_numpy(),
                                          df_wiki["latitude"].to_numpy(),
                                          np.zeros(len(df_wiki["latitude"])),
                                          radians=False)

  if __name__ == '__main__':


In [5]:
# add x, y, z to df
df_wiki["x"] = x_wiki
df_wiki["y"] = y_wiki
df_wiki["z"] = z_wiki
df_wiki.head()

Unnamed: 0.1,Unnamed: 0,wikidata_id,english_label,instance_of,description_set,alias_dict,nativelabel,population_dict,area,hcounties,...,epns,os_grid_ref,connectswith,street_address,adjacent_stations,ukrailcode,connectline,x,y,z
0,0,Q56240745,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,2185098.0,-5112594.0,-3114799.0
1,1,Q56240764,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,1888197.0,-4964821.0,-3518870.0
2,2,Q56240768,,['Q8343784'],set(),{},,{},,[],...,,,,,,,,2390310.0,-5171445.0,-2857998.0
3,3,Q56241069,,['Q23397'],{'lake in Canada'},{},,{},,[],...,,,,,,,,1480053.0,-3927151.0,4786767.0
4,4,Q56241445,fountain playground Rossfeld street,['Q43483'],set(),"{'en': ['fountain playground Rossfeldstrasse',...",,{},,[],...,,,,"Rossfeldstrasse 21, 3004 Bern",,,,4323198.0,564922.3,4639653.0


In [6]:
# Create wiki_kdtree
wiki_kdtree = spatial.cKDTree(df_wiki[["x", "y", "z"]].to_numpy())

## Read GB1900

In [7]:
# Read GB1900
with open("/resources/gb1900/gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as f:
    df_gb1900 = pd.read_csv(f)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# transform lat/lon ---> x,y,z
x_gb, y_gb, z_gb = pyproj.transform(lla, ecef,
                                    df_gb1900["longitude"].to_numpy(),
                                    df_gb1900["latitude"].to_numpy(),
                                    np.zeros(len(df_gb1900["latitude"])),
                                    radians=False)

  """


## Find neighbours

Here, our query points are from GB1900: x_gb, y_gb and z_gb, we use wiki_kdtree to find the closest points to GB1900 labels.

In [9]:
# Define threshold (static)
num_neighbors = 20
# distance in m
#distance_upper_bound = 1000000

all_dists, all_indxs = wiki_kdtree.query(np.array([x_gb, y_gb, z_gb]).T,
                                         k=num_neighbors)
                                         #distance_upper_bound=distance_upper_bound)

In [11]:
# By having the indices (all_indxs), extract english_label
collect_candidates = None
for nqueries in range(all_indxs.shape[1]):
    print(nqueries, end=", ")
    if isinstance(collect_candidates, type(None)):
        collect_candidates = df_wiki.iloc[all_indxs[:, nqueries]][["english_label"]].to_numpy()
    else:
        collect_candidates = np.hstack([collect_candidates, 
                                        df_wiki.iloc[all_indxs[:, nqueries]][["english_label"]].to_numpy()])

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 

## Examples

In [12]:
query = "King's Cross Sta."
df_query = df_gb1900[df_gb1900['final_text'].str.contains(query, case=False)]
df_query.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes
39869,57eebf702c66dca322009566,King's Cross Sta.,England,Camden,Camden (Un-parished),530210.590032,182959.512834,51.530538,-0.124229,


In [13]:
# Show found candidates
collect_candidates[df_query.index]

array([["King's Cross St. Pancras tube station", 'Drinking fountain',
        'Great Northern Hotel', "London King's Cross railway station",
        'The Meeting Place',
        'St Pancras Station and former Midland Grand Hotel',
        'Numbers 1-5 And Attached Railings', "King's Cross",
        'Camden Town Hall', 'Numbers 27-43 And Attached Railings',
        'Cotton library', 'British Library Philatelic Collections',
        'Argyle Square', 'British Library Music Collections',
        'Flying Scotsman', '34B, York Way',
        'Stanley Buildings, Flats Numbers 1-20',
        'Numbers 18 To 24 (Consecutive) And Attached Railings',
        'Paradigm', 'General Chiropractic Council']], dtype=object)

In [14]:
# distance
all_dists[df_query.index]

array([[ 38.70432957,  47.37242386,  66.41628643,  75.00399428,
         80.3092222 , 105.14018541, 165.21225408, 190.24086424,
        192.54473421, 209.61918716, 224.3178194 , 224.32688416,
        230.38687389, 233.84252575, 244.73778797, 264.41418907,
        267.45604416, 293.63843922, 296.31720355, 336.01542019]])

In [15]:
# Do we have the query in wikidata
df_wiki[df_wiki['english_label'].str.contains(query, case=False, na=False)][["english_label", "latitude", "longitude"]]

Unnamed: 0,english_label,latitude,longitude
