# GB1900, find neighbours using KD-tree

In [1]:
# %matplotlib notebook 

## Read the Gazetteer (gb1900_gazetteer_complete_july_2018.csv)

In [2]:
import numpy as np
import pandas as pd
import pyproj
from scipy import spatial

In [3]:
with open("./gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as f:
    df = pd.read_csv(f)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
print("#items: ", len(df))
df.head()

#items:  2552459


Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,


## Convert lat lon to xyz

In [5]:
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           df["longitude"].to_numpy(), 
                           df["latitude"].to_numpy(), 
                           np.zeros(len(df["latitude"])), 
                           radians=False)

In [6]:
# add x, y, z to df
df["x"] = x
df["y"] = y
df["z"] = z

In [7]:
df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes,x,y,z
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,,3854770.0,-213945.645048,5059946.0
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,,3977779.0,-7098.456811,4969049.0
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,,3977762.0,-7156.526053,4969062.0
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,,3977794.0,-6694.027028,4969037.0
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,,3977752.0,-6904.747577,4969071.0


In [8]:
"""
# Only if you want to make sure that the units are in meters:
xyz_arr = df[["x", "y", "z"]].to_numpy()
np.sqrt(xyz_arr[:, 0]**2 + xyz_arr[:, 1]**2 + xyz_arr[:, 2]**2)[0]
"""

'\n# Only if you want to make sure that the units are in meters:\nxyz_arr = df[["x", "y", "z"]].to_numpy()\nnp.sqrt(xyz_arr[:, 0]**2 + xyz_arr[:, 1]**2 + xyz_arr[:, 2]**2)[0]\n'

## Create KD-tree

In [9]:
kdtree = spatial.cKDTree(df[["x", "y", "z"]].to_numpy())

In [10]:
num_neighbors = 101
distance_upper_bound = 1000000
all_dists, all_indxs = kdtree.query(df[["x", "y", "z"]].to_numpy(), 
                                    k=num_neighbors, 
                                    distance_upper_bound=distance_upper_bound)
print(all_dists)
print(all_indxs)

[[   0.          174.66294676  176.22419747 ... 1467.04769594
  1475.01279324 1490.72457565]
 [   0.           61.71014601   88.89388984 ...  765.7825225
   768.5968675   771.5126272 ]
 [   0.           61.71014601   95.19482027 ...  775.96427984
   778.49704072  779.57903563]
 ...
 [   0.          110.71863547  113.98883537 ...  759.10832524
   761.17554328  765.41077381]
 [   0.           38.35726828   88.52599591 ...  750.99274305
   758.78585747  758.90171996]
 [   0.          130.66335393  143.24452054 ... 1577.45582711
  1605.83045084 1611.57474683]]
[[      0  439055     218 ...  625921  420577  444417]
 [      1       2     103 ...  337015     110  336682]
 [      2       1  336420 ...  336697  336404  336433]
 ...
 [2552456 2552455      89 ...  336770  333857  333851]
 [2552457  336131  336139 ...  339735  336774  338783]
 [2552458 1083551 2552400 ...  439054  445636  419720]]


In [11]:
len(all_dists)

2552459

## inputs for label2vec

In [None]:
min_dist = 0.1
max_dist = 1000
number_context_words = 10

context_sents_list = []
center_words_list = []
for i in range(len(df)):
    if np.mod(i, 1000) == 0: print(i, end=" ")
        
    before_center = df.iloc[all_indxs[i, 
                                      (min_dist < all_dists[i]) * 
                                      (all_dists[i] < max_dist)]].final_text.to_list()[::2][::-1]
    
    if len(before_center) < number_context_words:
        continue
    before_center_cut = " ".join(before_center).split()[-number_context_words:]
    
    after_center = df.iloc[all_indxs[i, 
                                     (min_dist < all_dists[i]) * 
                                     (all_dists[i] < max_dist)]].final_text.to_list()[1::2]
    if len(after_center) < number_context_words:
        continue
    after_center_cut = " ".join(after_center).split()[:number_context_words]
    
    mysentence = before_center_cut
    #mysentence.append(center)
    mysentence.extend(after_center_cut)
    center_word = df.iloc[all_indxs[i]].final_text.values[0].split()
    for j in center_word:
        center_words_list.append(j.lower())
        context_sents_list.append(" ".join(mysentence).lower())

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 92000 93000 94000 95000 96000 97000 98000 99000 100000 101000 102000 103000 104000 105000 106000 107000 108000 109000 110000 111000 112000 113000 114000 115000 116000 117000 118000 119000 120000 121000 122000 123000 124000 125000 126000 127000 128000 129000 130000 131000 132000 133000 134000 135000 136000 137000 138000 139000 140000 141000 142000 143000 144000 145000 146000 147000 148000 149000 150000 151000 152000 153000 154000 155000 156000 157000 158000 

In [None]:
df2train = pd.DataFrame(center_words_list, columns=["target"])
df2train["context"] = context_sents_list

In [None]:
df2train.head()

In [None]:
df2train.to_csv("./data2train_v001.csv")

# ======= PLAYGROUND

In [None]:
#np.shape(all_dists)[0] - np.sum(np.any(all_dists == np.inf, axis=1))
np.sum(np.any(all_dists == np.inf, axis=1))

In [None]:
mean_dists = np.mean(all_dists, axis=1)

In [None]:
args_mean_sort = np.argsort(mean_dists)
mean_dists_sorted = mean_dists[args_mean_sort]

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 15))
plt.scatter(df["longitude"].values[args_mean_sort][:10000], 
            df["latitude"].values[args_mean_sort][:10000], 
            c=mean_dists_sorted[:10000], 
            cmap="hot_r",
            vmin=40,
            vmax=800
           )
plt.colorbar()
#plt.plot(np.mean(all_dists, axis=1))
plt.grid()
#plt.ylim(0, 10000)
plt.show()

In [None]:
plt.plot(np.mean(all_dists, axis=1)[mean_dists_sort][:1000000])
plt.ylim(0, 5000)
plt.show()