In [159]:
import geopandas as gpd

from pysal.explore import esda
from pysal.lib import weights
import matplotlib.pyplot as plt
import numpy as np

wk = gpd.read_parquet('data/wijken/wijken_stats.parquet')

In [160]:
wk.columns.values

cols_norm = ['L0_altieri_1_T', 'L1_altieri_1_T']

wk = wk.dropna(subset=cols_norm)

w = weights.KNN.from_dataframe(wk, k=8)
# Row-standardization
w.transform = "R"

featurelist = []

# make cols range from 0 to 1
for col in cols_norm:
    wk[col+'_norm'] = (wk[col] - wk[col].min()) / (wk[col].max() - wk[col].min())
    wk[f"w_{col}"] = weights.lag_spatial(w, wk[col])
    lisa = esda.moran.Moran_Local(wk[col], w)
    wk[f"{col}_Is"] = lisa.Is
    wk[f"{col}_Is_norm"] = (lisa.Is - lisa.Is.min()) / (lisa.Is.max() - lisa.Is.min())
    
    featurelist.append(f"{col}_norm")
    featurelist.append(f"{col}_Is_norm")
    
wk.to_parquet('data/wijken/wijken_stats_lisa.parquet')


In [161]:
sample = wk.sample(1)

print(f"{sample.gemeentenaam.values[0]} - {sample.wijknaam.values[0]}")

Dijk en Waard - Molenwijk


In [162]:
sim = wk.copy()

# calculate the distance between the sample and all other wijken, don't use the index
sim["distance"] = np.linalg.norm(sim[featurelist] - sample[featurelist].values[0], axis=1)

# sort by distance ascending
similarities = sim.sort_values("distance").reset_index(drop=True)

similarities[["gemeentenaam", "wijknaam", "distance"]].head(10)

Unnamed: 0,gemeentenaam,wijknaam,distance
0,Dijk en Waard,Molenwijk,0.0
1,Tilburg,Huibeven,0.025968
2,Hillegom,Hillegom Zuid,0.04388
3,Den Helder,Wijk 02 Stad binnen de Linie-West,0.055283
4,Brunssum,Wijk 05 Brunssum-Centrum,0.059255
5,Lansingerland,Wijk 27 Westpolder,0.060924
6,Almelo,Hofkamp,0.061546
7,Almere,Parkwijk,0.061791
8,Nieuwegein,Zuilenstein,0.062498
9,Almere,Indischebuurt,0.063758
