# 1. Accessibility design

In [2]:
import random
from pathlib import Path

import geopandas as gpd
import networkx as nx
import numpy as np
import osmnx as ox
import pandas as pd
from shapely.geometry import Point
from shapely.ops import unary_union

In [None]:
# Estimate the network impedance factor alpha: the median ratio between the
# walking-network distance and the straight-line distance from a random sample
# of LSOA centroids to their nearest (straight-line) station.

# 1. Read LSOA polygons and stations; both are projected to EPSG:27700
#    (British National Grid) so that distances are expressed in metres.
lsoas = gpd.read_file("data/LSOA_21.geojson").to_crs(epsg=27700)
lsoas["centroid"] = lsoas.geometry.centroid

stations = pd.read_csv("data/transportation/londonOverground_stations.csv")
stations["geometry"] = gpd.GeoSeries.from_wkt(stations["geometry"])
stations = gpd.GeoDataFrame(stations, geometry="geometry", crs="EPSG:4326").to_crs(epsg=27700)

# 2. Dissolve all LSOA polygons into one study-area polygon and keep only the
#    stations that fall inside it.
union_poly = unary_union(lsoas.geometry.values)

stations_in = stations[stations.geometry.within(union_poly)].copy()
print(f"Number of original stations: {len(stations)}, Number of stations in the area: {len(stations_in)}")
if stations_in.empty:
    # Without this guard the loop below fails with an opaque idxmin() error.
    raise ValueError("No stations fall inside the LSOA study area; cannot estimate alpha.")

# 3. Randomly sample N LSOAs (fixed seed so the estimate is reproducible)
N = 50
sample_ls = lsoas.sample(N, random_state=42)

# Download the Greater London walking network and project it to the same CRS
G = ox.graph_from_place("Greater London, UK", network_type="walk")
G = ox.project_graph(G, to_crs="EPSG:27700")

# 4. For each sampled LSOA, find its nearest station and compute:
#    - the straight-line distance d_euc (metres)
#    - the shortest-path distance on the walking network d_net (metres)
#    Then ratio = d_net / d_euc
ratios = []

# Snap stations and sampled centroids to graph nodes. nearest_nodes accepts
# coordinate arrays, so one call per layer replaces one search per row.
stations_in["osmid"] = ox.distance.nearest_nodes(
    G,
    stations_in.geometry.x.to_numpy(),
    stations_in.geometry.y.to_numpy(),
)
sample_ls["osmid"] = ox.distance.nearest_nodes(
    G,
    sample_ls["centroid"].x.to_numpy(),
    sample_ls["centroid"].y.to_numpy(),
)

for idx, row in sample_ls.iterrows():
    # Nearest station by straight-line distance
    dists = stations_in.geometry.distance(row["centroid"])
    j = dists.idxmin()
    station = stations_in.loc[j]
    d_euc = dists.loc[j]

    # A centroid lying exactly on a station would make the ratio undefined
    # (division by zero), so such samples are left out of the estimate.
    if d_euc <= 0:
        continue

    # Shortest walking path between the two snapped nodes
    try:
        length = nx.shortest_path_length(
            G,
            row["osmid"],
            station["osmid"],
            weight="length",
        )
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # No route between the two nodes: skip this sample. Any other
        # exception is a genuine error and is allowed to propagate.
        continue

    ratios.append(length / d_euc)

# 5. alpha is the median ratio (robust to a few extreme detours)
if not ratios:
    raise ValueError("No valid network/straight-line distance pairs were found; cannot estimate alpha.")
alpha = float(np.median(ratios))
print(f"Estimated impedance factor α = {alpha:.3f}")


Number of original stations: 113, Number of stations in the area: 107
Estimated impedance factor α = 1.264


In [None]:
# Build straight-line and alpha-scaled distance features for every LSOA.

# 1. LSOA polygons (code, name, geometry) in EPSG:27700, plus their centroids
lsoa_cols = ["lsoa21cd", "lsoa21nm", "geometry"]
lsoas = gpd.read_file("data/LSOA_21.geojson")[lsoa_cols].to_crs(epsg=27700)
lsoas["centroid"] = lsoas.geometry.centroid

# 2. Station CSV: parse the WKT column and build a GeoDataFrame in EPSG:27700
stations = pd.read_csv("data/transportation/londonOverground_stations.csv")
stations["geometry"] = gpd.GeoSeries.from_wkt(stations["geometry"])
stations_gdf = gpd.GeoDataFrame(stations, geometry="geometry", crs="EPSG:4326")
stations_gdf = stations_gdf.to_crs(epsg=27700)

# 3. Coordinate arrays: (N LSOAs x 2) and (M stations x 2)
lsoa_pts = np.array([(pt.x, pt.y) for pt in lsoas["centroid"]])
stat_pts = np.array([(pt.x, pt.y) for pt in stations_gdf.geometry])

# 4. Euclidean distance matrix, dist_mat[i, j] = metres from LSOA i to station j
diff = lsoa_pts[:, np.newaxis, :] - stat_pts[np.newaxis, :, :]
dist_mat = np.linalg.norm(diff, axis=2)  # shape (N, M)

# 5. The K = 3 nearest stations of each LSOA (unordered within the K)
K = 3
idx_K = np.argpartition(dist_mat, K, axis=1)[:, :K]       # (N, K) station positions
dists_K = np.take_along_axis(dist_mat, idx_K, axis=1)     # (N, K) distances in metres

# 6. New features
# Distance to the nearest station (km), and the same scaled by alpha
lsoas["d_min_km"] = dist_mat.min(axis=1) / 1000
lsoas["d_net_min_km"] = lsoas["d_min_km"] * alpha

# Mean distance to the K nearest stations (km), and the same scaled by alpha
lsoas["d_K_avg_km"] = dists_K.mean(axis=1) / 1000
lsoas["d_K_net_km"] = lsoas["d_K_avg_km"] * alpha

# Exponential-decay accessibility index over the K nearest stations (lambda = 1 per km)
lam = 1.0
dists_K_km = dists_K / 1000
lsoas["access_idx"] = np.exp(-lam * dists_K_km).sum(axis=1)

# 7. Name and opening year of the single nearest station
nearest_idx = dist_mat.argmin(axis=1)  # shape (N,)
nearest_rows = stations_gdf.iloc[nearest_idx]
lsoas["nearest_station"] = nearest_rows["name"].to_numpy()
lsoas["nearest_open_year"] = nearest_rows["opened"].to_numpy()

In [4]:
# Remove the helper centroid column. errors="ignore" keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# column is already gone.
lsoas = lsoas.drop(columns="centroid", errors="ignore")

In [None]:
# lsoas.to_file("data/lsoa_distance.geojson", driver="GeoJSON", index=False)

# 2. Merge housing price data

In [6]:
# Read the processed house-price table and align its key columns with the
# LSOA naming used elsewhere in the notebook.
price_path = 'data/houseprice/Houseprice_Processed.csv'
column_renames = {'LSOA code': 'lsoa21cd', 'LSOA name': 'lsoa21nm'}
price_df = pd.read_csv(price_path).rename(columns=column_renames)

# Distinct LSOA codes present in the price data
lsoa_in_price = price_df['lsoa21cd'].unique()


In [7]:
# Reshape the wide price table (one column per period) into long format.
# Only lsoa21cd, lsoa21nm and the period columns are kept.
id_vars = ['lsoa21cd', 'lsoa21nm']

# A period column is any non-id column whose name contains '-' or starts
# with four digits.
time_cols = []
for col in price_df.columns:
    if col in id_vars:
        continue
    if '-' in col or col[:4].isdigit():
        time_cols.append(col)

price_simple = price_df[id_vars + time_cols]

price_long = pd.melt(
    price_simple,
    id_vars=id_vars,
    value_vars=time_cols,
    var_name='date',
    value_name='median_price',
).assign(date=lambda frame: pd.to_datetime(frame['date']))

In [8]:
# Attach the per-LSOA accessibility features to every price observation.
# The inner join drops price rows whose LSOA code has no match in lsoas.
access_cols = [
    'd_min_km',
    'd_net_min_km',
    'd_K_avg_km',
    'd_K_net_km',
    'access_idx',
    'nearest_station',
    'nearest_open_year',
]
price_long_merge = pd.merge(
    price_long,
    lsoas[['lsoa21cd'] + access_cols],
    how='inner',
    on='lsoa21cd',
)

In [9]:
# Show the merged price/accessibility panel as this cell's output.
price_long_merge

Unnamed: 0,lsoa21cd,lsoa21nm,date,median_price,d_min_km,d_net_min_km,d_K_avg_km,d_K_net_km,access_idx,nearest_station,nearest_open_year
0,E01000001,City of London 001A,1996-03-01,120178,1.115526,1.410025,1.587198,2.006219,0.662824,Liverpool Street,2015.0
1,E01000002,City of London 001B,1996-03-01,127646,0.823145,1.040455,1.329387,1.680345,0.869266,Liverpool Street,2015.0
2,E01000003,City of London 001C,1996-03-01,71244,1.125599,1.422757,1.427113,1.803871,0.745153,Liverpool Street,2015.0
3,E01000005,City of London 001E,1996-03-01,57122,0.613259,0.775159,0.991254,1.252945,1.158617,Liverpool Street,2015.0
4,E01032739,City of London 001F,1996-03-01,179444,0.766818,0.969258,1.412377,1.785244,0.835649,Liverpool Street,2015.0
...,...,...,...,...,...,...,...,...,...,...,...
495823,E01033601,Westminster 009G,2022-12-01,727612,2.019759,2.552976,2.189082,2.767000,0.339186,South Hampstead,2007.0
495824,E01033602,Westminster 009H,2022-12-01,399343,2.043582,2.583087,2.232525,2.821912,0.328004,South Hampstead,2007.0
495825,E01033603,Westminster 009I,2022-12-01,424893,2.330879,2.946232,2.383697,3.012993,0.277275,Euston (Rail),2007.0
495826,E01033604,Westminster 009J,2022-12-01,396677,2.285109,2.888378,2.401976,3.036098,0.274126,South Hampstead,2007.0


# 3. Processing residential structural and population data

In [None]:
# Read the housing-proportion and population table
pop_prop = pd.read_csv('data/pop_prop.csv')

# Month-day suffix for each quarter label: the first day of the quarter's
# last month (e.g. "Q1" -> "-03-01").
qtr_to_mid = {
    'Q1': '-03-01',
    'Q2': '-06-01',
    'Q3': '-09-01',
    'Q4': '-12-01',
}

# year_qtr starts with the four-digit year and ends with the quarter label;
# join the year with the mapped suffix and parse the result as a date.
year_part = pop_prop['year_qtr'].str.slice(0, 4)
month_day_part = pop_prop['year_qtr'].str.slice(-2).map(qtr_to_mid)
pop_prop['date'] = pd.to_datetime(year_part + month_day_part)

pop_prop = pop_prop.drop(columns=["year_qtr"])

In [11]:
# Restrict pop_prop to the LSOA codes and dates that appear in the merged
# house-price panel.
keep_lsoa = price_long_merge['lsoa21cd'].unique()
keep_qtr = price_long_merge['date'].unique()

in_kept_lsoa = pop_prop['lsoa21cd'].isin(keep_lsoa)
in_kept_date = pop_prop['date'].isin(keep_qtr)
pop_prop = pop_prop.loc[in_kept_lsoa & in_kept_date].copy()

In [12]:
# Left-join the population / housing-structure columns onto the price panel
# by LSOA code and date; columns from pop_prop whose names clash with the
# panel get the "_popprop" suffix.
merge_keys = ['lsoa21cd', 'date']
merged = pd.merge(
    price_long_merge,
    pop_prop,
    how='left',
    on=merge_keys,
    suffixes=('', '_popprop'),
)

In [None]:
# merged.to_csv("data/Final_panel_data2.csv", index=False)

In [13]:
# Row counts per nearest-station opening year (NaN included), top five values
open_year_counts = merged['nearest_open_year'].value_counts(dropna=False)
print("open_year distribution:\n", open_year_counts.head())

open_year distribution:
 nearest_open_year
2007.0    246888
2010.0    108756
2015.0     79380
2012.0     29592
2022.0     26136
Name: count, dtype: int64


In [None]:
# 1. Harmonise opening years: 2011 is recoded to 2012 and 2009 to 2010
merged['nearest_open_year'] = merged['nearest_open_year'].replace({
    2011: 2012,
    2009: 2010
})


# 2. announcement_year: lookup from opening year to announcement year.
#    Opening years that are not keys of announce_map (including missing
#    values) come out of .map() as NaN.
announce_map = {
    2007: 2006,
    2010: 2006,
    2012: 2009,
    2015: 2015,
    2022: 2017
}
merged['announcement_year'] = merged['nearest_open_year'].map(announce_map)


# Move announcement_year so it sits immediately after nearest_open_year
idx = merged.columns.get_loc('nearest_open_year')
ann = merged.pop('announcement_year')
merged.insert(idx+1, 'announcement_year', ann)

print(merged[['nearest_station','nearest_open_year','announcement_year']].drop_duplicates())

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
output_path = Path('output/Final_panel_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(output_path, index=False)


              nearest_station  nearest_open_year  announcement_year
0            Liverpool Street             2015.0               2015
6              Barking (Rail)             2007.0               2006
12          Barking Riverside             2022.0               2017
14                    Romford             2015.0               2015
108              Enfield Town             2015.0               2015
...                       ...                ...                ...
4085                 Shadwell             2010.0               2006
4180      Walthamstow Central             2015.0               2015
4198      Leyton Midland Road             2007.0               2006
4226          St James Street             2015.0               2015
4255  Walthamstow Queens Road             2007.0               2006

[108 rows x 3 columns]
