In [None]:
# Read the LSOA table from CSV and turn it into a GeoDataFrame
csv_file = 'LondonLSOAs.csv'
df = pd.read_csv(csv_file)

# Parse the 'geometry' column as WKT text to build the geometry objects
# (presumably LSOA polygons -- confirm against the CSV contents)
wkt_geometry = gpd.GeoSeries.from_wkt(df['geometry'])
gdf = gpd.GeoDataFrame(df, geometry=wkt_geometry)

# Declare the coordinate reference system as EPSG:27700 (British National Grid),
# which is NOT WGS84 (that would be EPSG:4326).
# set_crs only labels the existing coordinates, it does not reproject them, so
# this assumes the WKT coordinates are already expressed in EPSG:27700 -- TODO confirm.
gdf.set_crs(epsg=27700, inplace=True)

# Fixed selected variables.
# The order of this list defines the column order of the feature matrix built
# from it, so entries are kept in their original sequence.
fixed_columns = [
    'c_percent Aged 10 to 14 years',
    'c_percent Aged 15 to 19 years',
    'c_percent male',
    'c_percent Aged 20 to 24 years',
    'c_percent Aged 25 to 29 years',
    'c_percent Aged 30 to 34 years',
    'c_percent Aged 35 to 39 years',
    'c_percent Aged 40 to 44 years',
    'c_percent Aged 45 to 49 years',
    'c_percent Aged 50 to 54 years',
    'c_percent Aged 55 to 59 years',
    'c_percent Aged 60 to 64 years',
    'c_percent Aged 65 to 69 years',
    'c_percent Aged 70 to 74 years',
    'c_percent Aged 75 to 79 years',
    'c_percent Aged 80 to 84 years',
    'c_percent Aged 85 years and over',
    'c_percent mixed',
    'c_percent white',
    'c_percent 2. Professional occupations',
    'c_pop_density',
    'e_NO2',
    'e_ndvi',
    'e_water',
    'e_trees',
    'e_grass',
    'e_bare',
    'c_percent asian',
    'c_percent black',
    'c_percent buddhist',
    'c_percent muslim',
    'c_percent no central heating',
    'c_percent communal heating',
    'c_percent TFW less than 2km',
    'c_percent 49 or more hours worked',
    'c_percent commute on foot',
    'c_percent commute metro rail',
    'c_percent commute bus',
    'c_percent commute bicycle',
    'c_percent student moved to address',
    'c_percent occupancy rating bedrooms -2',
    'c_percent occupancy rating bedrooms -1',
    'c_percent 10 years or more',
    'c_percent 2 years or more but less than 5 years',
    'c_percent 5 years or more but less than 10 years',
    # Deliberately excluded:
    # 'c_percent married or in a registered civil partnership married same sex',
    'c_percent married or in a registered civil partnership in a registered civil partnership',
    'c_percent separated but still legally married or still legally in a civil partnership separated but still in a registered civil partnership',
    'c_percent divorced or civil partnership dissolved formerly in a civil partnership now legally dissolved',
    'c_percent cannot speak english well',
    'c_percent cannot speak english',
    'c_percent households deprived in four dimensions',
    'c_percent christian',
    'c_percent WFH',
    'c_percent part-time',
    'c_percent commute train',
    'c_percent outside UK moved to address',
    'c_percent occupancy rating bedrooms +2',
    'c_percent 3. Associate professional and technical occupations',
    'c_percent 5. Skilled trades occupations',
    'c_percent 6. Caring leisure and other service occupations',
    'c_percent divorced or civil partnership dissolved',
    'e_lake_mix_layer_depth',
    'e_snow_cover',
    'e_surface_runoff_sum',
    'e_snow_and_ice',
    'e_crops',
    'e_evaporation_from_the_top_of_canopy_sum',
    'e_shrub_and_scrub',
]

# Extract fixed variables; entries that cannot be parsed as numbers become NaN
fixed_features = gdf[fixed_columns].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Fill null values with the mean of neighboring nodes.
#
# For every NaN cell (row i, column col):
#   1. use the mean of that column over the graph neighbours of node i;
#   2. if node i has no neighbours, or that mean is itself NaN, fall back to
#      the mean of the whole column.
# Cells are filled in place, row by row, so values imputed for earlier rows
# take part in the neighbour/column means computed for later rows.
#
# NOTE(review): `G` is defined outside this cell. Its node ids are used both as
# index labels (`.at[i, ...]`) and as row positions (`.iloc[...]`); the two only
# agree while `fixed_features` keeps a default 0..n-1 index -- confirm before
# filtering or re-indexing the frame.
nan_mask = fixed_features.isna()  # snapshot is safe: a cell is only written when it is visited
for i in nan_mask.index[nan_mask.any(axis=1)]:
    neighbor_indices = list(G.neighbors(i))
    for col in nan_mask.columns[nan_mask.loc[i].to_numpy()]:
        mean_value = float('nan')
        if neighbor_indices:  # Make sure there are neighbors
            mean_value = fixed_features.iloc[neighbor_indices][col].mean()
        if pd.isna(mean_value):
            # No neighbours, or all neighbour values missing: use the global column mean
            mean_value = fixed_features[col].mean()
        fixed_features.at[i, col] = mean_value

# Check again if there are any NaN values (a column with no observed values
# at all has a NaN mean and therefore cannot be filled)
print("Remaining NaN count:", fixed_features.isna().sum().sum())

# Calculate VIF
def calculate_vif(df):
    """Return a table with one variance inflation factor per column of *df*.

    The result has two columns: "Variable" (the column names of ``df``) and
    "VIF" (``variance_inflation_factor`` evaluated on ``df.values`` for each
    column position).

    NOTE(review): the design matrix is passed as-is, without an added
    constant/intercept column -- confirm that this is the intended VIF variant.
    """
    design_matrix = df.values
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(df.shape[1])
    ]
    return pd.DataFrame({"Variable": df.columns, "VIF": scores})
# Remove pandas' row/column display limits so the whole VIF table is printed
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Compute the VIF of every selected feature and show the result
vif_df = calculate_vif(fixed_features)
print(vif_df)

# Standardize the feature matrix and convert it to a float tensor
scaler = StandardScaler()
final_features_scaled = scaler.fit_transform(fixed_features)
final_features_tensor = torch.tensor(final_features_scaled, dtype=torch.float)

# Prepare target variables.
# Exactly one target column is active; the alternatives are:
#   'o_diabetes_quantity_per_capita', 'o_hypertension_quantity_per_capita',
#   'o_asthma_quantity_per_capita', 'o_anxiety_quantity_per_capita',
#   'o_opioids_quantity_per_capita', 'o_total_quantity_per_capita'
target_column = 'o_depression_quantity_per_capita'

# Coerce the whole column to numeric in one vectorised call; unparsable entries
# become NaN and are then replaced by 0.
# NOTE(review): filling missing targets with 0 treats "unknown" as "zero
# quantity" -- confirm this is intended rather than masking those nodes.
target = pd.to_numeric(gdf[target_column], errors='coerce').fillna(0)

# Shape (N,) -> (N, 1): add a dimension to match the model output
targets = torch.tensor(target.values, dtype=torch.float).unsqueeze(1)

# Convert NetworkX graph to PyTorch Geometric data
# (`G` is the graph defined outside this cell)
from torch_geometric.utils import from_networkx
data = from_networkx(G)
# Attach the standardized feature tensor as `x` and the target tensor as `y`.
# NOTE(review): assumes the row order of these tensors matches the node order
# produced by from_networkx -- confirm against how G was built.
data.x = final_features_tensor
data.y = targets