# Imports

In [23]:
import numpy as np
import pandas as pd
import geopandas as gpd

# Load dataset

In [24]:
# Read the combined dataset from its GeoPackage file.
combined_path = "datasets/3_combined/df.gpkg"
raw_df = gpd.read_file(combined_path)

In [25]:
# Work on a deep copy so that raw_df keeps the data exactly as loaded.
df = raw_df.copy(deep=True)

# Calculate mean age per LSOA

In [26]:
# Column names holding the per-age counts: f0..f90 and m0..m90.
f_cols = [f"f{i}" for i in range(91)]
m_cols = [f"m{i}" for i in range(91)]
age_cols = f_cols + m_cols

# Age value matching each entry of age_cols (0..90 for f, then 0..90 for m).
ages = np.tile(np.arange(91), 2)

# Weighted mean age per row, computed for all rows at once instead of one
# np.average call per row:  sum(age * count) / sum(count).
# NOTE(review): if the "90" columns are an open-ended "90+" band, the mean is
# biased slightly low - confirm against the data dictionary.
age_counts = df[age_cols].to_numpy(dtype=float)
weighted_age_sum = (age_counts * ages).sum(axis=1)
population = age_counts.sum(axis=1)

# Rows whose counts sum to zero have no defined mean age; they become NaN
# rather than raising ZeroDivisionError and aborting the whole computation.
with np.errstate(divide="ignore", invalid="ignore"):
    df["mean_age"] = weighted_age_sum / population

# Calculate gender ratio per LSOA

In [27]:
# Per-row totals across the f* columns and across the m* columns.
f_total = df[f_cols].sum(axis="columns")
m_total = df[m_cols].sum(axis="columns")

# Ratio of the f total to the m total for each row.
df["f_m_ratio"] = f_total.div(m_total)

# Drop individual age and gender cols

In [28]:
# The per-age columns and the "total" column are no longer needed once
# mean_age and f_m_ratio have been derived; remove them in one pass.
redundant_cols = [*age_cols, "total"]
df = df.drop(columns=redundant_cols)

In [29]:
# Write the intermediate result (age and gender features) as a GeoPackage.
age_gender_out_path = "datasets/4_fe/df_with_age_gender_fe.gpkg"
df.to_file(age_gender_out_path, driver="GPKG")

# Calculate most prevalent ethnicity per LSOA

In [30]:
# Names of the per-ethnicity columns, grouped by broad category.
ethnicity_cols = [
    # White
    "white_british",
    "white_irish",
    "white_other",
    "white_gypsy/irish_traveller",
    "white_roma",
    # Asian
    "asian_bangladeshi",
    "asian_indian",
    "asian_pakistani",
    "asian_chinese",
    "asian_other",
    # Black
    "black_african",
    "black_caribbean",
    "black_other",
    # Mixed
    "mixed_white_and_asian",
    "mixed_white_and_black_african",
    "mixed_white_and_black_caribbean",
    "mixed_other",
    # Other
    "other_arab",
    "any_other",
]

In [31]:
# For each row, record the name of the ethnicity column with the largest value.
ethnicity_values = df[ethnicity_cols]
df["most_prevalent_ethnicity"] = ethnicity_values.idxmax(axis="columns")

In [32]:
# One-hot encode the most prevalent ethnicity into integer "prevalent_*"
# indicator columns. dtype="int" already yields integer columns, so no
# further cast is needed afterwards.
# NOTE: drop_first=True omits the first category level, which becomes the
# implicit reference category; which label that is depends on the values
# actually present in the data.
ethnicity_dummies = pd.get_dummies(
    df["most_prevalent_ethnicity"],
    prefix="prevalent",
    dtype="int",
    drop_first=True,
)
df = pd.concat([df, ethnicity_dummies], axis=1)

# The helper label column and the raw per-ethnicity columns are superseded by
# the indicator columns; remove them together.
df = df.drop(columns=["most_prevalent_ethnicity", *ethnicity_cols])

# Save output

In [33]:
# Write the fully feature-engineered dataset as a GeoPackage.
full_fe_out_path = "datasets/4_fe/df_with_full_fe.gpkg"
df.to_file(full_fe_out_path, driver="GPKG")