# Imports

In [30]:
import numpy as np
import pandas as pd
import geopandas as gpd

# Load data

In [31]:
# Location of the combined GeoPackage read as the starting dataset.
combined_path = "datasets/3_combined/df.gpkg"
raw_df = gpd.read_file(combined_path)

In [32]:
# Two independent working copies of the loaded data: df_fe is the frame that
# receives engineered features, df_full the one converted to proportions.
# raw_df itself is left untouched.
df_fe, df_full = raw_df.copy(), raw_df.copy()

# Full df

## Convert figures to proportions

In [33]:
# Per-age column names: f0..f90 and m0..m90 (91 columns for each prefix).
f_cols, m_cols = ([f"{prefix}{age}" for age in range(91)] for prefix in "fm")
# All f-columns first, followed by all m-columns.
age_cols = [*f_cols, *m_cols]

# Ethnicity column names, one per line, in their original order.
ethnicity_cols = [
    'white_british',
    'white_irish',
    'white_gypsy/irish_traveller',
    'white_roma',
    'white_other',
    'mixed_white_and_asian',
    'mixed_white_and_black_african',
    'mixed_white_and_black_caribbean',
    'mixed_other',
    'asian_bangladeshi',
    'asian_chinese',
    'asian_indian',
    'asian_pakistani',
    'asian_other',
    'black_african',
    'black_caribbean',
    'black_other',
    'other_arab',
    'any_other',
]

In [34]:
# Convert the raw figures to per-row proportions. A zero denominator is
# replaced by NaN so the affected rows come out as NaN; each denominator is
# built once and applied to all of its columns in a single vectorised divide
# instead of being rebuilt inside a per-column loop.
age_denominator = df_full['total'].replace(0, np.nan)
df_full[age_cols] = df_full[age_cols].div(age_denominator, axis = 0)

ethnicity_denominator = df_full['all_usual_residents'].replace(0, np.nan)
df_full[ethnicity_cols] = df_full[ethnicity_cols].div(ethnicity_denominator, axis = 0)

## Drop total residents features

In [35]:
# The two denominator columns are removed from df_full after the
# proportion conversion.
total_cols = ["total", "all_usual_residents"]
df_full = df_full.drop(columns = total_cols)

# Df with feature engineering

## Age and gender

### Calculate mean age

In [36]:
# Ages 0..90 repeated twice so each entry lines up with the matching column
# of age_cols (f0..f90 followed by m0..m90).
ages = np.tile(np.arange(91), 2)

# Weighted mean of ages per row, using the age-column values as weights.
# Computed for the whole frame at once rather than with a Python call per row.
age_weights = df_fe[age_cols].to_numpy(dtype = float)
weight_sums = age_weights.sum(axis = 1)
# A row whose weights sum to zero has no defined mean: np.average would raise
# ZeroDivisionError for it, so the denominator becomes NaN and the row's
# mean_age is NaN instead of aborting the run.
weight_sums = np.where(weight_sums == 0, np.nan, weight_sums)
df_fe["mean_age"] = (age_weights * ages).sum(axis = 1) / weight_sums

### Calculate gender ratio

In [37]:
# Row-wise sums over the f-columns and the m-columns respectively.
f_total, m_total = (df_fe[cols].sum(axis = "columns") for cols in (f_cols, m_cols))
# Ratio of the f total to the m total for each row.
# NOTE(review): a row with m_total == 0 yields inf (or NaN when f_total is
# also 0) — confirm downstream steps tolerate that.
df_fe["f_m_ratio"] = f_total.div(m_total)

### Drop individual age and gender features

In [38]:
# Remove every per-age column together with the two total columns in one call.
df_fe = df_fe.drop(columns = [*age_cols, "total", "all_usual_residents"])

## Ethnicity

### Calculate most prevalent ethnicity

In [39]:
# For each row, record the name of the ethnicity column holding the largest
# value.
ethnicity_values = df_fe[ethnicity_cols]
df_fe["most_prevalent_ethnicity"] = ethnicity_values.idxmax(axis = "columns")

### Create dummy features

In [40]:
# One-hot encode the most prevalent ethnicity into "prevalent_<name>" columns.
# dtype = "int" already makes get_dummies return integer columns, so no
# follow-up cast is needed.
ethnicity_dummies = pd.get_dummies(df_fe["most_prevalent_ethnicity"], prefix = "prevalent", dtype = "int")
df_fe = pd.concat([df_fe, ethnicity_dummies], axis = 1)
# The label column and the original ethnicity columns are now represented by
# the dummies; drop them together.
df_fe = df_fe.drop(columns = ["most_prevalent_ethnicity", *ethnicity_cols])

## Drop features with VIF > 10

In [41]:
# Columns removed under the "VIF > 10" heading.
# NOTE(review): the VIF calculation itself is not part of this file; the list
# is hard-coded, so re-check it if the input data changes.
high_vif_cols = [
    "prevalent_asian_indian",
    "prevalent_black_african",
    "prevalent_white_british",
    "prevalent_white_other",
    "mean_age",
]
df_low_vif = df_fe.drop(columns = high_vif_cols)

# Save output

In [42]:
# Write each of the three frames to its own GeoPackage file, in the same
# order as before: df_full, df_fe, df_low_vif.
outputs = {
    "datasets/4_fe/df_full.gpkg": df_full,
    "datasets/4_fe/df_fe.gpkg": df_fe,
    "datasets/4_fe/df_low_vif.gpkg": df_low_vif,
}
for out_path, frame in outputs.items():
    frame.to_file(out_path, driver = "GPKG", index = False)