# Adding Extra Data to Our Surnames

#### Imports

In [1]:
import json
import pandas as pd
import numpy as np
import phonetics as ph
import re
import geopandas as gp
from tqdm.notebook import tqdm
tqdm.pandas()
import os
import time
import platform
if platform.node() == 'Nick_Laptop':
    drive = 'C'
elif platform.node() == 'MSI':
    drive = 'D'
else:
    drive = 'uhhhhhh'
    print('Uhhhhhhhhhhhhh')
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')

#%% Globals
PROCESSED = 'Data/Processed'
RAW = 'Data/Raw'
MODELS = f'Code/ml_models/'
SURNAMES = f'{PROCESSED}/surname_info'

#### Loading

In [2]:
# Parish dataframe
pdf = pd.read_csv(f'{PROCESSED}/devon_parish_flows.csv')
pdf.rename(columns={
    'PAR': 'parish'
}, inplace=True)

# Subsidy master dataframe
mdf = pd.read_csv(f'{PROCESSED}/master_subsidy_data_final.csv')
mdf.dropna(inplace=True, subset=['parish'])
mdf_copy = mdf.copy()
with open(f'{PROCESSED}/parish_correction.json', 'r') as f:
    parish_correction = json.load(f)

pdf['parish'] = pdf['parish'].str.replace('St. ', 'St ', regex=False)
pdf['parish'] = pdf['parish'].str.replace(',', '', regex=False)
pdf['parish'] = pdf['parish'].str.replace('\'', '', regex=False)
pdf['parish'] = pdf['parish'].str.title()
pdf['parish'] = pdf['parish'].apply(lambda x: parish_correction.get(x, x))
mdf['parish'] = mdf['parish'].apply(lambda x: parish_correction.get(x, x))

pdf.loc[pdf['parish'].str.contains('Dartmouth', regex=False), 'parish'] = 'Dartmouth'
mdf.loc[mdf['parish'].str.contains('Woolfardisworthy', regex=False), 'parish'] = 'Woolfardisworthy'
mdf.loc[mdf['parish'].str.contains('Burlescombe', regex=False), 'parish'] = 'Burlescombe'
mdf.loc[mdf['parish'].str.contains('Littleham', regex=False), 'parish'] = 'Littleham'
mdf.loc[mdf['parish'].str.contains('Uplowman', regex=False), 'parish'] = 'Uplowman'

pdf_parish_set = set(pdf['parish'].unique())
mdf_parish_set = set(mdf['parish'].unique())

print(pdf_parish_set - mdf_parish_set)
print(mdf_parish_set - pdf_parish_set)

{'Bulkworthy', 'Plymouth St Andrew', 'Dotton', 'Exeter Castle Yard', 'Exeter Cathedral', 'East Putford', 'Stockland', 'Rousdon', 'Lundy', 'Exeter Bedford Precinct'}
{'West Teignton', 'Pennycross', 'Maker', 'Oakford', 'Thorncombe'}


#### Merging the Dataframes

In [3]:
new_df = pd.merge(
    mdf,
    pdf[['parish', 'landOwned', 'distmkt', 'distriver', 'area']]
)

In [4]:
# Get taxpayer count for each parish in each subsidy year

parish_data_dict = {}
for subsidy in [
    1524, 
    1543,
    1581,
    1674
]:
    year_dict = {}
    # Get number of taxpayers 
    taxpayer_count = mdf_copy[mdf_copy['year'] == subsidy].groupby('parish').size().reset_index(name='taxpayer_count')
    count_dict = taxpayer_count.set_index('parish')['taxpayer_count'].to_dict()
    year_dict['taxpayer_count'] = count_dict
    # Get average value of subsidy for each parish in each subsidy year
    avg_value = mdf_copy[mdf_copy['year'] == subsidy].groupby('parish')['value'].mean().reset_index(name='avg_value')
    avg_value_dict = avg_value.set_index('parish')['avg_value'].to_dict()
    year_dict['avg_value'] = avg_value_dict
    # Get population density for each parish in each subsidy year
    area_dict = pdf.set_index('parish')['area'].to_dict()
    pop_density_dict = {}
    for parish, area in area_dict.items():
        if area > 0:
            pop_density_dict[parish] = count_dict.get(parish, 0) / area
        else:
            pop_density_dict[parish] = 0
    year_dict['pop_density'] = pop_density_dict

    parish_data_dict[subsidy] = year_dict

for k, v in parish_data_dict.items():
    print(f'Year: {k}')
    print(f'Taxpayer count: {len(v["taxpayer_count"])}')
    print(f'Avg value: {len(v["avg_value"])}')
    print(f'Pop density: {len(v["pop_density"])}')
    print()



for subsidy in [
    1524, 
    1543,
    1581,
    1674
]:
    new_df[f'taxpayer_count_{subsidy}'] = new_df['parish'].map(parish_data_dict[subsidy]['taxpayer_count'])
    new_df[f'avg_value_{subsidy}'] = new_df['parish'].map(parish_data_dict[subsidy]['avg_value'])
    new_df[f'pop_density_{subsidy}'] = new_df['parish'].map(parish_data_dict[subsidy]['pop_density'])
    
# Save the new dataframe
new_df.to_csv(f'{PROCESSED}/master_subsidy_data_final_with_parish_info.csv', index=False)

Year: 1524
Taxpayer count: 430
Avg value: 430
Pop density: 468

Year: 1543
Taxpayer count: 427
Avg value: 427
Pop density: 468

Year: 1581
Taxpayer count: 449
Avg value: 449
Pop density: 468

Year: 1674
Taxpayer count: 321
Avg value: 321
Pop density: 468

