# Parish Dataset Maker

#### Imports

In [1]:
import os

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

tqdm.pandas()
import os
import ast
import json
import shutil
import platform

# Drive letter holding the project checkout on each known workstation,
# keyed by the hostname reported by platform.node().
_DRIVE_BY_HOST = {
    'Nick_Laptop': 'C',
    'MSI': 'D',
}

_host = platform.node()
if _host not in _DRIVE_BY_HOST:
    # Stop here with an actionable message rather than handing os.chdir a
    # placeholder drive letter and getting an opaque path error.
    raise RuntimeError(
        f'Unrecognised machine {_host!r}: add it to _DRIVE_BY_HOST together '
        'with the drive letter that holds the project checkout.'
    )
drive = _DRIVE_BY_HOST[_host]

# Every relative path used below is resolved against the project root.
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')

# Data directories, relative to the project root.
PROCESSED = 'Data/Processed'
RAW = 'Data/Raw'
SURNAMES = f'{PROCESSED}/surname_info'

#### Loading

In [2]:
def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Lookup tables stored alongside the processed data.
parish_name_correction = _load_json(f'{PROCESSED}/parish_correction.json')
hundred_name_correction = _load_json(f'{PROCESSED}/hundred_correction.json')
ph_dict = _load_json(f'{PROCESSED}/parishes_to_hundreds.json')
hundreds_damage = _load_json(f'{PROCESSED}/hundreds_damage.json')

# Lookup tables stored in the surname_info directory.
treated_id_dict = _load_json(f'{SURNAMES}/treated_id_dict.json')
control_id_dict = _load_json(f'{SURNAMES}/control_id_dict.json')
    
    
# Load the master subsidy table.
# NOTE(review): the exported notebook echoes this read_csv call beneath the
# cell, which looks like a pandas warning pointing at this line (possibly a
# DtypeWarning) — confirm, and consider passing explicit dtypes if so.
mdf = pd.read_csv(
    f'{PROCESSED}/master_subsidy_data_final_with_parish_info.csv',
    encoding='utf-8',
)

# Helper columns for the aggregations further down: `count` is 1 on every
# row and `max_val` is a copy of `value`.
mdf = mdf.assign(count=1, max_val=mdf['value'])

# Replace parish names that have an entry in the correction table; names
# without an entry are left as they are.
mdf['parish'] = mdf['parish'].map(
    lambda name: parish_name_correction.get(name, name)
)

# Load the tithe landowner table and expose `area_perches` under the name
# `value`, the column name the master subsidy table uses.
tdf = pd.read_csv(
    f'{PROCESSED}/tithe_landowners_final.csv', encoding='utf-8'
).rename(columns={'area_perches': 'value'})

# Helper columns for the aggregations further down: `count` is 1 on every
# row and `max_val` is a copy of `value`.
tdf = tdf.assign(count=1, max_val=tdf['value'])

# Title-case the parish names, then apply the correction table.
# NOTE(review): the table is applied twice here (Series.replace, then a
# per-element dict lookup), whereas the other tables apply it once. Both
# passes are kept to preserve behaviour — confirm whether the second pass
# is intentional.
title_cased = tdf['parish'].str.title()
first_pass = title_cased.replace(parish_name_correction)
tdf['parish'] = first_pass.map(
    lambda name: parish_name_correction.get(name, name)
)

# Columns kept from the parish flows file, in the order they should appear.
_PARISH_FLOW_COLUMNS = [
    'PAR', 'hundred', 'terrain', 'mean_elev', 'mean_slope', 'wheatsuit',
    'lspc1332', 'agsh1370', 'indsh1370', 'ind_1831', 'agr_share', 'agr_1831',
    'ind_share', 'mills_1400', 'gent_1400', 'NrPatents', 'copys_1850',
    'mills', 'NrGentry', 'area', 'landOwned', 'WheatYield', 'copys_1516',
    'hrv_land', 'lspc1525', 'distriver', 'distmkt', 'distcoal', 'latitude',
    'longitude',
]

# Take an explicit copy of the column subset: assigning new columns to a
# bare `df[[...]]` selection is what triggers pandas' SettingWithCopyWarning.
# (Selecting after the read, rather than via `usecols`, keeps the column
# order given in the list above.)
pdf = pd.read_csv(f'{PROCESSED}/devon_parish_flows.csv')
pdf = pdf[_PARISH_FLOW_COLUMNS].copy()

# Rescale hrv_land by 240.
# NOTE(review): presumably a pounds-to-pence conversion (240d = £1) — confirm.
pdf['hrv_land'] = pdf['hrv_land'] * 240

# Indicator: 1 where hrv_land is strictly positive, 0 otherwise (missing
# values compare as not-greater-than and therefore get 0).
pdf['hrv_dums'] = (pdf['hrv_land'] > 0).astype('int64')

# Bring the parish key in line with the other tables: same column name,
# title case, then the name-correction table.
pdf = pdf.rename(columns={'PAR': 'parish'})
pdf['parish'] = pdf['parish'].str.title()
pdf['parish'] = pdf['parish'].apply(lambda x: parish_name_correction.get(x, x))




  mdf = pd.read_csv(f'{PROCESSED}/master_subsidy_data_final_with_parish_info.csv', encoding='utf-8')


#### Group Master DF by Parish, Year

In [3]:
# Identifier columns; one output file is written per identifier.
id_vars = ['unique_id', 'group_id', 'combined_id', 'metaphone_id', 'master_id']

# Aggregations shared by every identifier.
agg_dict = {'value': 'sum', 'count': 'sum', 'max_val': 'max', 'surname': 'first'}

for id_var in id_vars:
    # 'unique_id' -> 'unique', etc.; this prefix names the matching
    # `<prefix>_treatment` / `<prefix>_control` columns.
    id_type = id_var.split('_')[0]
    new_agg_dict = {
        **agg_dict,
        f'{id_type}_treatment': 'mean',
        f'{id_type}_control': 'mean',
    }

    # Subsidy data: aggregate per (parish, year, identifier), then pivot the
    # years into columns named `<stat>_<year>`.
    gdf = (
        mdf.groupby(['parish', 'year', id_var])
        .agg(new_agg_dict)
        .rename(columns={'value': 'tot_val'})
    )
    gdf['avg_val'] = gdf['tot_val'] / gdf['count']
    reshaped = gdf.unstack('year')
    reshaped.columns = [f'{stat}_{yr}' for stat, yr in reshaped.columns]
    reshaped = reshaped.reset_index()

    # Tithe data: aggregate per (parish, identifier) and suffix the
    # statistics with `_1840`.
    gtdf = (
        tdf.groupby(['parish', id_var])
        .agg(new_agg_dict)
        .rename(columns={'value': 'tot_val'})
    )
    gtdf['avg_val'] = gtdf['tot_val'] / gtdf['count']
    suffixed = {
        name: f'{name}_1840'
        for name in gtdf.columns
        if not ('id' in name or 'parish' in name)
    }
    gtdf = gtdf.rename(columns=suffixed).reset_index()

    # Outer join keeps (parish, identifier) pairs present in either source.
    merged = reshaped.merge(gtdf, on=['parish', id_var], how='outer')
    merged.to_csv(
        f'{SURNAMES}/parishes_{id_var}_subsidy_data.csv',
        index=False,
        encoding='utf-8',
    )

#### Aggregate the Surname Df, Add Parish Flows Vars

In [5]:
# Parish-level aggregation: base statistics first; the per-identifier
# treatment/control columns are appended in the loop below.
agg_dict = {
    'value': 'sum',
    'count': 'sum',
    'max_val': 'max',
}

for id_var in id_vars:
    id_type = id_var.split('_')[0]
    treatment_col = f'{id_type}_treatment'
    control_col = f'{id_type}_control'

    # Mean of each flag per group.
    agg_dict[treatment_col] = 'mean'
    agg_dict[control_col] = 'mean'

    # `<flag>_value` is a plain copy of the flag column, aggregated by sum.
    # NOTE(review): the `_value` suffix suggests a value-weighted total, but
    # the column holds only the flag itself — confirm this is intended.
    for flag_col in (treatment_col, control_col):
        mdf[f'{flag_col}_value'] = mdf[flag_col]
        tdf[f'{flag_col}_value'] = tdf[flag_col]
        agg_dict[f'{flag_col}_value'] = 'sum'

# Subsidy data: one row per (parish, year), pivoted so each year becomes a
# set of `parish_<stat>_<year>` columns.
gdf = mdf.groupby(['parish', 'year']).agg(agg_dict)
gdf['avg_val'] = gdf['value'] / gdf['count']
reshaped = gdf.unstack('year')
flat_names = [f'{col}_{year}' for col, year in reshaped.columns]
# The replace is done outside the f-string: reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
# NOTE(review): `value_` is shortened to `val_` here but not for the 1840
# columns below (e.g. `parish_value_1840`) — confirm the asymmetry is wanted.
reshaped.columns = [
    'parish_' + name.replace('value_', 'val_') for name in flat_names
]
reshaped.reset_index(inplace=True)

# Tithe data: one row per parish, statistics suffixed with `_1840`.
gtdf = tdf.groupby(['parish']).agg(agg_dict)
gtdf['avg_val'] = gtdf['value'] / gtdf['count']
gtdf.columns = [f'parish_{x}_1840' for x in gtdf.columns]

# `parish` is a column of `reshaped` and the index name of `gtdf`; merging
# on it keeps parishes that appear in either source.
merged = pd.merge(reshaped, gtdf, on='parish', how='outer')

# Attach the parish flows variables where a matching parish exists.
merged = pd.merge(merged, pdf, on='parish', how='left')

# Drop rows whose parish name is missing or empty.
merged = merged.loc[merged['parish'].notna() & (merged['parish'] != '')]

merged.to_csv(f'{PROCESSED}/parish_dataset.csv', index=False, encoding='utf-8')