# Generating Status Measurements

#### Imports

In [1]:
import os
import shutil
import re
import ast
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()
import phonetics as ph
import platform

# Drive letter that holds the project checkout on each known workstation,
# keyed by the hostname reported by platform.node().
_DRIVE_BY_HOST = {
    'Nick_Laptop': 'C',
    'MSI': 'D',
}

_host = platform.node()
if _host not in _DRIVE_BY_HOST:
    # Fail here with an actionable message rather than falling through to a
    # made-up drive and letting os.chdir raise a confusing path error.
    raise RuntimeError(
        f'Unrecognised machine {_host!r}: add it to _DRIVE_BY_HOST with the '
        'drive letter that holds PhD/DissolutionProgramming/LND---Land-Paper'
    )
drive = _DRIVE_BY_HOST[_host]

# All data paths below are relative, so make the project root the working directory.
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')

PROCESSED = 'Data/Processed'
RAW = 'Data/Raw'
SURNAMES = f'{PROCESSED}/surname_info'


#### Load Data

In [2]:
# Subsidy data. If the file uses the gemini_* column names, rename them to the
# plain names (surname / parish / value) that the rest of the notebook uses.
mdf = pd.read_csv(f'{PROCESSED}/master_subsidy_data_final_with_parish_info.csv', encoding='utf-8')
if 'gemini_surname' in mdf.columns:
    mdf = mdf.rename(columns={'gemini_surname': 'surname',
                              'gemini_parish': 'parish',
                              'gemini_value': 'value'})

# Keep only rows that have a surname, and whose surname is title-cased.
# The NaN filter must come first: .str.istitle() yields NaN for missing values,
# which cannot be used as a boolean mask.
mdf = mdf.loc[mdf['surname'].notna()]
# .copy() so later column assignments act on an independent frame rather than
# on a slice of the frame returned by read_csv.
mdf = mdf[mdf['surname'].str.istitle()].copy()

phdf = pd.read_csv(f'{RAW}/parishes_hundreds.csv', encoding='utf-8')
tdf = pd.read_csv(f'{PROCESSED}/tithe_landowners_final.csv', encoding='utf-8')

# Calendar of Particulars Data
cdf = pd.read_csv(f'{PROCESSED}/calendar_recipients_final.csv', encoding='utf-8')
cdf = cdf.loc[cdf['surname'].notna()].copy()
cdf['surname'] = cdf['surname'].str.title()
cdf['surname'] = cdf['surname'].str.replace('St Hill', 'Sainthill', regex=False)
# '?' entries in in_devon are treated as 0 before the column is made numeric.
cdf['in_devon'] = cdf['in_devon'].replace({'?': 0})
cdf['in_devon'] = cdf['in_devon'].astype(float)

# Distinct recipient ids under each of the five id schemes.
recipient_unique_ids = cdf['unique_id'].unique()
recipient_group_ids = cdf['group_id'].unique()
recipient_combined_ids = cdf['combined_id'].unique()
recipient_metaphone_ids = cdf['metaphone_id'].unique()
recipient_master_ids = cdf['master_id'].unique()

# NOTE: the subsidy CSV is intentionally not read a second time here. A repeat
# read_csv into mdf would discard the column rename and the surname filters
# applied above.


#### Adjustment for Land

In [3]:
# Scale `value` by 1.47 on every row whose gemini_type is 'L'; other rows are untouched.
land_rows = mdf['gemini_type'] == 'L'
mdf.loc[land_rows, 'value'] = mdf.loc[land_rows, 'value'] * 1.47

#### Parish and Hundred Dictionaries

In [4]:
# Lookup tables stored as JSON objects:
#   parish_name_correction  parish name as written -> corrected parish name
#   hundo_dict              parish name -> hundred
# The encoding is given explicitly so the files are decoded the same way on
# every machine instead of depending on the platform's locale default.
with open(f'{PROCESSED}/parish_correction.json', 'r', encoding='utf-8') as j:
    parish_name_correction = json.load(j)
with open(f'{PROCESSED}/parishes_to_hundreds.json', 'r', encoding='utf-8') as f:
    hundo_dict = json.load(f)

In [5]:
# %% Correcting hundreds


def _corrected_parish(name):
    """Return the corrected spelling of *name*, or *name* unchanged if no correction is listed."""
    return parish_name_correction.get(name, name)


def _hundred_of(name):
    """Return the hundred that parish *name* belongs to, or '' if the parish is not in the lookup."""
    return hundo_dict.get(name, '')


# Subsidy rows: title-case and correct the parish name, drop Northcott Hamlet,
# then attach the hundred.
mdf['parish'] = mdf['parish'].str.title().apply(_corrected_parish)
mdf = mdf.loc[~mdf['parish'].isin(['Northcott Hamlet'])]
mdf['hundred'] = mdf['parish'].apply(_hundred_of)

# Tithe rows: same normalisation, no rows dropped.
tdf['parish'] = tdf['parish'].str.title().apply(_corrected_parish)
tdf['hundred'] = tdf['parish'].apply(_hundred_of)

# Diagnostics: which subsidy page images / tithe parishes could not be matched to a hundred.
print(mdf.loc[mdf['hundred'] == '', 'image_name'].unique())
print(tdf.loc[tdf['hundred'] == '', ['parish']])

['little_subsidy1524_page_075.png' 'little_subsidy1524_page_102.png'
 'little_subsidy1524_page_149.png' 'little_subsidy1524_page_150.png'
 'little_subsidy1524_page_212.png' 'little_subsidy1543_page_095.png'
 'little_subsidy1543_page_135.png' 'little_subsidy1581_page_052.png'
 'little_subsidy1581_page_054.png' 'little_subsidy1674_page_132.png'
 'little_subsidy1674_page_202.png']
          parish
111132  Townstal
111133  Townstal
111134  Townstal
111135  Townstal
111136  Townstal
...          ...
111760  Townstal
111761  Townstal
111762  Townstal
111763  Townstal
469436          

[633 rows x 1 columns]


#### Getting surname-group ranks based on all five types of IDs

In [6]:
def _aligned(ids, grouped, column, default):
    """Look up ``grouped[column]`` for every id in *ids*.

    *grouped* must be indexed by id. Ids that are absent from the index get
    *default*. For a non-NaN default the result is cast back to the dtype of
    the grouped column, so integer counts stay integers.
    """
    table = grouped[column]
    found = ids.isin(table.index)
    values = ids.map(table).where(found, default)
    if not pd.isna(default):
        values = values.astype(table.dtype)
    return values


# Each row is one record. max_val is a copy of value so that one aggregation
# can report both the largest single entry ('max') and the total ('sum').
mdf['count'] = 1
mdf['max_val'] = mdf['value']

# The tithe landowner data carries area_perches; expose it as `value` so it is
# aggregated with the same spec as the subsidy data.
tdf.rename(columns={'area_perches': 'value'}, inplace=True)
tdf['count'] = 1
tdf['max_val'] = tdf['value']

ID_TYPES = ['unique', 'group', 'combined', 'metaphone', 'master']
TITHE_YEAR = 1840  # label for the tithe data (tdf); it is not filtered by year
YEARS = [1524, 1543, 1581, 1674, TITHE_YEAR]
MEASURES = ['count', 'max_val', 'tot_val', 'avg_val']

# Aggregation shared by both sources.
base_agg = {
    'surname': 'first',
    'count': 'sum',
    'max_val': 'max',
    'value': 'sum',
}

# Parish-level covariates that are averaged per id (subsidy data only).
mean_vars = ['landOwned', 'distmkt', 'distriver', 'area'] + [
    x for x in mdf.columns
    if 'taxpayer_count' in x or 'avg_value' in x or 'pop_density' in x
]
# Kept separate from base_agg so the tithe pass can never clobber the subsidy
# spec, whatever order the years are processed in.
subsidy_agg = {**base_agg, **{var: 'mean' for var in mean_vars}}

rename_mean_vars = {var: f'parish_{var}' for var in mean_vars}
rename_mean_vars['landOwned'] = 'parish_monastic_land'

for id_type in ID_TYPES:
    id_var = f'{id_type}_id'
    # One output row per id that occurs in the subsidy data.
    id_df = pd.DataFrame({'id': mdf[id_var].unique()})
    id_df['surname'] = ''

    for subsidy in YEARS:
        is_tithe = subsidy == TITHE_YEAR
        if is_tithe:
            gdf = tdf.groupby(id_var).agg(base_agg).reset_index()
        else:
            gdf = mdf.loc[mdf['year'] == subsidy].groupby(id_var).agg(subsidy_agg).reset_index()
        gdf.rename(columns={'value': 'tot_val'}, inplace=True)

        # Index by id once so every column lookup is an aligned map rather
        # than a scan of gdf per id.
        lookup = gdf.set_index(id_var)
        ids = id_df['id']
        present = ids.isin(lookup.index)

        # Take the surname from this source where the id occurs in it, and
        # otherwise keep what an earlier source supplied, so ids missing from
        # the final source do not end up with a blank surname.
        id_df['surname'] = ids.map(lookup['surname']).where(present, id_df['surname'])

        id_df[f'count_{subsidy}'] = _aligned(ids, lookup, 'count', 0)
        id_df[f'max_val_{subsidy}'] = _aligned(ids, lookup, 'max_val', 0)
        id_df[f'tot_val_{subsidy}'] = _aligned(ids, lookup, 'tot_val', 0)

        # Average value per record; 0 for ids with no records in this source.
        n_records = id_df[f'count_{subsidy}']
        id_df[f'avg_val_{subsidy}'] = (
            id_df[f'tot_val_{subsidy}'].div(n_records).where(n_records > 0, 0)
        )

        for measure in MEASURES:
            pct = id_df[f'{measure}_{subsidy}'].rank(pct=True)
            # Year-specific percentile rank.
            id_df[f'{measure}_pctile_{subsidy}'] = pct
            # Unsuffixed column kept for existing consumers of the CSV. It is
            # overwritten on every pass, so it ends up holding the rank for
            # the last year processed.
            id_df[f'{measure}_pctile'] = pct

        if not is_tithe:
            # NOTE(review): these columns are overwritten on every subsidy
            # pass, so the saved values come from the last subsidy year only
            # (NaN for ids absent from it). Confirm that is intended.
            for var in mean_vars:
                id_df[var] = _aligned(ids, lookup, var, np.nan)

    id_df.rename(columns=rename_mean_vars, inplace=True)

    # Add ln(x + 1) for every numeric column. Iterate over a snapshot of the
    # column list because the loop adds columns as it goes.
    for var in list(id_df.columns):
        if not pd.api.types.is_numeric_dtype(id_df[var]):
            continue
        id_df[f'ln_{var}'] = np.log(id_df[var] + 1)

    id_df.to_csv(f'{SURNAMES}/{id_var}_subsidy_data_adjusted.csv', index=False, encoding='utf-8')


