# Assigning Surname IDs

#### Imports

In [3]:
import json
import pandas as pd
import numpy as np
import phonetics as ph
import re
from tqdm.notebook import tqdm
tqdm.pandas()
import os
import unidecode
import platform
import torch
import torch.nn as nn
import torch.nn.functional as F
from rapidfuzz import fuzz
import time
if platform.node() == 'Nick_Laptop':
    drive = 'C'
elif platform.node() == 'MSI':
    drive = 'D'
else:
    drive = 'uhhhhhh'
    print('Uhhhhhhhhhhhhh')
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
#%% Globals
PROCESSED = 'Data/Processed'
RAW = 'Data/Raw'
MODELS = f'Code/ml_models/'
SURNAMES = f'{PROCESSED}/surname_info'

#### Regexes

In [4]:

alt_curly_brackets = re.compile(r'([A-Za-z]+)\s+\{[A-Za-z]+\}')
single_curly_brackets = re.compile(r'\{([A-Za-z]+)\}')
alias_pattern = re.compile(r'([A-Za-z]+) ali?a?s ([A-Za-z]+)')
or_pattern = re.compile(r'([A-Za-z]+) or ([A-Za-z]+)')
double_name = re.compile(r'([A-Za-z]+)\s+([A-Za-z]+)')
and_sons = re.compile(r'([A-Za-z]+) & sons+', re.IGNORECASE)
and_co = re.compile(r'([A-Za-z]+) & co', re.IGNORECASE)
saint_pattern = re.compile(r'Sa?i?n?t\s+([A-Za-z]+)')


#### Defining and Loading the Model

In [5]:
#%% Defining model
def encode_surname(surname, max_len=24):
    CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0'
    CHARSET_DICT = {char: i + 1 for i, char in enumerate(CHARSET)}
    PAD = 0
    surname = surname.upper()
    surname = ''.join([char for char in surname if char in CHARSET])
    metaphone = ph.metaphone(surname)

    encoded = [CHARSET_DICT[char] for char in surname]
    if len(encoded) < max_len:
        encoded += [PAD] * (max_len - len(encoded))
    encoded = torch.tensor(encoded).long()

    encoded_metaphone = [CHARSET_DICT[char] for char in metaphone]
    if len(encoded_metaphone) < max_len:
        encoded_metaphone += [PAD] * (max_len - len(encoded_metaphone))
    encoded_metaphone = torch.tensor(encoded_metaphone).long()

    return encoded, encoded_metaphone

class CrossEncoder(nn.Module):
    def __init__(self, embed_dim=128, hidden_dim=64, fc_dim=32):
        super(CrossEncoder, self).__init__()

        # Embedding layer (shared between names and metaphones)
        self.name_embedding = nn.Embedding(28, embed_dim)  # Assuming 27 letters + 1 padding
        self.metaphone_embedding = nn.Embedding(28, embed_dim)
        # BiLSTM for sequence encoding
        self.name_lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.metaphone_lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Fully connected layers for classification
        self.fc1 = nn.Linear(4*2 * hidden_dim, fc_dim)  # Combining all four encodings
        self.fc2 = nn.Linear(fc_dim, 1)

    def name_encode(self, x):
        name_embedded = self.name_embedding(x)
        _, (name_hidden, _) = self.name_lstm(name_embedded)
        name_hidden = torch.cat((name_hidden[0], name_hidden[1]), dim=1)  # Concatenate forward & backward LSTM outputs
        return name_hidden
    def metaphone_encode(self, x):
        metaphone_embedded = self.metaphone_embedding(x)
        _, (metaphone_hidden, _) = self.metaphone_lstm(metaphone_embedded)
        metaphone_hidden = torch.cat((metaphone_hidden[0], metaphone_hidden[1]), dim=1)  # Concatenate forward & backward LSTM outputs
        return metaphone_hidden

    def forward(self, name1, metaphone1, name2, metaphone2):
        # Encode each input separately
        name1_encoded = self.name_encode(name1)
        metaphone1_encoded = self.metaphone_encode(metaphone1)
        name2_encoded = self.name_encode(name2)
        metaphone2_encoded = self.metaphone_encode(metaphone2)
        # Concatenate all representations
        combined = torch.cat((name1_encoded, metaphone1_encoded, name2_encoded, metaphone2_encoded), dim=1)

        # Fully connected layers
        fc1_out = self.fc1(combined)
        fc1_relud = F.relu(fc1_out)
        output = torch.sigmoid(self.fc2(fc1_relud))  # Binary classification

        return output

# Load the model
model = CrossEncoder()
model.load_state_dict(torch.load(f'{MODELS}/name_matcher/cross_encoder_1.pth'))
model.eval()
model.to(device)


CrossEncoder(
  (name_embedding): Embedding(28, 128)
  (metaphone_embedding): Embedding(28, 128)
  (name_lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (metaphone_lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)

#### Surname Lists, Building Surname ID df

In [6]:
with open(f'{SURNAMES}/combined_surnames.json') as f:
    combined_surname_lists = json.load(f)

with open(f'{SURNAMES}/non_combined_surnames.json') as f:
    non_combined_surname_lists = json.load(f)

all_unique_surnames = set()
for surname_list in non_combined_surname_lists:
    all_unique_surnames.update(surname_list)

#1. One ID per unique surname spelling
unique_id_dict = zip(all_unique_surnames, range(1, len(all_unique_surnames)+1))
unique_id_dict = dict(unique_id_dict)

#2. One ID per surname group (not combined)
group_id_dict = {}
group_id_dict_2 = {}
for i, surname_list in enumerate(non_combined_surname_lists):
    for surname in surname_list:
        if surname in group_id_dict:
            if len(surname_list) == 1:
                continue
            other_list = [x for x in group_id_dict.keys() if group_id_dict[x] == group_id_dict[surname]]
            if len(other_list) != 1: 
                group_id_dict_2[surname] = i+1
                continue
        group_id_dict[surname] = i+1

#3. One ID per combined surname group
combined_id_dict = {}
for i, surname_list in enumerate(combined_surname_lists):
    for surname in surname_list:
        combined_id_dict[surname] = i + 1

#4. One ID per metaphone code
metaphone_id_dict = {}
surnames_metaphone = [ph.metaphone(''.join(unidecode.unidecode(char) for char in x if char.isalpha()).title()) for x in all_unique_surnames]
surnames_metaphone = list(set(surnames_metaphone))
for i, surname in enumerate(surnames_metaphone):
    metaphone_id_dict[surname] = i + 1

#5. One ID to run them all, one ID to find them; one ID to bring them all and in the darkness bind them
# This will be based on the non-combined surname lists, but will use a round of metaphone and a round of ML before assigning a new ID
master_id_dict = combined_id_dict.copy()


print(len(unique_id_dict), 'unique surnames')
print(len(group_id_dict), 'non-combined surname groups')
print(len(combined_id_dict), 'combined surname groups')
print(len(metaphone_id_dict), 'unique metaphone codes')
print(len(master_id_dict), 'master IDs')

33348 unique surnames
33348 non-combined surname groups
33407 combined surname groups
8123 unique metaphone codes
33407 master IDs


#### Pre-Compute Metaphone and Encoding for master_id_dict

In [7]:
master_id_data = {}
for surname, master_id in tqdm(master_id_dict.items()):
    metaphone = ph.metaphone(''.join(unidecode.unidecode(char) for char in surname if char.isalpha()).title())
    surname_tensor, metaphone_tensor = encode_surname(surname)
    master_id_data[surname] = {
        'surname': surname,
        'metaphone': metaphone,
        'master_id': master_id,
        'surname_tensor': surname_tensor,
        'metaphone_tensor': metaphone_tensor
    }


  0%|          | 0/33407 [00:00<?, ?it/s]

#### Functions for Cleaning Surnames, Applying IDs

In [8]:
def clean_surnames(df):
    # Replace number characters with letters and remove special characters
    replacements = {
        'NO_SURNAME': np.nan, '#NAME': np.nan,
        '1': 'l', '2': 'z', '3': 'e', '4': 'a', '5': 's', 
        '6': 'b', '7': 't', '8': 'b', '9': 'g', '0': 'o',
        'ĩ': 'i', 'ó': 'o'
    }

    # All special characters to remove
    chars_to_remove = [',', '.', '\'', '(', ')', '-', '/', '?', '­', ';', ':', 
                    ''', ''', '"', '"', '•', '!', '"', '&', '{', '}', '[', ']']

    # Apply replacements
    df['surname'] = df['surname'].replace(replacements)

    # Remove all special characters
    for char in chars_to_remove:
        df['surname'] = df['surname'].str.replace(char, '')

    # Apply regex replacements
    regex_patterns = [
        (alt_curly_brackets, r'\1'), 
        (single_curly_brackets, r'\1'),
        (alias_pattern, r'\1'),
        (or_pattern, r'\1'),
        (and_sons, r'\1'),
        (and_co, r'\1'),
        (saint_pattern, r'\1')
    ]

    for pattern, replacement in regex_patterns:
        df['surname'] = df['surname'].str.replace(pattern, replacement, regex=True)

    # Clean up whitespace and apply double_name pattern twice
    df['surname'] = df['surname'].str.strip()
    df['surname'] = df['surname'].str.replace(double_name, r'\1', regex=True)
    df['surname'] = df['surname'].str.strip()
    df['surname'] = df['surname'].str.replace(double_name, r'\1', regex=True)
    df['surname'] = df['surname'].str.strip()

    df = df[df['surname'] != '']
    df = df[df['surname'] != 'None Qualified']
    df = df[df['surname'] != 'NO_SURNAME']
    df = df[df['surname'] != 'No Entries']
    df = df.loc[df['surname'].notna()]
    df['surname'] = df['surname'].progress_apply(lambda x: ''.join(unidecode.unidecode(char) for char in x if char.isalpha()).title())

    return df


def assign_unique_id(df, unique_id_dict):
    df['unique_id'] = np.nan
    df['unique_id'] = df['surname'].map(unique_id_dict)
    for i, row in df.iterrows():
        if pd.isna(row['unique_id']):
            df.at[i, 'unique_id'] = len(unique_id_dict) + 1
            unique_id_dict[row['surname']] = df.at[i, 'unique_id']
    df['unique_id'] = df['unique_id'].astype(int)
    return df, unique_id_dict

def assign_group_id(df, group_id_dict, group_id_dict_2):
    df['group_id'] = np.nan
    for i, row in df.iterrows():
        surname = row['surname']
        if surname in group_id_dict_2:
            id_1 = group_id_dict[surname]
            id_2 = group_id_dict_2[surname]

            group_1 = [k for k, v in group_id_dict.items() if v == id_1]
            group_2 = [k for k, v in group_id_dict.items() if v == id_2]

            # Get mean metaphone edit distance from surname to each group
            surname_metaphone = ph.metaphone(surname)
            group_1_metaphone = [ph.metaphone(x) for x in group_1]
            group_2_metaphone = [ph.metaphone(x) for x in group_2]
            group_1_metaphone = list(set(group_1_metaphone))
            group_2_metaphone = list(set(group_2_metaphone))
            group_1_scores = [fuzz.ratio(surname_metaphone, x) for x in group_1_metaphone]
            group_2_scores = [fuzz.ratio(surname_metaphone, x) for x in group_2_metaphone]
            group_1_score = np.mean(group_1_scores)
            group_2_score = np.mean(group_2_scores)
            if group_2_score > group_1_score:
                df.at[i, 'group_id'] = id_2
            else:
                df.at[i, 'group_id'] = id_1
            continue
        if surname in group_id_dict:
            df.at[i, 'group_id'] = group_id_dict[surname]
        else:
            next_id = max(group_id_dict.values()) + 1
            df.at[i, 'group_id'] = next_id
            group_id_dict[surname] = next_id
    df['group_id'] = df['group_id'].astype(int)
    return df, group_id_dict

def assign_combined_id(df, combined_id_dict):
    df['combined_id'] = np.nan
    for i, row in df.iterrows():
        surname = row['surname']
        if surname in combined_id_dict:
            df.at[i, 'combined_id'] = combined_id_dict[surname]
        else:
            next_id = max(combined_id_dict.values()) + 1
            df.at[i, 'combined_id'] = next_id
            combined_id_dict[surname] = next_id
    df['combined_id'] = df['combined_id'].astype(int)
    return df, combined_id_dict

def assign_metaphone_id(df, metaphone_id_dict):
    df['metaphone_id'] = np.nan
    for i, row in df.iterrows():
        surname = row['surname']
        metaphone = ph.metaphone(surname)
        if metaphone in metaphone_id_dict:
            df.at[i, 'metaphone_id'] = metaphone_id_dict[metaphone]
        else:
            next_id = max(metaphone_id_dict.values()) + 1
            df.at[i, 'metaphone_id'] = next_id
            metaphone_id_dict[metaphone] = next_id
    df['metaphone_id'] = df['metaphone_id'].astype(int)
    return df, metaphone_id_dict

def assign_master_id(df, master_id_dict):
    df['master_id'] = np.nan
    # Insta-assigning IDs if they're in the dict
    df['master_id'] = df['surname'].map(master_id_dict)
    for i, row in tqdm(df.iterrows(), total=len(df)):
        surname = row['surname']
        if not pd.isna(row['master_id']):
            continue
        # If the surname is already in the dict, we can just assign it
        if surname in master_id_dict:
            df.at[i, 'master_id'] = master_id_dict[surname]
            continue
        # If the surname isn't already in the dict, we need to check for candidate matches
        ## First round: metaphone
        candidate_ids = []
        metaphone = ph.metaphone(surname)
        for key, data in master_id_data.items():
            fuzz_ratio = fuzz.ratio(metaphone, data['metaphone'])/100
            if len(metaphone) - (fuzz_ratio * len(metaphone)) <= 1:
                candidate_ids.append(data['master_id'])
        # If we have a single winner, we assign it
        candidate_ids = list(set(candidate_ids))
        if len(candidate_ids) == 1:
            df.at[i, 'master_id'] = candidate_ids[0]
            continue
        # If there's no similar-sounding candidates, we just assign a new ID and add its info to the dict
        if len(candidate_ids) == 0:
            next_id = max(master_id_dict.values()) + 1
            df.at[i, 'master_id'] = next_id
            master_id_dict[surname] = next_id
            # Add the surname to the master_id_data dict
            surname_tensor, metaphone_tensor = encode_surname(surname)
            master_id_data[surname] = {
                'surname': surname,
                'metaphone': metaphone,
                'master_id': next_id,
                'surname_tensor': surname_tensor,
                'metaphone_tensor': metaphone_tensor
            }
            continue
        mid_time = time.time()
        ## Second round: ML

        surname_tensor, metaphone_tensor = encode_surname(surname)
        surname_tensor = surname_tensor.to(device)
        metaphone_tensor = metaphone_tensor.to(device)
        # Assembling our candidate lists
        id_scores = []
        id_indices = {}
        index = 0
        candidate_surname_tensors = []
        candidate_metaphone_tensors = []
        print(len(candidate_ids))
        for candidate_id in candidate_ids:
            candidate_surnames = [key for key, data in master_id_data.items() if data['master_id'] == candidate_id]
            if len(candidate_ids) > 10000:
                # If there are too many candidates, we sample them to speed up the process
                if len(candidate_surnames) > 3:
                    candidate_surnames = np.random.choice(candidate_surnames, size=3, replace=False).tolist()
            
            for candidate_surname in candidate_surnames:
                candidate_surname_tensor = master_id_data[candidate_surname]['surname_tensor']
                candidate_surname_tensor = candidate_surname_tensor.to(device)
                candidate_metaphone_tensor = master_id_data[candidate_surname]['metaphone_tensor']
                candidate_metaphone_tensor = candidate_metaphone_tensor.to(device)
                candidate_surname_tensors.append(candidate_surname_tensor)
                candidate_metaphone_tensors.append(candidate_metaphone_tensor)

                index_list = [index]
                index_list.append(index)
                if candidate_id not in id_indices:
                    id_indices[candidate_id] = []
                id_indices[candidate_id].extend([index, index + 1])
                index += 1

        candidate_surname_tensors_stacked = torch.stack(candidate_surname_tensors)
        candidate_surname_tensors_stacked = candidate_surname_tensors_stacked.to(device)
        candidate_metaphone_tensors_stacked = torch.stack(candidate_metaphone_tensors)
        candidate_metaphone_tensors_stacked = candidate_metaphone_tensors_stacked.to(device)
        batch_size = len(candidate_surname_tensors_stacked)
        # Reshape the tensors to match the expected input shape
        surname_tensor_batched = surname_tensor.unsqueeze(0).expand(batch_size, -1)
        metaphone_tensor_batched = metaphone_tensor.unsqueeze(0).expand(batch_size, -1)
        surname_tensor_batched = surname_tensor_batched.to(device)
        metaphone_tensor_batched = metaphone_tensor_batched.to(device)
        # Forward pass
        with torch.no_grad():
            output = model(surname_tensor_batched, metaphone_tensor_batched, candidate_surname_tensors_stacked, candidate_metaphone_tensors_stacked)
        for k, v in id_indices.items():
            
            if len(v) == 0:
                continue
            id_scores.append(output[v[0]:v[-1]].max())
        del candidate_surname_tensors_stacked
        del candidate_metaphone_tensors_stacked
        id_scores = torch.stack(id_scores).cpu().numpy()
        if max(id_scores) < 0.99:
            next_id = max(master_id_dict.values()) + 1
            df.at[i, 'master_id'] = next_id
            master_id_dict[surname] = next_id
            # Add the surname to the master_id_data dict
            surname_tensor, metaphone_tensor = encode_surname(surname)
            master_id_data[surname] = {
                'surname': surname,
                'metaphone': metaphone,
                'master_id': next_id,
                'surname_tensor': surname_tensor,
                'metaphone_tensor': metaphone_tensor
            }
            continue
        best_id_index = np.argmax(id_scores)
        best_id = candidate_ids[best_id_index]
        df.at[i, 'master_id'] = best_id
        # Add the surname to the master_id_dict
        master_id_dict[surname] = best_id
        # Send candidate tensors back to CPU
        candidate_surname_tensors = [x.detach().cpu() for x in candidate_surname_tensors]
        candidate_metaphone_tensors = [x.detach().cpu() for x in candidate_metaphone_tensors]
        
        # Add the surname to the master_id_data dict
        surname_tensor, metaphone_tensor = encode_surname(surname)
        master_id_data[surname] = {
            'surname': surname,
            'metaphone': metaphone,
            'master_id': best_id,
            'surname_tensor': surname_tensor.cpu(),
            'metaphone_tensor': metaphone_tensor.cpu()
        }
        torch.cuda.empty_cache()
    df['master_id'] = df['master_id'].astype(int)
    return df, master_id_dict

            

#### Applying IDs to all name lists

In [9]:
master_subsidy_list = []

for doc_file in [
    f'{PROCESSED}/calendar_recipients.csv',
    f'{PROCESSED}/master_subsidy_data.csv',
    f'{PROCESSED}/tithe_landowners.csv',
    f'{RAW}/freeholders_list_1713_1780.csv',
    f'{RAW}/bank_returns_1845_1880.csv',
    f'{RAW}/bankrupts_list_1800_1820.csv',
    f'{RAW}/bankrupts_list_1820_1843.csv',
    f'{RAW}/indictable_offenses_1745_1782.csv',
    f'{RAW}/monumental_brasses.csv',
    f'{RAW}/victuallers_list_1651_1828.csv',
    f'{RAW}/workhouse_list_1861.csv',
    f'{PROCESSED}/ukda_pcc_wills.csv'
     ]:
    tdf = pd.read_csv(doc_file, encoding='utf-8')
    if 'subsidy' in doc_file:
        surname_col = 'gemini_surname'
    elif 'tithe' in doc_file:
        surname_col = 'owner_surname'
    else:
        surname_col = 'surname'
        tdf = tdf[tdf['surname'] != '[No entries]']
        tdf = tdf[tdf['surname'] != 'None Qualified']
    
    tdf = tdf.rename(columns={surname_col: 'surname'})
    

    tdf = clean_surnames(tdf)

    tdf['metaphone'] = tdf['surname'].progress_apply(ph.metaphone)
    
    tdf, unique_id_dict = assign_unique_id(tdf, unique_id_dict)
    print(f'Unique IDs assigned for {doc_file}')
    tdf, group_id_dict = assign_group_id(tdf, group_id_dict, group_id_dict_2)
    print(f'Group IDs assigned for {doc_file}')
    tdf, combined_id_dict = assign_combined_id(tdf, combined_id_dict)
    print(f'Combined IDs assigned for {doc_file}')
    tdf, metaphone_id_dict = assign_metaphone_id(tdf, metaphone_id_dict)
    print(f'Metaphone IDs assigned for {doc_file}')
    tdf, master_id_dict = assign_master_id(tdf, master_id_dict)
    print(f'Master IDs assigned for {doc_file}')
    doc_file = doc_file.replace(RAW, PROCESSED)
    doc_file = doc_file.replace('.csv', '_final.csv')
    tdf.to_csv(doc_file, index=False, encoding='utf-8')
    print('='*20)
# Save all dictionaries when done
with open(f'{SURNAMES}/unique_id_dict.json', 'w') as f:
    json.dump(unique_id_dict, f)
with open(f'{SURNAMES}/group_id_dict.json', 'w') as f:
    json.dump(group_id_dict, f)
with open(f'{SURNAMES}/combined_id_dict.json', 'w') as f:
    json.dump(combined_id_dict, f)
with open(f'{SURNAMES}/metaphone_id_dict.json', 'w') as f:
    json.dump(metaphone_id_dict, f)
with open(f'{SURNAMES}/master_id_dict.json', 'w') as f:
    json.dump(master_id_dict, f)

  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

Unique IDs assigned for Data/Processed/calendar_recipients.csv
Group IDs assigned for Data/Processed/calendar_recipients.csv
Combined IDs assigned for Data/Processed/calendar_recipients.csv
Metaphone IDs assigned for Data/Processed/calendar_recipients.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/176 [00:00<?, ?it/s]

62
128
853
2
794
1410
2529
7
28
64
33
6
97
10
483
112
35
301
2529
35
264
147
203
1716
1117
251
345
795
582
760
702
92
2463
133
845
925
Master IDs assigned for Data/Processed/calendar_recipients.csv


  0%|          | 0/88715 [00:00<?, ?it/s]

  0%|          | 0/88715 [00:00<?, ?it/s]

Unique IDs assigned for Data/Processed/master_subsidy_data.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Processed/master_subsidy_data.csv
Combined IDs assigned for Data/Processed/master_subsidy_data.csv
Metaphone IDs assigned for Data/Processed/master_subsidy_data.csv


  0%|          | 0/88715 [00:00<?, ?it/s]

384
620
196
116
47
853
818
565
39
199
666
42
135
363
662
377
230
845
749
418
88
1803
799
671
16
428
1080
110
802
1747
152
2106
96
662
1196
847
797
453
854
1068
29
2465
521
145
50
85
61
96
11
709
88
379
138
545
250
940
151
328
18
178
568
333
346
115
1469
321
397
541
304
8
213
577
330
39
227
278
11414
820
217
1402
229
159
242
539
820
876
256
666
260
224
190
112
152
392
34
68
662
7
139
704
345
521
394
819
164
85
35
241
135
27
552
55
7
397
1403
386
245
799
556
137
49
1720
670
444
464
1198
804
771
47
126
22
199
557
110
82
428
50
900
285
612
770
114
35
464
582
32
352
269
201
2117
43
194
1135
15
375
310
366
102
874
63
522
1421
1721
183
2
160
407
21
450
589
176
250
2
152
26
68
2
819
1405
4
1146
36
17
609
11419
479
36
1328
369
51
65
984
535
82
375
1041
1472
1470
447
821
68
848
535
538
34
1147
799
215
255
650
580
827
125
114
669
94
239
800
30
100
118
435
716
628
700
839
553
1711
94
856
198
5
179
230
827
199
26
18
109
800
361
558
10
183
62
255
9
901
69
495
753
59
758
9
872
552
1337
998
998
34
60


  0%|          | 0/471017 [00:00<?, ?it/s]

  0%|          | 0/471017 [00:00<?, ?it/s]

Unique IDs assigned for Data/Processed/tithe_landowners.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Processed/tithe_landowners.csv
Combined IDs assigned for Data/Processed/tithe_landowners.csv
Metaphone IDs assigned for Data/Processed/tithe_landowners.csv


  0%|          | 0/471017 [00:00<?, ?it/s]

364
50
1486
178
1202
462
21
1834
5
196
36
242
1070
12
51
548
115
12
7
194
452
557
179
476
622
311
5
1375
832
827
215
98
680
1221
128
241
102
201
482
8
65
5
59
12
19
1988
2420
153
352
327
52
70
131
138
469
51
1013
499
641
201
44
21
143
256
163
118
2
570
44
48
130
36
331
113
407
824
499
39
39
73
30
350
103
115
178
24
13
150
65
36
337
34
34
12
465
28
34
11845
382
196
43
238
72
1145
703
189
378
188
932
663
2333
186
365
369
43
1145
66
226
118
930
391
180
11
28
576
39
129
3
569
34
166
2
12
43
179
59
604
158
158
262
475
11
412
464
27
281
464
9
1989
78
8
23
21
591
1726
27
226
34
41
1069
61
188
51
226
296
78
146
1834
208
543
1032
2
940
864
824
604
51
74
623
76
294
38
14
211
19
91
3
563
512
203
1948
83
204
16
166
30
262
264
318
9
226
588
251
118
2
25
347
72
289
81
30
2
2289
552
528
1487
223
227
180
432
242
389
8
262
16
82
231
24
61
356
47
278
489
180
705
644
2828
328
92
18
180
201
1164
183
11
1033
667
942
196
36
542
61
1601
113
245
625
1726
22
369
3
95
394
1021
495
314
1843
11
240
526
35
243
229

  0%|          | 0/130462 [00:00<?, ?it/s]

  0%|          | 0/130462 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/freeholders_list_1713_1780.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/freeholders_list_1713_1780.csv
Combined IDs assigned for Data/Raw/freeholders_list_1713_1780.csv
Metaphone IDs assigned for Data/Raw/freeholders_list_1713_1780.csv


  0%|          | 0/130462 [00:00<?, ?it/s]

1235
1219
56
1177
80
514
215
119
7
2456
1248
615
292
579
264
5
201
717
96
35
204
97
46
891
754
234
4
94
39
39
44
269
960
84
117
1868
679
1838
771
771
28
422
118
316
1306
829
136
1654
971
36
352
58
44
910
341
9
229
70
82
799
1893
359
124
165
412
1391
808
71
726
381
510
284
147
33
5
37
70
79
35
97
1184
772
717
114
46
589
412
620
46
915
217
24
14
267
1654
608
352
1654
4
1615
576
15
586
997
240
379
773
1726
132
1304
1112
800
77
284
219
24
743
52
14
365
426
102
1029
911
412
35
97
1551
1184
219
72
74
622
30
389
187
40
6
14
22
467
340
27
361
24
189
579
870
1248
435
1037
156
120
588
788
442
103
209
1488
154
953
16
1078
32
996
134
174
67
309
504
6
11880
248
1870
755
202
165
264
365
254
154
147
39
35
97
1727
14
474
284
1226
909
227
4
936
634
229
158
282
67
114
1584
41
29
24
21
189
532
1840
635
245
33
158
442
139
267
723
24
467
35
297
504
867
244
452
642
682
8
499
125
722
21
197
142
754
50
75
1027
51
88
1113
284
254
119
530
12
147
14
1319
11884
86
154
329
30
35
97
189
464
89
1385
113
39
765
803
3

  0%|          | 0/7391 [00:00<?, ?it/s]

  0%|          | 0/7391 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/bank_returns_1845_1880.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/bank_returns_1845_1880.csv
Combined IDs assigned for Data/Raw/bank_returns_1845_1880.csv
Metaphone IDs assigned for Data/Raw/bank_returns_1845_1880.csv


  0%|          | 0/7391 [00:00<?, ?it/s]

14
184
70
146
915
1226
14
1853
334
1047
373
66
66
80
1625
238
537
64
1530
822
2237
1396
1396
11
48
78
343
231
11
2492
31
164
164
278
815
6
704
358
243
490
217
124
40
541
199
120
865
24
11928
652
10
307
34
128
32
233
223
527
1194
704
946
60
193
196
1194
75
637
177
14
21
509
218
688
50
236
50
50
154
482
234
528
11930
2
536
26
1052
1889
26
75
75
489
582
1610
908
910
209
1640
19
319
6
1640
122
7
54
104
92
604
16
114
537
929
156
156
262
84
155
170
72
500
207
17
3
85
923
485
447
156
202
1026
2
595
94
695
32
75
345
15
78
4
8
983
1054
106
106
2877
58
198
1281
559
2107
680
39
97
623
531
266
108
210
397
35
273
15
444
43
22
11941
878
50
509
305
24
1101
878
656
113
582
25
250
20
413
17
11
101
964
72
36
36
329
Master IDs assigned for Data/Raw/bank_returns_1845_1880.csv


  0%|          | 0/340 [00:00<?, ?it/s]

  0%|          | 0/340 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/bankrupts_list_1800_1820.csv
Group IDs assigned for Data/Raw/bankrupts_list_1800_1820.csv
Combined IDs assigned for Data/Raw/bankrupts_list_1800_1820.csv
Metaphone IDs assigned for Data/Raw/bankrupts_list_1800_1820.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/340 [00:00<?, ?it/s]

309
122
122
166
543
601
209
253
16
148
217
676
192
706
706
490
441
Master IDs assigned for Data/Raw/bankrupts_list_1800_1820.csv


  0%|          | 0/390 [00:00<?, ?it/s]

  0%|          | 0/390 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/bankrupts_list_1820_1843.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/bankrupts_list_1820_1843.csv
Combined IDs assigned for Data/Raw/bankrupts_list_1820_1843.csv
Metaphone IDs assigned for Data/Raw/bankrupts_list_1820_1843.csv


  0%|          | 0/390 [00:00<?, ?it/s]

521
565
332
160
326
15
315
55
8
226
924
219
543
13
979
328
584
487
389
807
1802
Master IDs assigned for Data/Raw/bankrupts_list_1820_1843.csv


  0%|          | 0/1550 [00:00<?, ?it/s]

  0%|          | 0/1550 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/indictable_offenses_1745_1782.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/indictable_offenses_1745_1782.csv
Combined IDs assigned for Data/Raw/indictable_offenses_1745_1782.csv
Metaphone IDs assigned for Data/Raw/indictable_offenses_1745_1782.csv


  0%|          | 0/1550 [00:00<?, ?it/s]

659
398
233
747
116
229
1397
788
885
126
78
111
1040
684
1532
1421
698
34
135
84
538
11945
676
27
41
87
617
129
171
980
661
879
14
802
153
19
37
122
1102
21
308
619
207
98
31
21
59
935
1102
260
1352
195
198
119
506
1054
29
2390
115
501
799
260
257
1627
9
274
40
543
924
209
133
20
19
117
33
33
29
580
267
254
293
65
29
224
19
730
65
89
844
551
1006
309
453
85
289
532
Master IDs assigned for Data/Raw/indictable_offenses_1745_1782.csv


  0%|          | 0/3161 [00:00<?, ?it/s]

  0%|          | 0/3161 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/monumental_brasses.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/monumental_brasses.csv
Combined IDs assigned for Data/Raw/monumental_brasses.csv
Metaphone IDs assigned for Data/Raw/monumental_brasses.csv


  0%|          | 0/3161 [00:00<?, ?it/s]

65
207
207
197
111
41
515
772
12
79
10
26
108
30
31
26
6
301
24
79
40
267
170
72
13
92
50
516
416
50
964
142
468
454
2240
1627
30
176
34
83
1227
794
979
98
11
779
538
123
1040
446
824
1161
35
518
16
21
1749
343
393
188
157
103
98
98
156
973
1048
23
132
826
3
1399
2242
24
570
3
779
1161
114
100
87
779
56
537
263
276
428
30
1751
1048
4
779
1228
1228
372
51
202
1056
38
71
1929
89
2
5
152
19
45
991
495
400
17
445
3
160
221
28
470
8
348
13
504
370
956
53
418
483
532
268
617
360
13
933
9
2018
60
272
820
48
151
1155
107
29
2
485
17
40
1010
5
399
81
11965
551
49
605
2
1014
2
315
1328
474
121
41
235
56
1320
210
808
419
278
301
38
21
80
11967
1809
178
178
49
3
96
2856
2856
20
1320
21
70
878
304
182
960
737
485
74
18
123
239
2
876
10
233
23
10
30
5
437
437
290
110
331
479
7
1679
856
83
88
11975
42
976
1121
11975
11975
1858
11975
11975
902
958
11975
11975
36
623
386
11975
3
1239
11976
1252
399
11976
11976
220
9
8
21
290
55
130
662
382
20
124
8
16
571
444
2
153
67
3
120
115
748
215
25
902
50
492
22

  0%|          | 0/5007 [00:00<?, ?it/s]

  0%|          | 0/5007 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/victuallers_list_1651_1828.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/victuallers_list_1651_1828.csv
Combined IDs assigned for Data/Raw/victuallers_list_1651_1828.csv
Metaphone IDs assigned for Data/Raw/victuallers_list_1651_1828.csv


  0%|          | 0/5007 [00:00<?, ?it/s]

89
89
1115
13
123
652
3
4
2
8
18
33
505
2339
78
587
1148
461
227
427
461
461
1052
1052
2261
36
159
461
287
427
6
82
82
146
33
245
42
3
3
4
4
735
97
33
97
33
97
120
486
33
984
428
788
89
89
105
89
99
13
102
228
228
228
228
118
360
360
13
941
1207
111
1062
1062
1062
733
1062
486
1062
842
90
877
251
67
225
107
657
751
70
59
9
1910
12067
334
334
39
266
313
194
1910
608
2276
1334
222
222
22
22
198
116
116
150
190
58
90
52
775
114
52
691
308
891
75
155
75
62
243
146
598
147
353
266
266
334
186
20
376
519
1144
1629
206
171
19
19
1663
1554
728
921
144
6
6
82
958
591
343
332
332
332
47
332
105
9
2
2
627
1686
601
439
812
206
417
1169
122
12
410
139
1169
376
1235
358
1614
1614
366
250
138
210
500
12067
160
500
500
1210
67
433
419
54
186
229
307
276
537
109
608
608
608
608
147
1435
2025
983
702
311
350
540
124
206
35
1765
946
91
40
386
77
985
985
38
38
948
105
105
246
114
114
38
1686
Master IDs assigned for Data/Raw/victuallers_list_1651_1828.csv


  0%|          | 0/404 [00:00<?, ?it/s]

  0%|          | 0/404 [00:00<?, ?it/s]

Unique IDs assigned for Data/Raw/workhouse_list_1861.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Raw/workhouse_list_1861.csv
Combined IDs assigned for Data/Raw/workhouse_list_1861.csv
Metaphone IDs assigned for Data/Raw/workhouse_list_1861.csv


  0%|          | 0/404 [00:00<?, ?it/s]

234
20
385
664
614
118
175
1224
669
204
606
282
228
623
2037
334
312
1912
159
47
19
46
265
320
Master IDs assigned for Data/Raw/workhouse_list_1861.csv


  0%|          | 0/27034 [00:00<?, ?it/s]

  0%|          | 0/27034 [00:00<?, ?it/s]

Unique IDs assigned for Data/Processed/ukda_pcc_wills.csv


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Group IDs assigned for Data/Processed/ukda_pcc_wills.csv
Combined IDs assigned for Data/Processed/ukda_pcc_wills.csv
Metaphone IDs assigned for Data/Processed/ukda_pcc_wills.csv


  0%|          | 0/27034 [00:00<?, ?it/s]

219
172
1641
53
2503
1293
246
5
362
45
291
7
418
836
2
1686
8
129
473
66
77
309
2340
366
541
51
153
815
240
1224
544
584
315
1530
378
2126
374
2126
52
75
532
736
153
8
43
382
371
892
10
6
104
112
515
122
149
1108
411
10
115
361
199
166
169
2341
45
1333
49
36
9
70
318
398
183
520
218
841
710
159
6
227
6
894
246
695
384
74
127
280
691
494
167
5
22
46
66
323
376
6
14
46
690
75
40
28
1010
1010
57
812
665
39
46
51
40
1913
487
41
105
53
199
139
984
6
440
1175
546
304
32
290
125
87
29
17
58
282
569
186
388
154
184
1437
35
777
233
436
585
23
940
845
361
1063
26
1763
1018
1175
216
87
259
1437
69
1437
373
51
436
572
1839
254
1333
445
79
372
5
1170
11
27
585
331
422
692
21
372
691
11
382
275
49
198
48
264
151
256
263
66
1010
1618
87
146
515
69
755
234
158
1116
31
16
301
246
12072
152
210
721
515
736
62
76
449
841
710
154
1013
319
808
670
127
961
176
169
736
66
623
225
1150
1531
66
1686
25
449
487
206
550
10
378
378
6
238
32
422
367
66
350
40
199
367
420
385
53
295
350
541
1170
2505
749
120
664
14

In [10]:
import json
import os
import platform

if platform.node() == 'Nick_Laptop':
    drive = 'C'
elif platform.node() == 'MSI':
    drive = 'D' 
else:
    drive = 'uhhhhhh'
    print('Uhhhhhhhhhhhhh')
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')
PROCESSED = 'Data/Processed'
SURNAMES = f'{PROCESSED}/surname_info'
with open(f'{SURNAMES}/unique_id_dict.json', 'r') as f:
    unique_id_dict = json.load(f)
with open(f'{SURNAMES}/group_id_dict.json', 'r') as f:
    group_id_dict = json.load(f)
with open(f'{SURNAMES}/combined_id_dict.json', 'r') as f:
    combined_id_dict = json.load(f)
with open(f'{SURNAMES}/metaphone_id_dict.json', 'r') as f:
    metaphone_id_dict = json.load(f)
with open(f'{SURNAMES}/master_id_dict.json', 'r') as f:
    master_id_dict = json.load(f)

# Find max id for each dict and print
print('Max unique ID:', max(unique_id_dict.values()))
print('Max group ID:', max(group_id_dict.values()))
print('Max combined ID:', max(combined_id_dict.values()))
print('Max metaphone ID:', max(metaphone_id_dict.values()))
print('Max master ID:', max(master_id_dict.values()))

Max unique ID: 56874.0
Max group ID: 39580
Max combined ID: 34883
Max metaphone ID: 10827
Max master ID: 12145
