# Processing the Oxford Dictionary of British Surnames

#### Imports

In [1]:
import os
import re
import ast
import json
import nltk
import shutil
import pymupdf
import platform
import numpy as np
import pandas as pd
import phonetics as ph
from tqdm.notebook import tqdm
from pypdf import PdfReader
import time

import spacy

if platform.node() == 'Nick_Laptop':
    drive = 'C'
elif platform.node() == 'MSI':
    drive = 'D'
else:
    drive = 'uhhhhhh'
    print('Uhhhhhhhhhhhhh')
os.chdir(f'{drive}:/PhD/DissolutionProgramming/LND---Land-Paper')

PROCESSED = 'Data/Processed'
RAW = 'Data/Raw'
#%%
nlp = spacy.load('en_core_web_trf')
dict_doc = pymupdf.open(f'{RAW}/family_name_dict/family_name_dict.pdf')





#### Regexes

In [2]:
# Pages to Skip
vol_title = re.compile(r'^The Oxford Dictionary')
copy_page = re.compile(r'^[0-9]\n')
ed_page = re.compile(r'^Editors and contributors\n')
toc_page = re.compile(r'^Contents\n')

letter_header = re.compile(r'^[A-Z]\n$')


# Fixing missed spaces
missed_space = re.compile(r'([A-Za-z]+)([A-Z]+)')
missed_space2 = re.compile(r'([0-9]+)([A-Za-z]+)')
missed_space3 = re.compile(r'(:)([A-Za-z]+)')

# How to grab name blocks
start_name_block = re.compile(r'^\n?([A-Z][\'-. ]?[A-Za-z\'-. ]+)\n\.\.\.')
mid_name_block = re.compile(r'\n?([A-Z][\'-. ]?[A-Za-z\'-. ]+)\n\.\.\.')

# Allowed linguistic origins

permitted_origins = ['English:',
                     'French:',
                     'Welsh:',
                     'Cornish:',
                     'Norman:']

# Variants for list
variants = re.compile(r'Variants:\s?((?:[A-Za-z\']+,?\s?)+)')
see_other = [x + ' see' for x in permitted_origins]
variant_of = [re.compile(rf'{x}[a-z\s,()]+variant\sof\s[A-Z][\'a-z]+[\'\-A-Za-z\s]\.') for x in permitted_origins]

# Early Bearers to add to name list as well
early_bearers = re.compile(r'\nEarly bearers:')
references = re.compile(r'\nReferences:')

occupational_surname_pattern = re.compile(
    r"[Oo]ccupational name [A-Za-z(),\s]+('[A-Za-z\s]+')"
)
occupational_surname = re.compile(r'[Oo]ccupational name')
locative_surname_pattern = re.compile(
    r'[Ll]ocative name from\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)'
    r'(?: in\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)*))?'
    r' \((\w+)\)'
)
devon_pattern = re.compile(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)*) \(Devon\)')
locative_surname = re.compile(r'[Ll]ocative name')
topo_surname = re.compile(r'(toponym|topograph)')
nickname = re.compile(r'[Nn]ickname from')
relationship_name = re.compile(r'[Rr]elationship name')


#### Functions

In [3]:
def fix_unicode_numbers(text: str) -> str:
    return ''.join(
        chr(ord(c) - 0xF700) if '\uf730' <= c <= '\uf739' else c
        for c in text
    )


def fix_ligatures(text):
    ligature_map = {
        "ﬁ": "fi",
        "ﬂ": "fl",
        "ﬃ": "ffi",
        "ﬄ": "ffl",
        "ﬅ": "ft",
        "ﬆ": "st"
    }
    for ligature, replacement in ligature_map.items():
        text = text.replace(ligature, replacement)
    return text

#### Assembling the Text

In [4]:
text_list = []
for page_num in tqdm(range(122, 3138)):
    
    test_page = dict_doc[page_num]
    text_blocks = test_page.get_text('blocks')
    if len(text_blocks) == 0:
        continue
    text_blocks.sort(key=lambda x: x[-2])
    text_blocks = [x[4] for x in text_blocks]
    text_blocks = [fix_unicode_numbers(x) for x in text_blocks]
    text_blocks = [fix_ligatures(x) for x in text_blocks]
    text_blocks = [x.replace('’', "'") for x in text_blocks]

    if re.search(vol_title, text_blocks[0]) or re.search(copy_page, text_blocks[0]) or re.search(ed_page, text_blocks[0]) or re.search(toc_page, text_blocks[0]):
        continue
    if re.search(letter_header, text_blocks[0]):
        text_blocks = text_blocks[1:-1]
    else:
        text_blocks = text_blocks[:-2]
    text_list.extend(text_blocks)
    new_list = []

for text in text_list:

    if new_list and not re.search(start_name_block, text):
        new_list[-1] += '\n' + text  # Append to the last element
    else:
        new_list.append(text)

text_list = new_list

#%%
num_shit = 0
new_list = []
for text in text_list:
    if len(re.findall(mid_name_block, text)) > 1:
        num_shit += len(re.findall(mid_name_block, text))
        text = re.sub(mid_name_block, r'\n\n\n\1', text)
        new_text_blocks = text.split('\n\n\n')
        for new_block in new_text_blocks[1:]:
            print(new_block)
            print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        print('0000000000000000000000000000000000000000000000000000000000000000')
        new_list += new_text_blocks[1:]
        continue
    new_list += [text]
print(num_shit)

  0%|          | 0/3016 [00:00<?, ?it/s]

0


In [5]:

new_list = []
for text in text_list:
    if any(x in text for x in see_other):
        continue
    if any(re.search(x, text) for x in variant_of):
        continue
    new_list.append(text)

text_list = new_list
name_info_df = pd.DataFrame()
basic_name_lists = []
for text in tqdm(text_list):
    text = text.replace('\n\n', '\n')
    text = re.sub(missed_space, r'\1 \2', text)
    text = re.sub(missed_space2, r'\1 \2', text)
    text = re.sub(missed_space3, r'\1 \2', text)
    if not any(x in text for x in permitted_origins):
        continue


    name = re.search(start_name_block, text).group(1)

    if re.search(variants, text):
        variants_text = re.search(variants, text).group(1)
        variants_list = variants_text.split(',')
        variants_list = [x.strip() for x in variants_list]
    else:
        variants_list = []
    new_name_list = [name] + variants_list
    basic_name_lists.append(new_name_list)

    origins = re.split(r'\n\d ', text)
    origins = origins[1:]
    occup = 0
    occupation = ''
    loc = 0
    topo = 0
    nick = 0
    rel = 0
    location = ''
    location_in = ''
    county = ''
    for section_text in origins:


        # Let's grab the "early bearers" section
        if re.search(early_bearers, section_text):
            early_bearers_section = section_text.split('\nEarly bearers:')[1]
            early_bearers_section = re.split(r'\nReferences:', early_bearers_section)[0]
            early_bearers_section = re.sub(r'\n', ' ', early_bearers_section)
            early_bearers_section = re.sub(r'\s+', ' ', early_bearers_section)
            early_bearers_section = early_bearers_section.strip()
            doc = nlp(early_bearers_section)
            early_bearer_list = []
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    ent_name = ent.text
                    ent_name_list = ent_name.split(' ')
                    if len(ent_name_list) == 2:
                        ent_surname = ent_name_list[1]
                    elif len(ent_name_list) == 3:
                        if ent_name_list[1].lower() in ['de', 'des', 'le', 'dil', 'del', 'at', 'of', 'atte', 'ate', 'a', '(le)', 'ad', 'la',]:
                            ent_surname = ent_name_list[2]
                        else:
                            continue
                    else:
                        continue
                    if ent_surname not in early_bearer_list:
                        early_bearer_list.append(ent_surname)
        new_name_set = set(new_name_list) | set(early_bearer_list)
        new_name_list = list(new_name_set)               
        
        # Extracting the description of the name
        section_text = re.split(r'\nEarly Bearers:', section_text)[0]
        section_text = re.split(r'\nReferences:', section_text)[0]
        if re.search(occupational_surname, section_text):
            occup = 1
            if re.search(occupational_surname_pattern, section_text):
                occupation = re.search(occupational_surname, section_text).group(1)
        if re.search(locative_surname, section_text):
            loc = 1

            if re.search(locative_surname_pattern, section_text):
                location = re.search(locative_surname_pattern, section_text).group(1)
                if re.search(locative_surname_pattern, section_text).group(2):
                    location_in = re.search(locative_surname_pattern, section_text).group(2)
                county = re.search(locative_surname_pattern, section_text).group(3)
        if re.search(topo_surname, section_text):
            topo = 1
        if re.search(nickname, section_text):
            nick = 1
        if re.search(relationship_name, section_text):
            rel = 1
    for name in new_name_list:
        new_row_dict = {'surname': name,
                        'occup': occup,
                        'occupation': occupation,
                        'loc': loc,
                        'location': location,
                        'location_in': location_in,
                        'county': county,
                        'topo': topo,
                        'nick': nick,
                        'rel': rel}
        for origin in permitted_origins:
            if origin in text:
                new_row_dict[origin] = 1
            else:
                new_row_dict[origin] = 0
        new_row = pd.DataFrame(new_row_dict, index=[0])
        name_info_df = pd.concat([name_info_df, new_row], ignore_index=True)


name_info_df = name_info_df.sort_values(by='surname')
name_info_df.to_csv(f'{PROCESSED}/surname_info.csv', index=False)



  0%|          | 0/27561 [00:00<?, ?it/s]

#### Combining the Name Lists

In [6]:
basic_name_sets = [set(x) for x in basic_name_lists]
non_combined_name_lists = [x.copy() for x in basic_name_lists]
# Combine all sets with any elements in common
fixedPoint = False
iteration = 0
while not fixedPoint:
    fixedPoint = True
    print('Iteration: ' + str(iteration))
    iteration += 1
    for i, name_set in enumerate(basic_name_sets):
        for name_set2 in basic_name_sets[i+1:]:
            if name_set & name_set2:
                basic_name_sets.remove(name_set)
                basic_name_sets.remove(name_set2)
                basic_name_sets.append(name_set | name_set2)
                fixedPoint = False
                break

combined_name_lists = [list(x) for x in basic_name_sets]
combined_name_lists.sort()
non_combined_name_lists.sort()
#%%
max = 0
for name_list in combined_name_lists:
    list_length = len(name_list)
    for name in name_list:
        if len(name) > max:
            max = len(name)
            longest = name
    if len(name_list) > 20:
        print(name_list)
print(f'Max Name Length: {max}, Name: {longest}')
print(f'Max list Length: {list_length}')

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
['Agard', 'Eggers', 'Eager', 'Augar', 'Algie', 'Eagger', 'Elgar', 'Ashard', 'Alger', 'Hagger', 'Eagers', 'Allgar', 'Adgar', 'Haggar', 'Hagar', 'Haggard', 'Agar', 'Eagar', 'Hager', 'Auker', 'Agars', 'Augur', 'Elger', 'Egarr', 'Ager', 'Hatchard', 'Egar', 'Hagard', 'Haggart', 'Agger', 'Edgar', 'Egger', 'Eggar', 'Auger', 'Adger', 'Agers', 'Algar', 'Eger', 'Edger', 'Achard']
['Aldis', 'Oliff', 'Hollen', 'Hollies', 'Holey', 'Ollive', 'Awdas', 'Holly', 'Aldhouse', 'Holliss', 'Alldis', 'Holley', 'Audus', 'Holles', 'Hollings', 'Aldous', 'Hollis', 'Holling', 'Olive', 'Audiss', 'Ollis', 'Hollan', 'Aldus', 'Hollins', 'Hollens', 'Audas', 'Hollin', 'Olliff', 'Olliffe', 'Aldhous', 'Oldis', 'Aldiss']
['Allatt', 'Hayllar', 'Elwood', 'Hellyer', 'Adlard', 'Ellard', 'Hiller', 'Hullah', 'Hallward', 'Hilyard', 'Allett', 'Hilleard', 'Helliar', 'Alwood', 'Ellwood', 'Hayler', 'Eliott', 'Allwood', 'Ellett', 'Hilliar', 'Heller', 'Aylar

#### Adding in the surnames from the subsidy indexes

In [7]:
surname_index_df = pd.read_csv(f'{PROCESSED}/surname_index.csv')
index_surnames = surname_index_df['names'].tolist()
index_surnames = [ast.literal_eval(x) for x in index_surnames]
found = False
for index_surname_list in tqdm(index_surnames):
    for index_surname in index_surname_list:
        for name_list in combined_name_lists:
            if index_surname in name_list:
                name_list.extend(index_surname_list)
                name_list = list(set(name_list))
                found = True
    if not found:
        print('Not Found')
        print(index_surname_list)


  0%|          | 0/2654 [00:00<?, ?it/s]

#### Saving

In [8]:
with open(f'{PROCESSED}/combined_surnames.json', 'w', encoding='utf-8') as f:
    json.dump(combined_name_lists, f, indent=4)

with open(f'{PROCESSED}/non_combined_surnames.json', 'w', encoding='utf-8') as f:
    json.dump(non_combined_name_lists, f, indent=4)