# Post-Processing Steps for Lynching Event Reprint Clusters

This notebook presents code used in post-processing for the lynching events. More steps are likely to be added as more post-processing needs are identified.

As it stands, this code fixes OCR errors in which a victim's name is missing the spaces between the parts of the full name. It also extracts newspaper clippings: the 100 words appearing before and after each occurrence of the victim's name.

In [None]:
import re
import pandas as pd
import os

In [None]:
def fix_names(text, victim_name):
    """Re-insert spaces that OCR dropped between the parts of a victim's name.

    The name is split on whitespace and every adjacent pair of parts is
    searched for in ``text`` in fused form (e.g. "JohnSmith"); each hit is
    rewritten with a single space between the two parts. Matching is exact
    and case-sensitive.

    Args:
        text: The text to repair.
        victim_name: The victim's full name, parts separated by whitespace.

    Returns:
        The text with fused name parts separated by a space.
    """
    name_parts = victim_name.split()

    # Walk neighbouring parts left to right so a fully fused name such as
    # "JohnHenrySmith" is repaired one gap at a time.
    for left_part, right_part in zip(name_parts, name_parts[1:]):
        fused = re.compile(r'(' + re.escape(left_part) + r')(' + re.escape(right_part) + r')')
        text = fused.sub(r'\1 \2', text)

    return text

In [None]:
def newspaper_clippings(text, victim_name, word_radius=100):
    """Extract the words surrounding every mention of a victim's name.

    The text is first passed through fix_names() so that names whose parts
    were fused by OCR are matched as well. For each exact, case-sensitive
    occurrence of ``victim_name`` a clipping is built from up to
    ``word_radius`` whitespace-delimited words before the name, the words
    that contain the name, and up to ``word_radius`` words after it.

    Args:
        text: The article text to search.
        victim_name: The victim's full name.
        word_radius: Number of words to keep on each side of the name.

    Returns:
        The clippings joined with ' | ' (one per occurrence), or None when
        the name does not occur or ``victim_name`` is empty/blank.
    """
    from bisect import bisect_right

    # An empty or blank name would compile to a pattern that matches at
    # (nearly) every character position and produce meaningless clippings.
    if not victim_name.strip():
        return None

    # fix_names() function first
    text = fix_names(text, victim_name)

    pattern = re.compile(re.escape(victim_name))

    # Tokenise once on whitespace and remember where each word starts, so
    # character offsets of a match map onto indices of the very same word
    # list that is sliced below. (Counting r'\w+' runs instead would drift
    # on punctuation such as apostrophes and hyphens.)
    tokens = list(re.finditer(r'\S+', text))
    words = [token.group() for token in tokens]
    word_starts = [token.start() for token in tokens]

    all_clippings = []

    for match in pattern.finditer(text):
        # Index of the word containing the first / last character of the match.
        first_word_index = max(bisect_right(word_starts, match.start()) - 1, 0)
        last_word_index = max(bisect_right(word_starts, match.end() - 1) - 1, 0)

        clipping_start_index = max(first_word_index - word_radius, 0)
        clipping_end_index = min(last_word_index + 1 + word_radius, len(words))

        clipping = ' '.join(words[clipping_start_index:clipping_end_index])
        all_clippings.append(clipping)

    # in case of multiple matches, split them with pipe symbol: |
    return ' | '.join(all_clippings) if all_clippings else None

In [None]:
# Folder holding one CSV per victim; each file name is the victim's name
# with underscores in place of spaces.
directory = 'lynch_clusters'

for filename in os.listdir(directory):
    # Only CSV files are processed; anything else in the folder is ignored.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Recover the victim's name from the file name.
    victim_name = filename.replace('.csv', '').replace('_', ' ')

    # One clipping entry per row; rows with a missing text get None.
    clippings = [
        None if pd.isna(text) else newspaper_clippings(text, victim_name)
        for text in df['text']
    ]

    df['clippings'] = clippings

    # The CSV is overwritten in place with the new 'clippings' column.
    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with clippings.')