# Post-Processing Steps for Lynching Event Reprint Clusters

This notebook presents the code used to post-process the lynching event reprint clusters. More steps are likely to be added as further post-processing needs are identified.

As it stands, this code fixes OCR errors in which the parts of a victim's full name are run together without spaces. It also extracts newspaper clippings—the 100 words appearing before and after the victim's name.

In [5]:
import re
import pandas as pd
import os

In [6]:
def fix_names(text, victim_name):
    """Re-insert spaces between name parts that were fused together.

    The victim's name is split on whitespace, and every adjacent pair of
    parts that appears back-to-back in ``text`` with nothing in between
    (e.g. ``JohnSmith``) is rewritten with a single space (``John Smith``).
    Matching is literal and case-sensitive.

    Args:
        text: The text to repair.
        victim_name: The victim's full name, parts separated by whitespace.

    Returns:
        ``text`` with each fused pair of adjacent name parts separated by
        a space. A single-part name leaves ``text`` unchanged.
    """
    name_parts = victim_name.split()

    # Build one pattern per adjacent pair up front, then apply them in order.
    fused_pairs = []
    for first, second in zip(name_parts, name_parts[1:]):
        fused = r'(' + re.escape(first) + r')(' + re.escape(second) + r')'
        fused_pairs.append(re.compile(fused))

    repaired = text
    for fused_pair in fused_pairs:
        repaired = fused_pair.sub(r'\1 \2', repaired)

    return repaired

In [7]:
def newspaper_clippings(text, victim_name, word_radius=100):
    """Extract the words surrounding each occurrence of a victim's name.

    ``text`` is first passed through ``fix_names()`` so that fused name
    parts are separated. Every literal, case-sensitive occurrence of
    ``victim_name`` is then located, and a window of whitespace-delimited
    words around it is collected.

    Args:
        text: The text to search.
        victim_name: The victim's full name, searched for verbatim.
        word_radius: Maximum number of words to keep on each side of the
            name. The window is clamped at the start and end of the text.

    Returns:
        The clippings joined with ``' | '`` (one per occurrence), or
        ``None`` when the name does not occur in the text.
    """
    # Repair fused name parts first so the exact-name search below can match.
    text = fix_names(text, victim_name)

    pattern = re.compile(re.escape(victim_name))
    words = text.split()

    all_clippings = []
    for match in pattern.finditer(text):
        start, end = match.span()

        # Convert character offsets into indices of `words` using the SAME
        # whitespace tokenisation that produced `words`. Counting r'\w+'
        # runs instead would split tokens such as "don't" or "well-known"
        # in two, so the indices would drift and the window could miss
        # the name altogether.
        first_word = len(text[:start].split())
        if (start > 0
                and not text[start - 1].isspace()
                and not text[start:start + 1].isspace()):
            # The name begins mid-token (e.g. right after a quote mark);
            # the partial token counted above is the one holding the name.
            first_word -= 1
        # Number of tokens that begin before the match ends, i.e. the
        # exclusive index of the last token touched by the name.
        last_word = len(text[:end].split())

        window_start = max(first_word - word_radius, 0)
        window_end = min(last_word + word_radius, len(words))

        all_clippings.append(' '.join(words[window_start:window_end]))

    # in case of multiple matches, split them with pipe symbol: |
    return ' | '.join(all_clippings) if all_clippings else None

In [8]:
# Add a 'clippings' column to every cluster CSV in `directory`, rewriting
# each file in place. The victim's name is taken from the file name.
directory = 'lynch_clusters'

for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Strip only the trailing extension (a plain replace would also remove
    # '.csv' from the middle of a name), turn underscores into spaces, and
    # collapse/trim whitespace. Without the trim, a stem such as
    # 'curtis_young_' becomes 'curtis young ' and the literal search would
    # only match when the name is followed by a space.
    stem = os.path.splitext(filename)[0]
    victim_name = ' '.join(stem.replace('_', ' ').split())

    # Rows with missing text get no clipping.
    df['clippings'] = [
        None if pd.isna(text) else newspaper_clippings(text, victim_name)
        for text in df['text']
    ]

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with clippings.')

Updated fred_wilson.csv with clippings.
Updated john_maynard.csv with clippings.
Updated son_of_laura_nelson.csv with clippings.
Updated curtis_young_.csv with clippings.
Updated si_king.csv with clippings.
Updated ben_little.csv with clippings.
Updated alabama_red.csv with clippings.
Updated james_johnson.csv with clippings.
Updated tom_williams.csv with clippings.
Updated will_davis.csv with clippings.
Updated bill_armor.csv with clippings.
Updated anderson_calloway.csv with clippings.
Updated jim_whitehead.csv with clippings.
Updated john_bigus.csv with clippings.
Updated robert_murtore.csv with clippings.
Updated alvy_jackson.csv with clippings.
Updated tom_peddy.csv with clippings.
Updated howard_cooper.csv with clippings.
Updated edgar_jones.csv with clippings.
Updated gaines_gordon.csv with clippings.
Updated sim_garrett.csv with clippings.
Updated alexander_white.csv with clippings.
Updated claude_chandler.csv with clippings.
Updated jack_walker.csv with clippings.
Updated fran