# Building and Refining the Lynch Clusters Dataset

The following notebook is the second iteration of scraping ChronAm for instances of victim names. It also includes new steps to enrich the data and find likely reports of racial violence.

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
from datetime import datetime, timedelta
import os

### 1) Build the Broader Dataset

To build the new dataset, I'm recycling a lot of functions and loops. I'm following all the same steps but with less stringent search terms for Chron Am.

In [None]:
# Use the Seguin and Rigby dataset again: load the subset CSV from the working directory.
# The cells below read the 'year' and 'victim' columns of this frame to build search URLs.

df = pd.read_csv('seguin_rigby_data_black_subset.csv')

In [None]:
# this is an updated search url function. It builds the ChronAm search for year and following year and phrase text only.

def build_chron_am_search(row):
    """Build the Chronicling America advanced-search URL for one victim.

    The search is a phrase search on the victim's name, restricted to the
    year of the row and the following year, returning up to 1000 rows on
    page 1, sorted by date.

    Parameters
    ----------
    row : mapping
        Must provide ``row['year']`` (a number) and ``row['victim']`` (a string).

    Returns
    -------
    str
        The fully assembled search URL.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from urllib.parse import quote_plus

    base_url = "https://chroniclingamerica.loc.gov/search/pages/results/list/"
    date1 = row['year']
    date2 = row['year'] + 1
    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&', '#', or an apostrophe that would otherwise corrupt the query string.
    phrasetext = quote_plus(row['victim'])

    search_url = (f'{base_url}?date1={date1}&date2={date2}&searchType=advanced&language='
                  f'&proxdistance=5&rows=1000&ortext=&proxtext=&phrasetext={phrasetext}'
                  f'&andtext=&dateFilterType=yearRange&page=1&sort=date')

    return search_url

# Build one search URL per row (axis=1 passes each row to build_chron_am_search)
# and store it in a new 'search_url' column.
df['search_url'] = df.apply(build_chron_am_search, axis=1)

# Display the frame to eyeball the new column.
df

In [None]:
# saving the seguin and rigby data with the new search url included.

df.to_csv('seguin_rigby_data_black_subset_02.csv', index=False, encoding='utf-8')

In [None]:
# this Regex pattern will help to scrape urls that direct to search hits
# it is used in the scrape_carefully() function below
# Matches the path of a single newspaper page: /lccn/<lccn>/<YYYY-MM-DD>/ed-<n>/seq-<n>/
# The LCCN is accepted with any (or no) alphabetic prefix and the edition with any
# number of digits, so links that are not of the form "sn<digits>" / "ed-<one digit>"
# are no longer silently dropped from the results.
page_pattern = re.compile(r'/lccn/[a-z]*\d+/\d{4}-\d{2}-\d{2}/ed-\d+/seq-\d+/')

# these are presets to help keep track of time it takes to scrape and the request count
request_count = 0
first_request_time = None

In [None]:
# this function scrapes ChronAm, but keeps track of requests and chills itself to avoid hitting their rate limits
# Value of request_count at the moment the current crawl-limit window was opened.
_window_start_count = 0


def scrape_carefully(url, retries=3, timeout=30):
    """Fetch ``url`` while pacing requests to stay under ChronAm's rate limits.

    Pacing rules (all state lives in the module-level ``request_count`` and
    ``first_request_time``):

    * 200 or more requests inside a one-minute window -> sleep 5 minutes.
    * every 10th request -> sleep 10 seconds.
    * HTTP 429 -> sleep just over an hour, then retry.

    Parameters
    ----------
    url : str
        Address to fetch.
    retries : int
        Maximum number of attempts for this URL.
    timeout : float
        Seconds to wait for the server before an attempt counts as failed.

    Returns
    -------
    requests.Response or None
        The response on HTTP 200; ``None`` on any other status, or when every
        attempt failed (429s or connection errors).
    """
    global request_count, first_request_time, _window_start_count

    now = datetime.now()
    # Open a fresh one-minute window on first use, after a caller reset
    # request_count, or once the previous window has expired. Without the expiry
    # check the window start goes stale and the crawl limit can never trigger again.
    window_is_stale = (
        first_request_time is None
        or request_count == 0
        or request_count < _window_start_count
        or now - first_request_time >= timedelta(minutes=1)
    )
    if window_is_stale:
        first_request_time = now
        _window_start_count = request_count
    elif request_count - _window_start_count >= 200:
        print('Crawl limit reached. Waiting for 5 minutes.')
        time.sleep(300)
        first_request_time = datetime.now()
        request_count = 0
        _window_start_count = 0

    if request_count > 0 and request_count % 10 == 0:
        print('Burst limit reached. Waiting for 10 seconds.')
        time.sleep(10)

    for attempt in range(retries):
        try:
            # A timeout keeps one stalled connection from hanging an hours-long run.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # Connection refused / max retries exceeded / timeouts land here.
            request_count += 1
            print(f'Requests made: {request_count}')
            print(f'Request failed for {url}: {error}')
            if attempt < retries - 1:
                time.sleep(30)  # brief pause before trying the same URL again
            continue

        request_count += 1
        print(f'Requests made: {request_count}')

        if response.status_code == 200:
            return response

        elif response.status_code == 429:
            print('Received 429 error. Sorry ChronAm. Waiting one hour.')
            time.sleep(3605) # this is as safe as can be. Consider shortening chill time.

        else:
            print(f'Unexpected error for {url}: {response.status_code}')
            return None

    return None

In [None]:
# this sets the timer to NOW. And you're off to the races!
start_time = datetime.now()

# Make sure the output directory exists; DataFrame.to_csv does not create
# missing parent directories, so a fresh run would otherwise fail on the first save.
clusters_directory = 'lynch_clusters_02'
os.makedirs(clusters_directory, exist_ok=True)

for index, row in df.iterrows():
    search_url = row['search_url']
    victim_name = row['victim'].replace(' ', '_')
    scrape_results = []

    scrape_content = scrape_carefully(search_url)

    if scrape_content is None:
        print(f'Retried 3 times but got repeated errors. Skipping search for victim {victim_name}')
        continue

    soup = BeautifulSoup(scrape_content.text, 'html.parser')

    # The search hits are read from the <ul class="results_list"> element.
    results_list = soup.find('ul', class_='results_list')

    if results_list is None:
        print(f'No results found for victim {victim_name}')
        continue

    matching_links = results_list.find_all('a', href=page_pattern)

    for link in matching_links:
        link_text = link.get_text(strip=True)
        match = page_pattern.search(link['href'])
        if match:
            # Keep only the page path and point it at the page's OCR text view.
            matched_href = match.group()
            link_href = f'https://chroniclingamerica.loc.gov{matched_href}ocr/'
            scrape_results.append({'Link Title': link_text, 'URL': link_href})

    print(f'Search for victim {victim_name} processed.')

    # One CSV per victim, written only when the search produced hits.
    if scrape_results:
        df_results = pd.DataFrame(scrape_results)
        csv_filename = f'{clusters_directory}/{victim_name}.csv'
        df_results.to_csv(csv_filename, index=False)
        print(f'Results for victim {victim_name} saved to {csv_filename}')

# A little added to keep track of total time elapsed
end_time = datetime.now()
total_elapsed_time = end_time - start_time
print(f'Total elapsed time: {total_elapsed_time}')

In [None]:
# It occurs to me that some victim clusters could have more than 1,000 hits. The build_chron_am_search function doesn't account for this: it retrieves at most 1,000 results (rows=1000, page=1). So the following code checks whether any csv file reaches that cap. If one does, there are probably more hits, and I would need to iterate over the 'page=' parameter in the ['search_url'] column (currently fixed at 1). But rather than code all that, I thought I'd first check whether it's worth it.

directory = 'lynch_clusters_02'

# Report every victim file whose row count sits at (or just under) the 1000-row search cap.
for file_name in os.listdir(directory):
    df = pd.read_csv(os.path.join(directory, file_name))
    row_count = len(df)

    if row_count >= 999:
        print(f'{file_name}: {row_count} rows')
        
# turns out, two files (george_white.csv and will_rogers.csv) both have more than 1,000 hits. Perhaps cut them out of further analysis? They're probably just too common of names.

# I've deleted them manually from the lynch_clusters_02 directory.
# I also deleted ben.csv and jim.csv since they are both just one name (too general).

In [None]:
# thought I should check the number of total hits across all the victim csv files

# Add up the row counts of every victim file in the directory.
total_hits = sum(
    len(pd.read_csv(os.path.join(directory, file_name)))
    for file_name in os.listdir(directory)
)

print(f'Total hits: {total_hits}')

# it's 80,569... Oh dear, it's going to take a long time to scrape all those urls...

# PICK UP HERE – RUN THE LONG BLOCK, EXPECT MAYBE LIKE 9 HOURS RUN TIME

In [None]:
# Be warned. This block will run for hours.

request_count = 0
start_time = datetime.now()

directory = 'lynch_clusters_02/'

for filename in os.listdir(directory):
    # Skip anything that is not a victim csv (hidden files, checkpoints, ...).
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)

    df = pd.read_csv(file_path)

    # Resume support: this block runs for hours, so only fetch rows that still have
    # no text. Re-running the cell after a crash (or after connection errors) picks
    # up the missing rows instead of re-scraping everything.
    already_started = 'text' in df.columns
    if not already_started:
        df['text'] = None
    df['text'] = df['text'].astype(object)  # allow strings even if the column was all-NaN

    pending_rows = df.index[df['text'].isna()]

    if already_started and len(pending_rows) == 0:
        print(f'{filename} already has OCR text. Skipping.')
        continue

    for row_index in pending_rows:
        url = df.at[row_index, 'URL']
        try:
            response = scrape_carefully(url)

            if response and response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                p_tags = soup.find_all('p')
                p_text = ' '.join([tag.get_text(strip=True) for tag in p_tags])

                # do not forget you lowercased the text in the scraping process
                df.at[row_index, 'text'] = p_text.lower()

            # on failure the cell stays empty so a later run retries this URL

        except Exception as e:
            # best effort: log the failure and move on to the next URL
            print(f'Error scraping {url}: {e}')

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with OCR text.')

end_time = datetime.now()
total_elapsed_time = end_time - start_time
print(f'Total elapsed time: {total_elapsed_time}')

### GETTING SOME ERROR 61 

Note to self: copy the cell output, collect every URL that failed with "Error 61: max retries exceeded", and retry scraping those URLs separately.

### 2) Refine the broader dataset

I've reviewed a number of searches in Chron Am in the new dataset. For some reason, they don't always actually contain their respective victim names. Chron Am search is mysterious... But to account for this, I will iterate over the csv files, checking for the victim names in the 'text' column. If they do not appear, I will remove the file. Actually, nix this. I am going to remove them at a later step.

In [None]:
# But I check for victim names:

for file_name in os.listdir(directory):
    # Only victim csv files can be read and checked.
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory, file_name)

    df = pd.read_csv(file_path)

    df['text'] = df['text'].astype(str)

    # The victim name is recovered from the file name (underscores back to spaces).
    victim_name = file_name.replace('.csv', '').replace('_', ' ')

    # regex=False: the name is a literal phrase, so characters such as '.' or '('
    # must not be interpreted as regular-expression syntax.
    if df['text'].str.contains(victim_name, na=False, regex=False).any():
        print(f'{file_name} good')

    else:
        # Nothing is deleted in this cell; the file is only reported.
        print(f'{victim_name} not mentioned in {file_name}. Flagging {file_name} for removal at a later step.')

In [None]:
# How many csv files remain?
# os.listdir returns every entry in the directory, so this count assumes the
# directory holds nothing but victim csv files.
all_victims = os.listdir(directory)
len(all_victims)

# it's 388 victims and corresponding csv files

### 3) Add the 'Clippings' column again

I've made some adjustments here, too. I optimized the fix_names function. I also expanded the size of the clippings to 150 words before and after the victim's name (before it was 100 words). 

In [None]:
# this is a refactoring of this function. I put the loop outside the regex object and simplified the regex pattern so it detects all instances of victim names where there are any number of non-word characters between the first and last name.

def fix_names(text, victim_name):
    """Normalize every occurrence of the victim's name inside ``text``.

    An occurrence is the name's parts in order, separated by any number of
    non-word characters (including none). Each occurrence is rewritten as the
    parts joined by single spaces, with one space added before and after, so a
    later literal search for the name finds it.

    The whole name is matched in one pass. Substituting pair by pair left a
    double space inside names of three or more parts ("john  henry smith"),
    which then never matched a literal search for "john henry smith".

    Parameters
    ----------
    text : str
        Text to normalize.
    victim_name : str
        Name whose whitespace-separated parts are searched for.

    Returns
    -------
    str
        The normalized text; unchanged when the name has fewer than two parts.
    """
    name_parts = victim_name.split()

    if len(name_parts) < 2:
        return text

    pattern = re.compile(r'\W*'.join(re.escape(part) for part in name_parts))
    normalized_name = ' ' + ' '.join(name_parts) + ' '

    # A function replacement keeps backslashes in the name from being read as escapes.
    return pattern.sub(lambda match: normalized_name, text)

In [None]:
# okay, I think I fixed it. I needed to add spaces before and after hits from the fix_names() function then use a tokenizer (in this case, nltk's tokenizer) to accurately identify victim names for clippings.

from nltk.tokenize import word_tokenize

def newspaper_clippings(text, victim_name, word_radius=150):
    """Cut a window of tokens around each mention of ``victim_name`` in ``text``.

    The text is first passed through fix_names, then tokenized. For every
    literal match of the name, the clipping spans ``word_radius`` tokens before
    the match start and ``word_radius`` tokens after the match end, clamped to
    the bounds of the token list.

    Returns the clippings joined by ' | ', or None when the name never matches.
    """
    normalized = fix_names(text, victim_name)
    tokens = word_tokenize(normalized)
    name_regex = re.compile(re.escape(victim_name))

    def clip(hit):
        # Token positions are found by tokenizing the text up to the match boundaries.
        first_token = len(word_tokenize(normalized[:hit.start()]))
        last_token = len(word_tokenize(normalized[:hit.end()]))
        lower = max(first_token - word_radius, 0)
        upper = min(last_token + word_radius, len(tokens))
        return ' '.join(tokens[lower:upper])

    snippets = [clip(hit) for hit in name_regex.finditer(normalized)]

    if not snippets:
        return None

    # in case of multiple matches, split them with pipe symbol: |
    return ' | '.join(snippets)

In [None]:
# add the clippings

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # The victim name is recovered from the file name.
    victim_name = filename.replace('.csv', '').replace('_', ' ')

    # Rows without text get no clipping; every other row is clipped around the name.
    df['clippings'] = [
        None if pd.isna(text) else newspaper_clippings(text, victim_name)
        for text in df['text']
    ]

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with clippings.')

### 4) Count lynching signifier words in 'Clippings'

In [None]:
# redundant, but necessary and I should have thought to do this in previous steps, but whatevs. 
# Save victim names in a new column:

for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    # Every row of a file belongs to the victim named by the file itself.
    victim_name = file_name.replace('.csv', '').replace('_', ' ')

    df = pd.read_csv(file_path).assign(victim=victim_name)
    df.to_csv(file_path, index=False)

In [None]:
# extract city from seguin_rigby_data_black_subset_02.csv

seguin_rigby_df = pd.read_csv('seguin_rigby_data_black_subset_02.csv')

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    df['city'] = None

    # Walk the reference data in order; when a victim name occurs more than once,
    # the last matching row's city is the one that sticks.
    for victim, city in zip(seguin_rigby_df['victim'], seguin_rigby_df['city']):
        df.loc[df['victim'] == victim, 'city'] = city

    df['city'] = df['city'].str.lower()

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with city.')

In [None]:
# extract state from seguin_rigby data

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    df['state'] = None

    # Same lookup as for the city: the last matching reference row wins.
    for victim, state in zip(seguin_rigby_df['victim'], seguin_rigby_df['state']):
        df.loc[df['victim'] == victim, 'state'] = state

    df['state'] = df['state'].str.lower()

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with state.')

In [None]:
# A little step to identify if city name appears in the clippings. Either yes or no results are saved in a new column called 'city_mentioned'
def _city_mentioned(row):
    """Return 'yes' when the row's city appears in its clipping, otherwise 'no'.

    Missing values are read back from csv as NaN (a float). A row with no city
    or no clipping is reported as 'no' instead of raising a TypeError on ``in``.
    """
    city, clipping = row['city'], row['clippings']
    if isinstance(city, str) and isinstance(clipping, str) and city in clipping:
        return 'yes'
    return 'no'


for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    df = pd.read_csv(file_path)

    df['city_mentioned'] = df.apply(_city_mentioned, axis=1)

    df.to_csv(file_path, index=False)

    print(f'city names reviewed in {file_name}')

In [None]:
# counting signal word instances in the 'clippings' column and saving the counts in a new column
# this will give me a sense of how likely the clippings around victim names include text about violence

# List of signal words. What else should I add to the list?
signal_words = ['lynch', 'mob', 'murder', 'posse', 'shot', 'hang', 'negro']


def _count_signal_words(text):
    """Total number of signal-word occurrences in ``text``.

    Counting is by substring, so 'lynch' also counts 'lynched' and 'lynching'.
    Rows without a clipping are read back from csv as NaN; they score 0 instead
    of raising an AttributeError on ``.count``.
    """
    if not isinstance(text, str):
        return 0
    return sum(text.count(word) for word in signal_words)


for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    df = pd.read_csv(file_path)

    df['signal_word_count'] = df['clippings'].apply(_count_signal_words)

    df.to_csv(file_path, index=False)

    print(f'counted signal words in {file_name}')

In [None]:
# just to review individual csv files.

# Load one victim file and display it to inspect the columns added so far.
test_df = pd.read_csv('lynch_clusters_02/zachariah_walker.csv')
test_df

### 5) Narrow the dataset

In [None]:
# I removed rows where there are NaN values in the 'text' columns
for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    # Keep only the rows that have text, then overwrite the file.
    df = pd.read_csv(file_path).dropna(subset=['text'])
    df.to_csv(file_path, index=False)

In [None]:
# to count the number of rows in all the lynch_clusters

# Sum the number of rows over every file in the clusters directory.
total_hits = sum(
    pd.read_csv(os.path.join(directory, file_name)).shape[0]
    for file_name in os.listdir(directory)
)

print(f'Total hits: {total_hits}')

# it's 57,434

In [None]:
# here I narrow the data to subsets that are more likely to be lynching reports
# I'm going to save these results in a new directory called lynch_clusters_02_refined
# the plan is to test city mentioned and signal word rate combos to deduce what thresholds are valid in considering things nearly certain to be considered racial violence reports
refined_directory = 'lynch_clusters_02_refined'

# DataFrame.to_csv does not create missing directories, so create the target first.
os.makedirs(refined_directory, exist_ok=True)

for file_name in os.listdir(directory):
    # Only victim csv files can be read and refined.
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory, file_name)

    df = pd.read_csv(file_path)

    # This line defines the threshold. In this case, it's city_mentioned 'yes' and/or 'signal_word_counts' as greater than or equal to 3
    # in other words, if a row has city_mentioned in the clipping and/or the clipping has 3 or more signal words, it's saved in the new directory
    refined_df = df[(df['city_mentioned'] == 'yes') | (df['signal_word_count'] >= 3)]

    refined_file_path = os.path.join(refined_directory, file_name)
    refined_df.to_csv(refined_file_path, index=False)

    print(f'Saved refined file: {refined_file_path}')

In [None]:
# How many hits do we have in the refined version now? Let's check with this recycled code:
# Sum the number of rows over every file in the refined directory.
total_hits = sum(
    pd.read_csv(os.path.join(refined_directory, file_name)).shape[0]
    for file_name in os.listdir(refined_directory)
)

print(f'Total hits: {total_hits}')

# it's 10178 hits (city name mentioned in clipping and/or 3 or more signal words in clipping)