# Scraping Chron Am for Black Victim Clusters

This notebook outlines several steps toward identifying newspaper reports about Black lynching victims. It relies on the dataset published by Seguin & Rigby. To learn more, read here: https://journals.sagepub.com/doi/pdf/10.1177/2378023119841780

See my preprocessing notebook to understand how I got the ChronAm (Chronicling America) search results. It's important to note that these search results are not exact. They return every instance where the victim's name appears in a newspaper issue from the documented year AND the name of the city where the lynching occurred is printed within 100 words of the victim's name. In other words, there may be instances where a coincidentally identical name and city appear in ChronAm but are unrelated to the lynching event.

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
from datetime import datetime, timedelta
import os

In [None]:
# Load the Seguin & Rigby data used throughout this notebook. The file name
# suggests it is the subset of Black victims -- NOTE(review): confirm against
# the preprocessing notebook. Later cells read its 'search_url' and 'victim'
# columns.
df = pd.read_csv('seguin_rigby_data_black_subset.csv')

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
df

In [None]:
# Preview the 'search_url' column; each value is later handed to
# scrape_carefully() as the page to fetch.
df['search_url']

In [None]:
# This regex picks out hrefs that point at an individual newspaper page
# (a search hit). It is used by the search-scraping loop further down.
# Shape of a page path: /lccn/<lccn>/<YYYY-MM-DD>/ed-<n>/seq-<n>/
#   * <lccn> is an optional lowercase letter prefix followed by digits, so
#     both "sn83030214"-style and purely numeric LCCNs are accepted;
#   * the edition number may have more than one digit.
# Every href matched by the earlier, stricter pattern (literal "sn" prefix,
# single-digit edition) is still matched.
page_pattern = re.compile(r'/lccn/[a-z]*\d+/\d{4}-\d{2}-\d{2}/ed-\d+/seq-\d+/')

# Shared state for scrape_carefully(): how many requests have been counted in
# the current window, and when that window started (None until the first call).
request_count = 0
first_request_time = None

In [None]:
# this function scrapes ChronAm, but keeps track of requests and chills itself to avoid hitting their rate limits
def scrape_carefully(url, retries=3, timeout=60):
    """Fetch ``url`` while pacing requests to stay under ChronAm's rate limits.

    The function reads and updates the module-level ``request_count`` and
    ``first_request_time`` globals.

    Pacing applied before the request is sent:
      * if 200 or more requests have been counted and less than one minute
        has passed since the window started, sleep 5 minutes and start a
        new window (the counter goes back to 0);
      * if the running count is a non-zero multiple of 10, sleep 10 seconds.

    Args:
        url: Address to request.
        retries: Maximum number of attempts for this URL.
        timeout: Seconds to wait on the server per attempt before treating
            the attempt as failed. Without it a stalled connection would
            block the whole scrape indefinitely.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200. ``None`` if the server answers with any status other than 200
        or 429, or if every attempt ends in a 429 or a network-level error.
    """
    global request_count, first_request_time

    # First request of a window: remember when the window opened.
    if request_count == 0:
        first_request_time = datetime.now()

    # NOTE(review): the window is only restarted when the 200-request mark is
    # hit within one minute; otherwise the counter simply keeps growing.
    if request_count >= 200:
        elapsed_time = datetime.now() - first_request_time

        if elapsed_time < timedelta(minutes=1):
            print('Crawl limit reached. Waiting for 5 minutes.')
            time.sleep(300)
            first_request_time = datetime.now()
            request_count = 0

    if request_count > 0 and request_count % 10 == 0:
        print('Burst limit reached. Waiting for 10 seconds.')
        time.sleep(10)

    for _ in range(retries):
        # Count the attempt up front so that attempts failing at the network
        # level still count toward the pacing thresholds.
        request_count += 1
        print(f'Requests made: {request_count}')

        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as error:
            # Timeouts, dropped connections, DNS failures, etc.: use up one
            # retry instead of letting the exception abort the caller's loop.
            print(f'Request failed for {url}: {error}')
            continue

        if response.status_code == 200:
            return response

        if response.status_code == 429:
            print('Received 429 error. Sorry ChronAm. Waiting one hour.')
            time.sleep(3605)  # this is as safe as can be. Consider shortening chill time.
            continue

        # Any other status is treated as permanent for this URL.
        print(f'Unexpected error for {url}: {response.status_code}')
        return None

    return None

In [None]:
# this sets the timer to NOW. And you're off to the races!
start_time = datetime.now()

# DataFrame.to_csv() does not create missing parent folders, so make sure the
# output directory exists before the first save.
os.makedirs('lynch_clusters', exist_ok=True)

# One pass per victim: fetch the search-results page, collect every link that
# points at a newspaper page, and save those links to a per-victim CSV.
for index, row in df.iterrows():
    search_url = row['search_url']
    # Underscores keep the victim name usable as a file name.
    victim_name = row['victim'].replace(' ', '_')
    scrape_results = []

    try:
        scrape_content = scrape_carefully(search_url)
    except requests.exceptions.RequestException as e:
        # A network-level failure should cost one victim, not the whole run.
        print(f'Error scraping {search_url}: {e}')
        continue

    if scrape_content is None:
        print(f'Retried 3 times but got repeated errors. Skipping search for victim {victim_name}')
        continue

    soup = BeautifulSoup(scrape_content.text, 'html.parser')

    # The hits live in <ul class="results_list">; no such list means no hits.
    results_list = soup.find('ul', class_='results_list')

    if results_list is None:
        print(f'No results found for victim {victim_name}')
        continue

    matching_links = results_list.find_all('a', href=page_pattern)

    for link in matching_links:
        link_text = link.get_text(strip=True)
        match = page_pattern.search(link['href'])
        if match:
            # Keep only the page path (dropping anything after it in the
            # href) and point at that page's OCR text view.
            matched_href = match.group()
            link_href = f'https://chroniclingamerica.loc.gov{matched_href}ocr/'
            scrape_results.append({'Link Title': link_text, 'URL': link_href})

    print(f'Search for victim {victim_name} processed.')

    # Only victims with at least one hit get a CSV.
    if scrape_results:
        df_results = pd.DataFrame(scrape_results)
        csv_filename = f'lynch_clusters/{victim_name}.csv'
        df_results.to_csv(csv_filename, index=False)
        print(f'Results for victim {victim_name} saved to {csv_filename}')

# A little added to keep track of total time elapsed
end_time = datetime.now()
total_elapsed_time = end_time - start_time
print(f'Total elapsed time: {total_elapsed_time}')

Now I have all the pages with hits for every name! Hehehehehe

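The code below does the next level of scraping. For each per-victim CSV in `lynch_clusters/`, it uses scrape_carefully() to fetch the OCR page behind every URL, joins the text of that page's paragraphs, and writes the lowercased text back to the same CSV in a new `text` column.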

In [None]:
# Start a fresh request window and timer for the OCR pass.
request_count = 0
start_time = datetime.now()

directory = 'lynch_clusters/'

for filename in os.listdir(directory):
    # Anything that is not a per-victim CSV is left alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # One entry per row of the CSV: the page text, or None when the page
    # could not be fetched or parsed.
    newspaper_content = []

    for url in df['URL']:
        p_text = None

        try:
            response = scrape_carefully(url)

            if response and response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                p_tags = soup.find_all('p')
                # Join the text of every <p> tag on the page.
                p_text = ' '.join(tag.get_text(strip=True) for tag in p_tags)

        except Exception as exc:
            print(f'Error scraping {url}: {exc}')
            p_text = None

        newspaper_content.append(p_text)

    # Store the text lowercased, then overwrite the CSV in place.
    df['text'] = newspaper_content
    df['text'] = df['text'].str.lower()

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with OCR text.')

end_time = datetime.now()
total_elapsed_time = end_time - start_time
print(f'Total elapsed time: {total_elapsed_time}')