# Pulling Search Results from Chronicling America

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
from tqdm import tqdm
import os
import shutil

## 1) Overview

The following steps demonstrate how I've identified digitized newspaper pages with _potential_ lynching reports in Chronicling America.

These steps rely on Chronicling America's advanced search. The process involves:

1) conducting searches for victim names within the year of their murder and the year following it
2) from the search results, pulling the URLs of OCR pages with potential hits
3) saving those URLs in new csv files (one csv file per victim name)

The results are 3,994 csv files, one per victim. This means there are 3,994 potential lynching victims represented in the data at this point in the process. Across all those csv files, there are 453,050 URLs to digitized pages with potential references to lynchings.

## 2) Build Chronicling America Search URL

In [None]:
# loading the lynching inventory (one row per victim)
# the cells below read these columns from it: 'year', 'victim_name', 'victim_race', 'victim_gender', 'lynch_date'
df = pd.read_csv('subset_cleaned_combined_lynch_inventories.csv')

# a function that constructs a relevant Chronicling America search URL for every row in our lynching inventory
def build_chron_am_search(row):
    """Build the Chronicling America advanced-search URL for one inventory row.

    The URL asks for an exact-phrase search on the victim's name, restricted to
    the year in the row and the year after it, sorted by date, with up to 1,000
    rows on the first results page.

    Parameters
    ----------
    row : mapping (e.g. one pandas row)
        Must provide 'year' (a number) and 'victim_name' (a string).

    Returns
    -------
    str
        The complete search URL.
    """
    # imported here so the function works even if the notebook's import cell is not updated
    from urllib.parse import quote_plus

    base_url = "https://chroniclingamerica.loc.gov/search/pages/results/list/"

    year = row['year']
    # pandas stores a year column as floats when it contains gaps;
    # without this, 1900.0 would be written into the URL as "1900.0"
    if isinstance(year, float) and year.is_integer():
        year = int(year)
    date1 = year
    date2 = year + 1

    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as & # ' that would otherwise break or truncate the query string
    victim = quote_plus(row['victim_name'])

    search_url = (f'{base_url}?date1={date1}&date2={date2}&searchType=advanced&language='
                  f'&proxdistance=5&rows=1000&ortext=&proxtext=&phrasetext={victim}'
                  f'&andtext=&dateFilterType=yearRange&page=1&sort=date')

    return search_url

# applying the function to every row (axis=1 hands the function one row at a time)
df['search_url'] = df.apply(build_chron_am_search, axis=1)

# saving the results to our inventory
# index=False keeps pandas from writing its row index as an extra unnamed column;
# because this overwrites the file we loaded, that column would otherwise pile up
# as 'Unnamed: 0', 'Unnamed: 0.1', ... each time the file is re-read and re-saved
df.to_csv('subset_cleaned_combined_lynch_inventories.csv', index=False)

## 3) Define Scraping Function

This scraping function is one I consistently use when I scrape Chronicling America, but please note that it should be updated periodically to account for any changes in Chronicling America's rate limits. For the latest information on these rate limits, visit: [https://www.loc.gov/apis/json-and-yaml/working-within-limits/](https://www.loc.gov/apis/json-and-yaml/working-within-limits/)

In [5]:
# the URL pattern that indicates positive search hits in Chronicling America
# it matches page paths shaped like /lccn/sn<digits>/<YYYY-MM-DD>/ed-<one digit>/seq-<digits>/
# NOTE(review): only LCCNs that start with 'sn' and single-digit edition numbers match;
# confirm that no wanted pages use a different LCCN form or an edition of 10 or more
page_pattern = re.compile(r'/lccn/sn\d+/\d{4}-\d{2}-\d{2}/ed-\d/seq-\d+/')

# a scraping function that accounts for Chronicling America's rate limits
def scrape_carefully(url, retries=3, timeout=60):
    """Fetch a URL, pausing after every attempt to stay within the rate limits.

    Parameters
    ----------
    url : str
        The address to request.
    retries : int
        Maximum number of attempts before giving up.
    timeout : float
        Seconds to wait on the server per attempt. Without a timeout a single
        stalled connection could hang the whole run indefinitely.

    Returns
    -------
    requests.Response or None
        The response of the first attempt that returned status 200, or None if
        every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # connection problems and timeouts: report them instead of hiding them, then retry
            print(f'Request failed on attempt {attempt} of {retries}: {error}')
            time.sleep(4)  # sleep time may need adjustment based on changes to the rate limits
            continue

        if response.status_code == 200:
            time.sleep(4)  # sleep time may need adjustment based on changes to the rate limits
            return response

        if response.status_code == 429:
            # 429 means we have been rate limited, so back off for a long time
            print('Received 429 error. Sorry Chron Am! Waiting one hour before retrying.')
            time.sleep(3600)  # sleep time may need adjustment based on changes to the rate limits
        else:
            time.sleep(4)  # sleep time may need adjustment based on changes to the rate limits

    return None

## 4) Pull URLs from Search Hits and Save Them in CSV Files

This step will take some time. It was 11 hours of runtime for me. It does not need to be completed in one sitting, though. I added an 'if statement' that checks to see if a victim name has already been pulled from Chron Am. If it has, the code will skip that victim. This means you can run this code at different times, pulling only new victim search results each time, thereby splitting up the whole runtime.

In [6]:
# a 'for-loop' with a progress bar that iterates over the rows in our lynching inventory
# it uses the data in those rows to construct our csv files per victim
output_dir = 'name_clusters'

# makes sure the output directory exists before any scraping starts,
# so that the first attempt to save a CSV file cannot fail on a missing folder
os.makedirs(output_dir, exist_ok=True)

for index, row in tqdm(df.iterrows(), desc='Progress thus far', total=len(df)):
    victim_name = row['victim_name'].replace(' ', '_')
    csv_filename = f'{output_dir}/{victim_name}.csv'

    # an 'if statement' that checks to see if a victim csv already exists
    # if it does, it skips that victim, saving time
    # NOTE: the filename is built from the name alone, so a later row with the same
    # victim name is skipped once a file for that name exists
    if os.path.exists(csv_filename):
        continue

    # applies the scrape_carefully() function to the search URL for each victim
    scrape_content = scrape_carefully(row['search_url'])

    # if the function fails to pull data, it moves along
    # this happens when every attempt ends in an error
    if scrape_content is None:
        continue

    # uses BeautifulSoup to parse the html of search results
    soup = BeautifulSoup(scrape_content.text, 'html.parser')

    # saves the relevant URL content from searches
    results_list = soup.find('ul', class_='results_list')

    # if there are no search results, it moves along
    if results_list is None:
        continue

    # uses the regex page pattern to extract URLs for positive hits
    # and builds one record per hit: the link text, the OCR URL and the victim's details
    scrape_results = []
    for link in results_list.find_all('a', href=page_pattern):
        match = page_pattern.search(link['href'])
        if match:
            scrape_results.append({
                'newspaper': link.get_text(strip=True),
                'url': f'https://chroniclingamerica.loc.gov{match.group()}ocr/',
                'race': row['victim_race'],
                'gender': row['victim_gender'],
                'lynch_date': row['lynch_date'],
            })

    # saves the results as a CSV file in the name_clusters directory
    # (no file is written when there were no matching links)
    if scrape_results:
        pd.DataFrame(scrape_results).to_csv(csv_filename, index=False)

Processing rows:  14%|█▍        | 707/4977 [49:28<4:32:19,  3.83s/it] 

Received 429 error. Waiting for 1 hour before retrying...


Processing rows: 100%|██████████| 4977/4977 [11:14:40<00:00,  8.13s/it]   

Total elapsed time: 11:14:40.515581





## 5) Delete Files from Searches with Too Many Hits

In this step, I'm considering any searches with 999+ hits as likely full of false positives. To save myself the trouble of iterating over search result pages and then having to filter out more false positives, I'm just removing these CSV files.

This resulted in the deletion of the following CSV files:

- james_smith.csv: 999 rows
- duke_allen.csv: 1000 rows
- george_white.csv: 999 rows
- dr_james.csv: 1000 rows
- a_king.csv: 1000 rows
- henry_allen.csv: 1000 rows
- allen_nathaniel.csv: 999 rows
- clayton_allen.csv: 1000 rows
- will_rogers.csv: 1000 rows
- t_washington.csv: 999 rows
- allen_brooks.csv: 1000 rows
- william_morris.csv: 999 rows
- john_brown.csv: 1000 rows
- george_washington.csv: 1000 rows
- will_faulkner.csv: 999 rows
- allen_bowen.csv: 1000 rows
- john_smith.csv: 1000 rows
- william_allen.csv: 1000 rows
- john_williams.csv: 1000 rows
- henry_smith.csv: 999 rows
- john_davis.csv: 1000 rows
- allen_cooper.csv: 1000 rows
- a_mcclellan.csv: 1000 rows
- joseph_a_smith.csv: 999 rows
- allen_bolt.csv: 1000 rows
- will_lawton.csv: 999 rows

In [10]:
directory = 'name_clusters'

# searches that return this many rows (or more) are treated as mostly false positives
max_hits = 999

# a 'for loop' that checks the number of rows in each CSV file
# if the rows reach 999 or more, the file is deleted
for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    # only CSV files are examined; subdirectories and stray files (e.g. .DS_Store)
    # are left alone instead of crashing pd.read_csv
    if not (file_name.endswith('.csv') and os.path.isfile(file_path)):
        continue

    # the row count is kept in its own variable rather than reusing the name 'df',
    # so the inventory DataFrame is not overwritten
    row_count = len(pd.read_csv(file_path))

    if row_count >= max_hits:
        print(f'Deleting {file_name}: {row_count} rows')
        os.remove(file_path)

Deleting james_smith.csv: 999 rows
Deleting duke_allen.csv: 1000 rows
Deleting george_white.csv: 999 rows
Deleting dr_james.csv: 1000 rows
Deleting a_king.csv: 1000 rows
Deleting henry_allen.csv: 1000 rows
Deleting allen_nathaniel.csv: 999 rows
Deleting clayton_allen.csv: 1000 rows
Deleting t_washington.csv: 999 rows
Deleting allen_brooks.csv: 1000 rows
Deleting william_morris.csv: 999 rows
Deleting john_brown.csv: 1000 rows
Deleting george_washington.csv: 1000 rows
Deleting will_faulkner.csv: 999 rows
Deleting allen_bowen.csv: 1000 rows
Deleting john_smith.csv: 1000 rows
Deleting william_allen.csv: 1000 rows
Deleting john_williams.csv: 1000 rows
Deleting henry_smith.csv: 999 rows
Deleting john_davis.csv: 1000 rows
Deleting allen_cooper.csv: 1000 rows
Deleting a_mcclellan.csv: 1000 rows
Deleting joseph_a_smith.csv: 999 rows
Deleting allen_bolt.csv: 1000 rows
Deleting will_lawton.csv: 999 rows


## 6) Review Number of Total Results

At this point in the process, the total number of digitized pages from the Chron Am search results is 453,050. But it's important to note that these results are just pages with the victim's name appearing on them. There are many more filtering steps to be completed in order to deduce how many of these results are lynching reports.

In [19]:
total_hits = 0

# a 'for loop' that counts the total number of rows across all CSV files
for file_name in os.listdir(directory):
    file_path = os.path.join(directory, file_name)

    # only CSV files are counted; subdirectories and stray files are skipped
    # instead of crashing pd.read_csv
    if not (file_name.endswith('.csv') and os.path.isfile(file_path)):
        continue

    # the file's row count is added directly, without reusing the name 'df',
    # so the inventory DataFrame is not overwritten
    total_hits += len(pd.read_csv(file_path))

print(f'Total hits: {total_hits}')

Total hits: 453050


## 7) Break Data Files into Manageable Chunks

To make the next part of the process possible (see 03_scrape_search_results.ipynb), I've grouped the CSV files into manageable chunks. Each chunk is a subdirectory holding CSV files whose combined row count is no more than 5,000.

In [20]:
# I'm setting the max rows per chunk to 5,000
# this can (should) be adjusted depending on how many URLs you're comfortable scraping in a sitting
max_rows = 5000

# I start with a 'for loop' that collects all the CSV files and their row counts
# the listing is sorted so that the same files land in the same chunks on every run
csv_data = []

for file_name in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, file_name)

    # only CSV files are collected; this skips stray files and, when the cell is
    # re-run, the chunk subdirectories created below
    if not (file_name.endswith('.csv') and os.path.isfile(file_path)):
        continue

    csv_data.append((file_name, len(pd.read_csv(file_path))))

# next I create chunks (subdirectories) and row counts to be iterated over
chunk_number = 1
current_chunk = os.path.join(directory, f'chunk_{chunk_number}')
os.makedirs(current_chunk, exist_ok=True)
current_chunk_rows = 0

# a 'for loop' that checks the row counts per CSV file and directs the CSV file to the current chunk
# but if adding the file would push the current chunk past 5,000 rows, the CSV file is placed in a new chunk
for file_name, row_count in csv_data:
    # a new chunk is only started when the current one already holds rows,
    # so a single oversized file never leaves an empty chunk behind it
    if current_chunk_rows and current_chunk_rows + row_count > max_rows:
        chunk_number += 1
        current_chunk = os.path.join(directory, f'chunk_{chunk_number}')
        os.makedirs(current_chunk, exist_ok=True)
        current_chunk_rows = 0

    src_path = os.path.join(directory, file_name)
    dest_path = os.path.join(current_chunk, file_name)
    shutil.move(src_path, dest_path)

    current_chunk_rows += row_count