# Unifying Lynching Inventory Datasets and Subsetting for Our Purposes

In [None]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from tqdm import tqdm
import folium
import time

## 1) Overview

The following steps outline everything I've done to build our lynching inventory. 

I compiled our inventory from two other well-known inventories: the Seguin & Rigby Dataset and the Tolnay-Beck Inventory. I preprocessed both of these data sources and then concatenated them. I then subset the combined data by the date range relevant to the Virality of Racial Terror project (1865-1921) and by the victim's race (either Black or white). Finally, I included only cases where we knew the victim's full name as well as the month and general location of their murder.

The result is a lynching inventory recording 4,977 victims. Of that number, 1,210 are white and 3,767 are Black. 4,853 are listed as male. 124 are listed as female. This data can be reviewed in the csv file called subset_cleaned_combined_lynch_inventories.csv.

To learn more about the original source data, visit these links:

- Seguin & Rigby Dataset: [https://archive.ciser.cornell.edu/studies/2833/data-and-documentation](https://archive.ciser.cornell.edu/studies/2833/data-and-documentation)
- Seguin & Rigby Project Description: [https://journals.sagepub.com/doi/pdf/10.1177/2378023119841780](https://journals.sagepub.com/doi/pdf/10.1177/2378023119841780)
- Tolnay-Beck Inventory Request Form: [https://sites.uw.edu/lynching/contact/](https://sites.uw.edu/lynching/contact/)
- Tolnay-Beck Project Description: [https://www.press.uillinois.edu/books/?id=p064135](https://www.press.uillinois.edu/books/?id=p064135)

## 2) Preprocessing the Seguin Rigby Dataset


In [None]:
# loading the raw Seguin & Rigby data
segrig_df = pd.read_csv('seguin_rigby_lynching_data.csv')

# turning 0 values in the day column to 1 so that a date can be built below
# (presumably 0 marks an unknown day -- confirm against the dataset documentation)
segrig_df['day'] = segrig_df['day'].replace(0, 1)

# creating 'lynch_date' column with year, month, and date
# (errors='coerce' turns parts that do not form a valid date into NaT instead of raising)
segrig_df['lynch_date'] = pd.to_datetime(segrig_df[['year', 'month', 'day']], errors='coerce')

# replacing '.' values in 'city' column with NaN
segrig_df['city'] = segrig_df['city'].replace('.', np.nan)

# adding 'county' or 'parish' string to county column values (county for all states except Louisiana)
# - the state comparison ignores case and surrounding whitespace
# - str.cat leaves a missing county as NaN instead of raising on NaN + str
is_louisiana = segrig_df['state'].astype(str).str.strip().str.lower() == 'la'
county_suffix = pd.Series(np.where(is_louisiana, ' parish', ' county'), index=segrig_df.index)
segrig_df['county'] = segrig_df['county'].str.cat(county_suffix)

# creating a lynch_location column from city, county, and state columns (missing parts are skipped)
segrig_df['lynch_location'] = segrig_df[['city', 'county', 'state']].apply(
    lambda parts: ', '.join(str(val) for val in parts if pd.notna(val)),
    axis=1,
)

# removing all punctuation from victim names
segrig_df['victim'] = segrig_df['victim'].str.replace(r'[^\w\s]', '', regex=True)

# removing rows where the victim name is only one word
# (done after the punctuation is stripped, as in the Tolnay-Beck preprocessing, so that
# a name like "smith ?" is not counted as two words; .copy() makes the filtered frame
# independent so the assignments below do not trigger SettingWithCopyWarning)
segrig_df = segrig_df[segrig_df['victim'].str.split().str.len() > 1].copy()

# removing all punctuation from victim gender
segrig_df['gender'] = segrig_df['gender'].str.replace(r'[^\w\s]', '', regex=True)

# retaining only white and Black victims
segrig_df = segrig_df[segrig_df['race'].isin(['Black', 'White'])].copy()

# adding a source column to label these rows as coming from the Seguin & Rigby dataset
segrig_df['source'] = 'seguin-rigby'

# changing the column names to more easily concatenate the datasets
segrig_df = segrig_df.rename(columns={
    'victim': 'victim_name',
    'race': 'victim_race',
    'gender': 'victim_gender',
})

# dropping all columns except those necessary for our project
segrig_df = segrig_df.loc[:, ['victim_name', 'victim_race', 'victim_gender', 'lynch_date', 'year', 'lynch_location', 'source']]

# lowercasing all the strings in the dataset
# (column-wise Series.map replaces the deprecated DataFrame.applymap; non-string values pass through unchanged)
segrig_df = segrig_df.apply(lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x))

In [None]:
# reviewing the preprocessing results
# (a bare dataframe name on the last line of a notebook cell displays the dataframe)
segrig_df

## 3) Preprocessing the Tolnay-Beck Inventory

In [None]:
# loading the raw Tolnay-Beck inventory
tolbec_df = pd.read_excel('tolnay_beck_data_2022.xls')

# turning 0 values in the day column to 1 so that a date can be built below
# (presumably 0 marks an unknown day -- confirm against the inventory documentation)
tolbec_df['Day'] = tolbec_df['Day'].replace(0, 1)

# creating 'lynch_date' column with year, month, and date
# (errors='coerce' turns parts that do not form a valid date into NaT instead of raising)
tolbec_df['lynch_date'] = pd.to_datetime(tolbec_df[['Year', 'Month', 'Day']], errors='coerce')

# removing 'Near' and 'Possibly' strings from the 'Place' column
tolbec_df['Place'] = tolbec_df['Place'].str.replace(r'\b(Near|Possibly)\b', '', regex=True).str.strip()

# adding 'county' or 'parish' string to county column values (county for all states except Louisiana)
# - the state is read from 'Lynch State', the same column used for lynch_location below
#   (the previous lookup of row['state'] does not match any other column name used in this cell)
# - the state comparison ignores case and surrounding whitespace
# - str.cat leaves a missing county as NaN instead of raising on NaN + str
is_louisiana = tolbec_df['Lynch State'].astype(str).str.strip().str.lower() == 'la'
county_suffix = pd.Series(np.where(is_louisiana, ' parish', ' county'), index=tolbec_df.index)
tolbec_df['Lynch County'] = tolbec_df['Lynch County'].str.cat(county_suffix)

# creating a lynch_location column from Place, Lynch County, and Lynch State columns (missing parts are skipped)
tolbec_df['lynch_location'] = tolbec_df[['Place', 'Lynch County', 'Lynch State']].apply(
    lambda parts: ', '.join(str(val) for val in parts if pd.notna(val)),
    axis=1,
)

# removing rows with unnamed victims
# (.copy() makes the filtered frame independent so the assignments below do not trigger SettingWithCopyWarning)
tolbec_df = tolbec_df[~tolbec_df['Name'].str.contains('Unnamed', na=False)].copy()

# removing all punctuation from victim names
tolbec_df['Name'] = tolbec_df['Name'].str.replace(r'[^\w\s]', '', regex=True)

# removing rows where the victim name is only one word (rows with a missing name are dropped here too)
tolbec_df = tolbec_df[tolbec_df['Name'].str.split().str.len() > 1].copy()

# removing all punctuation from victim gender
tolbec_df["Victim's Sex"] = tolbec_df["Victim's Sex"].str.replace(r'[^\w\s]', '', regex=True)

# removing hyphenated race labels from victim's race (everything from the first hyphen onward is cut)
tolbec_df["Victim's Race"] = tolbec_df["Victim's Race"].str.replace(r'-.*', '', regex=True)

# retaining only white and Black victims
tolbec_df = tolbec_df[tolbec_df["Victim's Race"].isin(['Black', 'White'])].copy()

# adding a source column to label these rows as coming from the Tolnay-Beck inventory
tolbec_df['source'] = 'tolnay-beck'

# changing the column names to more easily concatenate the datasets
tolbec_df = tolbec_df.rename(columns={
    'Name': 'victim_name',
    "Victim's Race": 'victim_race',
    "Victim's Sex": 'victim_gender',
    'Year': 'year',
})

# dropping all columns except those necessary for our project
tolbec_df = tolbec_df.loc[:, ['victim_name', 'victim_race', 'victim_gender', 'lynch_date', 'year', 'lynch_location', 'source']]

# lowercasing all the strings in the dataset
# (column-wise Series.map replaces the deprecated DataFrame.applymap; non-string values pass through unchanged)
tolbec_df = tolbec_df.apply(lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x))

In [None]:
# reviewing the preprocessing results
# (a bare dataframe name on the last line of a notebook cell displays the dataframe)
tolbec_df

## 4) Concatenating the Datasets

In [None]:
# combining into a new dataframe called master_df (Seguin & Rigby rows first, then Tolnay-Beck rows)
master_df = pd.concat([segrig_df, tolbec_df], ignore_index=True)

# subsetting by the year range of the VRT project (1865 to 1921, both ends included)
# NOTE(review): rows whose lynch_date is NaT are not removed here -- confirm whether
# cases with an unknown month are meant to be dropped at this step
master_df = master_df[master_df['year'].between(1865, 1921)]

# sorting by year
# (a stable sort keeps the concatenation order within each year, so the duplicate
# removal below always keeps the same row from one run to the next)
master_df = master_df.sort_values(by='year', ascending=True, kind='mergesort')

# removing duplicates that have the same victim name and year
# (keep='first' retains the Seguin & Rigby row when both sources list the victim)
master_df = master_df.drop_duplicates(subset=['victim_name', 'year'], keep='first')

# resetting the indices
master_df = master_df.reset_index(drop=True)

# reviewing the results
master_df

In [None]:
# writing the combined inventory to disk
# (the dataframe index is written as well, as the first column of the file)
master_df.to_csv(path_or_buf='subset_cleaned_combined_lynch_inventories.csv')

## 5) Cleaning the Data by Hand

There is no code at this step. With the master dataframe compiled, I opened it in Microsoft Excel and reviewed it carefully. I quickly noticed two columns with pervasive issues: victim_name and lynch_location.

The victim_name column had issues originating from two idiosyncrasies in the source data:

- rows where aliases were included with the victim's name (for example, "john doe / joe doe" or "john 'johnny' doe")
- rows where the victim's name was not actually recorded, only their relationship to some other victim (for example, "brother of john doe" or "mrs john doe")

In both of these cases, I hand-corrected or deleted the affected rows.

The lynch_location column had issues originating from the Tolnay-Beck Inventory's Place column. This Place column recorded more than the city or town: it included descriptions like "near jail in town" or "outskirts of" or "swamp 5 miles from town." In other words, Tolnay and Beck recorded directional detail in their places, but they did not separate the town or city into its own column. I therefore corrected the rows in my lynch_location column to remove any directional cues, since they would make mapping the data more difficult.

## 6) Further Preprocessing the Unified Data

In [None]:
# reloading the combined inventory from disk (the file reviewed by hand in the previous step)
master_df = pd.read_csv(filepath_or_buffer='subset_cleaned_combined_lynch_inventories.csv')

In [None]:
# dropping 'Unnamed' column of old indices
# (errors='ignore' keeps this cell runnable when the csv was saved without an index column)
master_df = master_df.drop(columns=['Unnamed: 0'], errors='ignore')

# creating a dictionary of state abbreviations and full names
us_states = {
    'al': 'alabama', 'ak': 'alaska', 'az': 'arizona', 'ar': 'arkansas', 'ca': 'california',
    'co': 'colorado', 'ct': 'connecticut', 'de': 'delaware', 'fl': 'florida', 'ga': 'georgia',
    'hi': 'hawaii', 'id': 'idaho', 'il': 'illinois', 'in': 'indiana', 'ia': 'iowa',
    'ks': 'kansas', 'ky': 'kentucky', 'la': 'louisiana', 'me': 'maine', 'md': 'maryland',
    'ma': 'massachusetts', 'mi': 'michigan', 'mn': 'minnesota', 'ms': 'mississippi', 'mo': 'missouri',
    'mt': 'montana', 'ne': 'nebraska', 'nv': 'nevada', 'nh': 'new hampshire', 'nj': 'new jersey',
    'nm': 'new mexico', 'ny': 'new york', 'nc': 'north carolina', 'nd': 'north dakota', 'oh': 'ohio',
    'ok': 'oklahoma', 'or': 'oregon', 'pa': 'pennsylvania', 'ri': 'rhode island', 'sc': 'south carolina',
    'sd': 'south dakota', 'tn': 'tennessee', 'tx': 'texas', 'ut': 'utah', 'vt': 'vermont',
    'va': 'virginia', 'wa': 'washington', 'wv': 'west virginia', 'wi': 'wisconsin', 'wy': 'wyoming',
}


def _expand_state_abbreviation(location):
    """Return *location* with a trailing two-letter state abbreviation spelled out.

    Values that are not strings (for example NaN for an empty cell) are returned
    unchanged. The last two characters are only treated as an abbreviation when
    they are a key of ``us_states`` and are not the tail of a longer word, so a
    location that already ends in a full state name ("..., alabama") is left
    alone; this keeps the cell safe to run again on data it already expanded.
    """
    if not isinstance(location, str):
        return location
    abbreviation = location[-2:]
    if abbreviation not in us_states:
        return location
    if len(location) > 2 and location[-3].isalnum():
        # the two letters are the end of a longer word, not a standalone abbreviation
        return location
    # location[:-3] drops the abbreviation together with the separator in front of it
    prefix = location[:-3]
    full_name = us_states[abbreviation]
    return f'{prefix} {full_name}' if prefix else full_name


# replacing state abbreviations with full names in the lynch_location column
master_df['lynch_location'] = master_df['lynch_location'].apply(_expand_state_abbreviation)

# loading the Nominatim geolocator (user_agent identifies this application to the service)
geolocator = Nominatim(user_agent='matthew_k')

# defining the get_lat_long() function
def get_lat_long(place):
    """Look up *place* with the module-level ``geolocator``.

    Parameters:
        place: the location string to geocode.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when *place* is
        missing or blank, when the geocoder finds no match, or when the lookup
        raises. A failed lookup is logged as a warning rather than silently
        discarded, so failures can be told apart from places with no match.
    """
    import logging  # local import so this cell does not depend on another cell's imports

    # a missing value (NaN/None) or blank string cannot be geocoded; skip the request
    if not isinstance(place, str) or not place.strip():
        return None, None

    try:
        location = geolocator.geocode(place)
    except Exception as error:
        # best effort: one failed lookup must not abort the whole geocoding run
        logging.getLogger(__name__).warning('Geocoding failed for %r: %s', place, error)
        return None, None

    if not location:
        return None, None
    return location.latitude, location.longitude

# creating shells for latitude and longitude values
latitude = []
longitude = []

# remembering successful lookups so a location shared by several victims is only
# requested once (failed lookups are not remembered, so they are tried again the
# next time the same location appears)
geocode_cache = {}

# iterating over the lynch_location column to get latitudes and longitudes for each lynching
for place in tqdm(master_df['lynch_location'], desc="Geocoding locations"):
    if place in geocode_cache:
        lat, lon = geocode_cache[place]
    else:
        lat, lon = get_lat_long(place)
        if lat is not None and lon is not None:
            geocode_cache[place] = (lat, lon)
        # pausing one second after each lookup to limit the request rate
        time.sleep(1)
    latitude.append(lat)
    longitude.append(lon)

# adding lat/long data to the dataframe
master_df['latitude'] = latitude
master_df['longitude'] = longitude

In [None]:
# reviewing the results
# (a bare dataframe name on the last line of a notebook cell displays the dataframe)
master_df

In [None]:
# writing the geocoded inventory back to the same csv file it was loaded from
# (the dataframe index is written as well, as the first column of the file)
master_df.to_csv(path_or_buf='subset_cleaned_combined_lynch_inventories.csv')

## 7) Mapping the Lynchings to Assess Quality of Lat/Long Data

In [None]:
# set map start point to middle of continental USA, as [latitude, longitude]
# (passed to folium.Map as its location argument below)
map_start_point = [39.8283, -98.5795]

# function that slightly alters lat/long so there are no overlapping points
def tweak_coordinates(lat, lon, index, recorded_locations):
    """Return a ``(lat, lon)`` position that no earlier point already occupies.

    Parameters:
        lat, lon: the coordinates of the point to place.
        index: row index of the point; ``index % 10`` picks the first offset to try.
        recorded_locations: set of ``(lat, lon)`` tuples handed out so far. It is
            updated in place with the position returned by this call.

    Returns:
        ``(lat, lon)`` unchanged when that position is still free. Otherwise the
        point is shifted diagonally in steps of 0.0001 degrees until a free
        position is found.

    The shift is never zero (previously an index that was a multiple of 10 got
    no shift at all), and every returned position is recorded so two shifted
    points cannot land on top of each other.
    """
    if (lat, lon) not in recorded_locations:
        recorded_locations.add((lat, lon))
        return lat, lon

    # start from the offset the index suggests, but never from a zero offset
    step = max(index % 10, 1)
    while True:
        candidate = (lat + 0.0001 * step, lon + 0.0001 * step)
        if candidate not in recorded_locations:
            recorded_locations.add(candidate)
            return candidate
        step += 1
 
# a place to hold the record of previously recorded locations
recorded_locations = set()

# create Folium basemap
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept so that
# any other cell referring to `map` keeps working
map = folium.Map(location=map_start_point, tiles="Cartodb Positron", zoom_start=4)

# iterate over lat/long data to create points on map
for index, row in master_df.iterrows():
    # skip rows that could not be geocoded
    # (pd.isna also accepts None, which np.isnan rejects with a TypeError)
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):
        continue

    # adjust any overlapping coordinates
    lat, lon = tweak_coordinates(row['latitude'], row['longitude'], index, recorded_locations)

    # hover over the points and you'll see this info
    tooltip = (
        f"<div style='font-size: 11pt'>{row['victim_name']}</div>"
        f"<div style='font-size: 11pt'>{row['lynch_location']}</div>"
        f"<div style='font-size: 11pt'>{row['year']}</div>"
    )

    # add the dots to the map
    folium.Circle(
        [lat, lon],
        tooltip=tooltip,
        color='darkred',
        radius=10
    ).add_to(map)

# display
map