Import the necessary Python modules

In [None]:
#%matplotlib inline
import pandas as pd
import re
from ast import literal_eval

Save dataset paths

In [None]:
# Relative paths to the CSV file for each collection.
anthony = "data/anthony/susan-b-anthony-papers_2022-10-12.csv"
# Speech inventory CSV; loaded further down as `a_speeches`.
speech_inventory = "data/anthony/anthony_speech_list.csv"
catt = "data/catt/carrie-chapman-catt-papers_2022-10-12.csv"
stanton = "data/stanton/elizabeth-cady-stanton-papers_2022-10-19.csv"
terrell = "data/terrell/mary-church-terrell-advocate-for-african-americans-and-women_2023-01-20.csv"

In [None]:
def load_csv(file: str) -> pd.DataFrame:
    """Read one CSV file into a data frame, parsing every column as text.

    Args:
        file (str): Path to the CSV file to read.

    Returns:
        pd.DataFrame: The contents of the file, with `dtype=str` applied to all columns.
    """
    return pd.read_csv(file, dtype=str)

In [None]:
# Read every collection's CSV and keep each resulting data frame in its own variable
a, c, s, t = (load_csv(csv_path) for csv_path in (anthony, catt, stanton, terrell))

In [None]:
# Confirm that the load worked by previewing the first rows of the
# Anthony data frame with `df.head()`
a.head()

---

Configure spaCy

In [None]:
import spacy
import en_core_web_lg

In [None]:
# Load the spaCy model once; the `tokens` and `entities` functions below
# both use this shared `nlp` object.
nlp = en_core_web_lg.load()

def tokens(text) -> list:
    """Runs NLP process on text input.

    Missing values (None, NaN, pd.NA) produce an empty list rather than being
    converted to the string "nan" and tokenized as if it were a real word.

    Args:
        text: The transcription to process. Non-string values are converted
            with `str()` before processing.

    Returns:
        process (list): A list containing one tuple of NLP attributes for each
            token in the transcription, in this order:
            (text, lemma, part of speech, tag, dependency, shape, is_alpha, is_stop).
    """
    # Empty CSV cells are loaded as NaN, and str(NaN) == "nan"; skip them so
    # that a spurious "nan" token never ends up in the results.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    doc = nlp(str(text))
    process = [(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop) for token in doc]

    return process


def entities(text) -> list:
    """Runs NER process on text input.

    Missing values (None, NaN, pd.NA) produce an empty list rather than being
    converted to the string "nan" and run through the pipeline.

    Args:
        text: The transcription to process. Non-string values are converted
            with `str()` before processing.

    Returns:
        process (list): A list containing one tuple for each named entity found
            in the transcription, in this order:
            (text, start character offset, end character offset, label).
    """
    # Empty CSV cells are loaded as NaN, and str(NaN) == "nan"; skip them so
    # the literal word "nan" is never analysed as transcription text.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    doc = nlp(str(text))
    process = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    return process


In [None]:
# Tokenize every transcription and keep the result in a new column
# NOTE: This will take a while to run
a['tokenized_text'] = a['Transcription'].map(tokens)

In [None]:
# Extract the named entities of every transcription into a new column
# NOTE: This will take a while to run
a['entities'] = a['Transcription'].map(entities)

In [None]:
# Preview the first row of the data, including the newly added columns
a.head(1)

In [None]:
# Preview the tokenized text for the first row:
# a list with one tuple of NLP attributes per token
a['tokenized_text'].iloc[0]

In [None]:
# Preview the named entities for the row at position 1000
a['entities'].iloc[1000]

In [None]:
def separate_text(df: pd.DataFrame) -> None:
    """Sorts the tokens of each row into category columns added to the data frame.

    Each token tuple in `df['tokenized_text']` is placed in exactly one of the
    categories below; the first matching rule wins:

        stop_words    - tokens flagged as stop words (tuple index 7)
        nonalphanums  - part of speech (tuple index 2) is PUNCT, SPACE, CCONJ, X or SYM
        numbers       - part of speech is NUM
        ambigs        - the token shape (tuple index 5) contains '?'
        text          - everything else

    A `processed_text` column is also added, holding the lowercase lemma
    (tuple index 1) of every token that ended up in `text`.

    Args:
        df (pd.DataFrame): Data frame with a `tokenized_text` column whose
            values are lists of token tuples, as produced by `tokens`.
            The data frame is modified in place.

    Returns:
        None
    """
    nonalphanum_pos = {'PUNCT', 'SPACE', 'CCONJ', 'X', 'SYM'}
    columns = {name: [] for name in
               ('text', 'stop_words', 'nonalphanums', 'numbers', 'ambigs', 'processed_text')}

    for text_block in df['tokenized_text']:
        text = []
        stop_words = []
        nonalphanums = []
        numbers = []
        ambigs = []

        for word in text_block:
            # Move stopwords
            if word[7]:
                stop_words.append(word)
            # Move punctuation and whitespace
            elif word[2] in nonalphanum_pos:
                nonalphanums.append(word)
            # Move numbers
            elif word[2] == 'NUM':
                numbers.append(word)
            # Move ambiguous transcribed words
            elif '?' in word[5]:
                ambigs.append(word)
            # Move text
            else:
                text.append(word)

        columns['text'].append(text)
        columns['stop_words'].append(stop_words)
        columns['nonalphanums'].append(nonalphanums)
        columns['numbers'].append(numbers)
        columns['ambigs'].append(ambigs)
        # Lowercase lemmas for all words in 'text', taken from the data frame
        # being processed rather than from any global variable
        columns['processed_text'].append([word[1].lower() for word in text])

    # Assign each column in one step. Writing cell by cell through
    # `df[col].iloc[row] = ...` is chained assignment, which pandas warns
    # about and which has no effect when copy-on-write is enabled.
    # The explicit index keeps the values aligned with the rows of `df`.
    for name, values in columns.items():
        df[name] = pd.Series(values, index=df.index, dtype=object)


In [None]:
# Run the separate_text function on the Anthony data frame
separate_text(a)

In [None]:
# Show the first six rows of the data frame with its new columns
a.head(6)

Start working with only the Susan B. Anthony speeches

In [None]:
# Load the speech inventory CSV into its own data frame
a_speeches = load_csv(speech_inventory)

In [None]:
# Group transcriptions by ItemId
# Creates a dictionary where the ItemId is the key and the value holds the
# index labels of the associated rows
a_groups = a.groupby('ItemId').groups

# Create a list of dictionaries representing each speech
# This structure is specifically designed for visualization in the next notebook
speech_list = []
for row in range(a_speeches.shape[0]):
    # The inventory columns are read by position: 0 = speech id,
    # 1 = text containing a four-digit year, 2 = title
    speech_id = a_speeches.iloc[row, 0]
    # Raw string so that `\d` is a regex digit class, not a string escape
    d = re.findall(r'\d{4}', a_speeches.iloc[row, 1])
    speech_text = []
    # `groups` yields index labels, so look the rows up with `.loc`
    for label in a_groups[speech_id]:
        speech_text.extend(a.loc[label, 'processed_text'])
    speech = {'id': speech_id,
              # First four-digit number found in the second column
              'year': d[0],
              'title': a_speeches.iloc[row, 2],
              'text': speech_text}
    speech_list.append(speech)

In [None]:
# Store the speech list with IPython's %store magic so that it can be
# reused as a variable across notebooks
%store speech_list

# Reuse the variable in another notebook using the following command
# %store -r speech_list
# Then call the variable like usual, e.g. `speech_list[0]`