Import the necessary Python modules

In [78]:
%matplotlib inline
import pandas as pd

Save dataset paths

In [79]:
# Relative paths to the CSV file for each collection of papers
# (Anthony, Catt, Stanton and Terrell); each one is passed to `load_csv` below.
anthony = "data/anthony/susan-b-anthony-papers_2022-10-12.csv"
catt = "data/catt/carrie-chapman-catt-papers_2022-10-12.csv"
stanton = "data/stanton/elizabeth-cady-stanton-papers_2022-10-19.csv"
terrell = "data/terrell/mary-church-terrell-advocate-for-african-americans-and-women_2023-01-20.csv"

In [80]:
def load_csv(file: str) -> pd.DataFrame:
    """Read a single CSV file into a data frame.

    Every column is requested as a string (``dtype=str``), so identifiers and other
    numeric-looking values are not converted to numbers.

    Args:
        file (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame: The contents of the file.
    """
    return pd.read_csv(file, dtype=str)


In [81]:
# Load each CSV and store the data frame in a variable
# (a = Anthony, c = Catt, s = Stanton, t = Terrell)
a = load_csv(anthony)
c = load_csv(catt)
s = load_csv(stanton)
t = load_csv(terrell)

In [82]:
# Confirm that the load worked by previewing the Anthony data frame with `df.head()`
a.head()

Unnamed: 0,Campaign,Project,Item,ItemId,Asset,AssetId,AssetStatus,DownloadUrl,Transcription,Tags
0,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-1,179295,completed,http://tile.loc.gov/image-services/iiif/servic...,Susan B. Anthony SPEECHES AND WRITINGS FI...,May 1852
1,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-2,179296,completed,http://tile.loc.gov/image-services/iiif/servic...,/52\r\nS.B.A-\r\n\r\nDelivered for the\r\nFirs...,
2,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-3,179297,completed,http://tile.loc.gov/image-services/iiif/servic...,will the best & wisest of mothers continue\r\n...,temperance
3,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-4,179298,completed,http://tile.loc.gov/image-services/iiif/servic...,[Mind] the youthful mind. Of how\r\nlittle av...,temperance
4,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-5,179299,completed,http://tile.loc.gov/image-services/iiif/servic...,x\r\nWhile we labor to reclaim one generation ...,temperance


---

Configure spaCy

In [98]:
import spacy
import en_core_web_lg

In [99]:
# Load the en_core_web_lg spaCy model once; the `tokens` and `entities`
# functions below use it through this module-level `nlp` name.
nlp = en_core_web_lg.load()

def tokens(text) -> list:
    """Runs NLP process on text input.

    A missing transcription (``None``, ``NaN`` or ``pd.NA``) returns an empty list.
    Without this guard ``str(text)`` would turn the missing value into the literal
    word "nan"/"None" and tokenize it as if it had been transcribed.

    Args:
        text: The transcription to process. Non-string values are converted with ``str()``.

    Returns:
        process (list): A list containing one tuple of NLP attributes for each token in
            the transcription, in the order
            (text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop).
    """
    # Only scalars can be "missing"; strings are never treated as missing
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    doc = nlp(str(text))
    return [
        (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
         token.shape_, token.is_alpha, token.is_stop)
        for token in doc
    ]


def entities(text) -> list:
    """Runs NER process on text input.

    A missing transcription (``None``, ``NaN`` or ``pd.NA``) returns an empty list.
    Without this guard ``str(text)`` would turn the missing value into the literal
    word "nan"/"None" and run entity recognition on it.

    Args:
        text: The transcription to process. Non-string values are converted with ``str()``.

    Returns:
        process (list): A list containing one tuple of NER attributes for each entity
            found in the transcription, in the order
            (text, start_char, end_char, label_).
    """
    # Only scalars can be "missing"; strings are never treated as missing
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    doc = nlp(str(text))
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]


In [101]:
# Create a new column containing the output of the tokens function
# NOTE: `tokens` calls `nlp` on every transcription, and the `entities` column is built
# with a second, separate `nlp` pass over the same text.
a['tokenized_text'] = a['Transcription'].apply(tokens)

In [102]:
# Create a new column containing the output of the entities function
# NOTE: `entities` calls `nlp` on every transcription again; it does not reuse the
# documents already parsed for the `tokenized_text` column.
a['entities'] = a['Transcription'].apply(entities)

In [103]:
# Preview the first row of the data, now including the `tokenized_text` and `entities` columns
a.head(1)

Unnamed: 0,Campaign,Project,Item,ItemId,Asset,AssetId,AssetStatus,DownloadUrl,Transcription,Tags,tokenized_text,entities
0,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-1,179295,completed,http://tile.loc.gov/image-services/iiif/servic...,Susan B. Anthony SPEECHES AND WRITINGS FI...,May 1852,"[(Susan, Susan, PROPN, NNP, compound, Xxxxx, T...","[(Susan B. Anthony SPEECHES, 0, 30, PERSO..."


In [104]:
# Preview the tokenized text for the first row (one tuple of attributes per token)
a['tokenized_text'].iloc[0]

[('Susan', 'Susan', 'PROPN', 'NNP', 'compound', 'Xxxxx', True, False),
 ('B.', 'B.', 'PROPN', 'NNP', 'compound', 'X.', False, False),
 ('Anthony', 'Anthony', 'PROPN', 'NNP', 'compound', 'Xxxxx', True, False),
 ('     ', '     ', 'SPACE', '_SP', 'dep', '    ', False, False),
 ('SPEECHES', 'SPEECHES', 'PROPN', 'NNP', 'ROOT', 'XXXX', True, False),
 ('AND', 'and', 'CCONJ', 'CC', 'cc', 'XXX', True, True),
 ('WRITINGS', 'writing', 'NOUN', 'NNS', 'conj', 'XXXX', True, False),
 ('FILE', 'file', 'ADP', 'IN', 'prep', 'XXXX', True, False),
 ('       ', '       ', 'SPACE', '_SP', 'dep', '    ', False, False),
 ('Delivered', 'deliver', 'VERB', 'VBN', 'acl', 'Xxxxx', True, False),
 ('for', 'for', 'ADP', 'IN', 'prep', 'xxx', True, True),
 ('the', 'the', 'DET', 'DT', 'det', 'xxx', True, True),
 ('first', 'first', 'ADJ', 'JJ', 'pobj', 'xxxx', True, True),
 ('\r\n                                                                                            ',
  '\r\n                                        

In [111]:
# Preview the named entities for the row at position 1000
a['entities'].iloc[1000]

[('Wednesday, October 11, 1865', 0, 27, 'DATE'),
 ('Auburn', 30, 36, 'GPE'),
 ('P. Bridge', 40, 49, 'FAC'),
 ('Wright', 79, 85, 'PERSON'),
 ('Eliza\r\nWright Osbornes', 111, 133, 'PERSON'),
 ('W.', 140, 142, 'PERSON'),
 ("Woman's Rights", 166, 180, 'ORG'),
 ('12 noon', 236, 243, 'TIME'),
 ('Charles Mills', 255, 268, 'PERSON'),
 ('Syracuse Depot', 273, 287, 'ORG'),
 ('Sarah', 318, 323, 'PERSON'),
 ('Thursday, October 12, 1865', 345, 371, 'DATE'),
 ('Albany', 393, 399, 'GPE'),
 ('Uncle & Aunt', 426, 438, 'ORG'),
 ('2 P.M.', 452, 458, 'TIME'),
 ('Albany', 471, 477, 'GPE'),
 ('Phebe\r\n', 495, 502, 'PERSON'),
 ('Margaret', 506, 514, 'PERSON'),
 ('Lydia', 527, 532, 'PERSON'),
 ('Jane', 537, 541, 'PERSON'),
 ('Sister\r\nPhebe Willis', 551, 571, 'PERSON'),
 ('tomorrow\r\nP.M.', 597, 611, 'DATE')]

In [107]:
def separate_text(df: pd.DataFrame, verbose: bool = True) -> None:
    """Sorts the tokens of every row into category columns added to the data frame.

    Reads the ``tokenized_text`` column (one list of token tuples per row) and adds
    five columns to ``df`` in place. Each new cell holds the list of token tuples
    from that row that belong to the category:

    * ``text``: tokens whose stop-word flag (index 7) is false. Punctuation,
      whitespace and number tokens are not stop words, so they end up here as well.
    * ``stop_words``: tokens whose stop-word flag is true.
    * ``nonalphanums``: tokens whose part of speech (index 2) is ``PUNCT`` or ``SPACE``.
    * ``numbers``: tokens whose part of speech is ``NUM``.
    * ``ambigs``: tokens whose shape (index 5) contains a ``?``.

    The categories are not mutually exclusive: one token can appear in several columns.

    Args:
        df (pd.DataFrame): Data frame with a ``tokenized_text`` column; modified in place.
        verbose (bool): Print a progress line for every row. Defaults to True.

    Returns:
        None
    """
    categories = ['text', 'stop_words', 'nonalphanums', 'numbers', 'ambigs']
    sorted_rows = {name: [] for name in categories}

    for row, text_block in enumerate(df['tokenized_text']):
        if verbose:
            print(f"Working on row {row}...")

        buckets = {name: [] for name in categories}
        for word in text_block:
            # Every token is either a stop word or "regular" text
            buckets['stop_words' if word[7] else 'text'].append(word)
            if word[2] in ('PUNCT', 'SPACE'):
                buckets['nonalphanums'].append(word)
            if word[2] == 'NUM':
                buckets['numbers'].append(word)
            if '?' in word[5]:
                buckets['ambigs'].append(word)

        for name in categories:
            sorted_rows[name].append(buckets[name])

    # Assign each column in one step. Writing cell by cell through
    # `df[col].iloc[row] = ...` is chained assignment: it raises
    # SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
    for name in categories:
        df[name] = pd.Series(sorted_rows[name], index=df.index, dtype=object)


In [None]:
# Sort the tokens of the Anthony data frame into the category columns (modifies `a` in place)
separate_text(a)

In [115]:
# Preview the first six rows, including the columns added by `separate_text`
a.iloc[0:6]

Unnamed: 0,Campaign,Project,Item,ItemId,Asset,AssetId,AssetStatus,DownloadUrl,Transcription,Tags,tokenized_text,entities,text,stop_words,nonalphanums,numbers,ambigs
0,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-1,179295,completed,http://tile.loc.gov/image-services/iiif/servic...,Susan B. Anthony SPEECHES AND WRITINGS FI...,May 1852,"[(Susan, Susan, PROPN, NNP, compound, Xxxxx, T...","[(Susan B. Anthony SPEECHES, 0, 30, PERSO...","[(Susan, Susan, PROPN, NNP, compound, Xxxxx, T...","[(AND, and, CCONJ, CC, cc, XXX, True, True), (...","[( , , SPACE, _SP, dep, , False, ...","[(2, 2, NUM, CD, nummod, d, False, False), (18...",[]
1,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-2,179296,completed,http://tile.loc.gov/image-services/iiif/servic...,/52\r\nS.B.A-\r\n\r\nDelivered for the\r\nFirs...,,"[(/52, /52, PROPN, NNP, punct, /dd, False, Fal...","[(Batavia, 44, 51, GPE), (N.J., 52, 56, GPE), ...","[(/52, /52, PROPN, NNP, punct, /dd, False, Fal...","[(for, for, ADP, IN, prep, xxx, True, True), (...","[(\r\n, \r\n, SPACE, _SP, dep, \r\n, False, Fa...","[(1852, 1852, NUM, CD, nummod, dddd, False, Fa...",[]
2,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-3,179297,completed,http://tile.loc.gov/image-services/iiif/servic...,will the best & wisest of mothers continue\r\n...,temperance,"[(will, will, AUX, MD, aux, xxxx, True, True),...","[(the\r\nSociety, 295, 307, ORG), (two, 324, 3...","[(best, good, ADJ, JJS, nsubj, xxxx, True, Fal...","[(will, will, AUX, MD, aux, xxxx, True, True),...","[(\r\n, \r\n, SPACE, _SP, dep, \r\n, False, Fa...","[(two, two, NUM, CD, nummod, xxx, True, True),...",[]
3,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-4,179298,completed,http://tile.loc.gov/image-services/iiif/servic...,[Mind] the youthful mind. Of how\r\nlittle av...,temperance,"[([, [, X, XX, dep, [, False, False), (Mind, m...","[(christian, 77, 86, NORP), (truth & sobernes...","[([, [, X, XX, dep, [, False, False), (Mind, m...","[(the, the, DET, DT, det, xxx, True, True), (O...","[(., ., PUNCT, ., punct, ., False, False), ( ,...",[],[]
4,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-5,179299,completed,http://tile.loc.gov/image-services/iiif/servic...,x\r\nWhile we labor to reclaim one generation ...,temperance,"[(x, x, ADP, IN, punct, x, True, False), (\r\n...","[(one, 29, 32, CARDINAL), (Legislature, 145, 1...","[(x, x, ADP, IN, punct, x, True, False), (\r\n...","[(While, while, SCONJ, IN, mark, Xxxxx, True, ...","[(\r\n, \r\n, SPACE, _SP, dep, \r\n, False, Fa...","[(one, one, NUM, CD, nummod, xxx, True, True)]",[]
5,Susan B. Anthony Papers,Speeches and other writings,Susan B. Anthony Papers: Speeches and Writings...,mss11049038,mss11049038-6,179300,completed,http://tile.loc.gov/image-services/iiif/servic...,Howard & their influence to help on the\r\nwor...,temperance,"[(Howard, Howard, PROPN, NNP, ROOT, Xxxxx, Tru...","[(Howard &, 0, 8, ORG), (State, 200, 205, ORG)...","[(Howard, Howard, PROPN, NNP, ROOT, Xxxxx, Tru...","[(their, their, PRON, PRP$, poss, xxxx, True, ...","[(\r\n, \r\n, SPACE, _SP, dep, \r\n, False, Fa...","[(50, 50, NUM, CD, nummod, dd, False, False), ...",[]


In [112]:
# Inspect the tokens placed in the `text` column for the first row
a['text'].iloc[0]

[('Susan', 'Susan', 'PROPN', 'NNP', 'compound', 'Xxxxx', True, False),
 ('B.', 'B.', 'PROPN', 'NNP', 'compound', 'X.', False, False),
 ('Anthony', 'Anthony', 'PROPN', 'NNP', 'compound', 'Xxxxx', True, False),
 ('     ', '     ', 'SPACE', '_SP', 'dep', '    ', False, False),
 ('SPEECHES', 'SPEECHES', 'PROPN', 'NNP', 'ROOT', 'XXXX', True, False),
 ('WRITINGS', 'writing', 'NOUN', 'NNS', 'conj', 'XXXX', True, False),
 ('FILE', 'file', 'ADP', 'IN', 'prep', 'XXXX', True, False),
 ('       ', '       ', 'SPACE', '_SP', 'dep', '    ', False, False),
 ('Delivered', 'deliver', 'VERB', 'VBN', 'acl', 'Xxxxx', True, False),
 ('\r\n                                                                                            ',
  '\r\n                                                                                            ',
  'SPACE',
  '_SP',
  'dep',
  '\r\n    ',
  False,
  False),
 ('Batavia', 'Batavia', 'PROPN', 'NNP', 'pobj', 'Xxxxx', True, False),
 ('(', '(', 'PUNCT', '-LRB-', 'punct', '(', 