In [3]:
import csv
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup

In [4]:
# Listing page of presidential speeches, filtered by the president id in the query string.
url = 'https://millercenter.org/the-presidency/presidential-speeches?field_president_target_id[8396]=8396'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the listing.
page.raise_for_status()
# Name the parser explicitly ('html' lets bs4 pick whichever parser happens to be installed),
# matching the 'html.parser' used for the individual speech pages.
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Accumulates the raw transcript text of every speech scraped further down.
speeches = []
# Each "views-field-title" div on the listing page wraps the link to one speech.
# find_all is the current spelling; findAll is only kept as a legacy alias.
transcripts = soup.find_all("div", attrs={"class": "views-field-title"})

In [6]:
csv_file_path = 'speech_data.csv'
# Only the first 24 entries of the listing are scraped.
max_speeches = 24

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Title', 'Link', 'President', 'Date', 'Summary', 'Speech'])

    # Slice once instead of enumerating the whole listing and testing the index each time.
    for transcript in transcripts[:max_speeches]:
        # urljoin leaves absolute hrefs untouched and resolves relative ones against the listing URL.
        link_url = urljoin(url, transcript.find('a')['href'])
        link_response = requests.get(link_url, timeout=30)
        # Stop on an HTTP error rather than writing an error page into the corpus.
        link_response.raise_for_status()
        link_html_content = link_response.content
        link_soup = BeautifulSoup(link_html_content, 'html.parser')

        # Full speech text; also collected in `speeches`.
        link_text = link_soup.find("div", attrs={'class': "transcript-inner"}).text.strip()
        speeches.append(link_text)

        # Metadata: the title comes from the listing entry, the rest from the speech page.
        title = transcript.text.strip()
        link = link_url
        president = link_soup.find("p", attrs={'class': "president-name"}).text.strip()
        date = link_soup.find("p", attrs={'class': "episode-date"}).text.strip()
        summary = link_soup.find("div", attrs={'class': "about-sidebar--intro"}).text.strip()

        csv_writer.writerow([title, link, president, date, summary, link_text])

Rename the columns. (The date is reformatted later, when the corpus text files are written.)

In [7]:
# Load the scraped speeches and give three of the columns more descriptive names.
column_renames = {
    'Title': 'Speech Title',
    'Link': 'URL',
    'Speech': 'Transcript',
}
data = pd.read_csv('speech_data.csv').rename(columns=column_renames)

In [8]:
# Small English pipeline used for every annotation in this notebook.
nlp = spacy.load('en_core_web_sm')
# Parse a single speech for the token-level demonstration.
# The text is taken from the DataFrame rather than from `link_text`, a variable left over
# from the scraping loop that does not exist when the notebook starts from the saved CSV.
# The last row is the last speech written by that loop, i.e. the same transcript.
doc = nlp(data['Transcript'].iloc[-1])

In [9]:
# Run the spaCy pipeline over every transcript and keep the parsed result next to the text.
parsed_speeches = data['Transcript'].apply(nlp)
data['Doc'] = parsed_speeches

Create corpus files in the text format

In [10]:
# Write every transcript to its own UTF-8 text file named after the speech date (YYYYMMDD).
written_filenames = set()
for i, row in data.iterrows():
    # The Date column is parsed as e.g. "January 19, 2021".
    formatted_date = datetime.strptime(row['Date'], "%B %d, %Y").strftime("%Y%m%d")
    filename = f"transcript_{formatted_date}.txt"

    # Two speeches given on the same day would map to the same file and the later one would
    # silently overwrite the earlier; give the extra ones a numeric suffix instead.
    duplicate_number = 1
    while filename in written_filenames:
        duplicate_number += 1
        filename = f"transcript_{formatted_date}_{duplicate_number}.txt"
    written_filenames.add(filename)

    with open(filename, 'w', encoding='utf-8') as txt_file:
        txt_file.write(row['Transcript'])

In [16]:
# Show each token of the sample speech next to its coarse part-of-speech label.
for tok in doc:
    surface, coarse_pos = tok.text, tok.pos_
    print(surface, coarse_pos)

Transcript PROPN

 SPACE
Last ADJ
night NOUN
, PUNCT
the DET
United PROPN
States PROPN
brought VERB
the DET
world NOUN
’s PART
number NOUN
one NUM
terrorist ADJ
leader NOUN
to ADP
justice NOUN
. PUNCT
Abu PROPN
Bakr PROPN
al PROPN
- PUNCT
Baghdadi PROPN
is AUX
dead ADJ
. PUNCT
   SPACE
He PRON
was AUX
the DET
founder NOUN
and CCONJ
leader NOUN
of ADP
ISIS PROPN
, PUNCT
the DET
most ADV
ruthless ADJ
and CCONJ
violent ADJ
terror NOUN
organization NOUN
in ADP
the DET
World PROPN
. PUNCT
   SPACE
The DET
United PROPN
States PROPN
has AUX
been AUX
searching VERB
for ADP
Baghdadi PROPN
for ADP
many ADJ
years NOUN
. PUNCT
   SPACE
Capturing NOUN
or CCONJ
killing VERB
Baghdadi PROPN
has AUX
been AUX
the DET
top ADJ
national ADJ
security NOUN
priority NOUN
of ADP
my PRON
Administration PROPN
. PUNCT
   SPACE
U.S. PROPN
Special PROPN
Operations PROPN
forces NOUN
executed VERB
a DET
dangerous ADJ
and CCONJ
daring ADJ
nighttime ADJ
raid NOUN
into ADP
Northwestern PROPN
Syria PROPN
to PART
accompli

### Tokenization

Segment strings into individual words and punctuation markers

In [17]:
def get_token(doc):
    """Return the text of every token in ``doc``, in order, as a list of strings."""
    texts = []
    for token in doc:
        texts.append(token.text)
    return texts

In [19]:
# Store each speech's token strings in a new column, then preview the frame.
tokens_per_speech = data['Doc'].apply(get_token)
data['Tokens'] = tokens_per_speech
data.head()

Unnamed: 0,Speech Title,URL,President,Date,Summary,Transcript,Doc,POS,Proper_Nouns,Tokens
0,"January 19, 2021: Farewell Address",https://millercenter.org/the-presidency/presid...,Donald Trump,"January 19, 2021",President Donald Trump gives his farewell addr...,Transcript\nMy fellow Americans: Four years ag...,"(Transcript, \n, My, fellow, Americans, :, Fou...","[(PROPN, NNP), (SPACE, _SP), (PRON, PRP$), (AD...","[Transcript, Americans, America, Americans, Pr...","[Transcript, \n, My, fellow, Americans, :, Fou..."
1,"January 13, 2021: Statement about the Violence...",https://millercenter.org/the-presidency/presid...,Donald Trump,"January 13, 2021","A week after a mob stormed the US Capitol, Pre...","Transcript\nMy fellow Americans, \nI want to s...","(Transcript, \n, My, fellow, Americans, ,, \n...","[(PROPN, NNP), (SPACE, _SP), (PRON, PRP$), (AD...","[Transcript, Americans, US, Americans, America...","[Transcript, \n, My, fellow, Americans, ,, \n..."
2,"January 7, 2021: Message After Pro-Trump Mob O...",https://millercenter.org/the-presidency/presid...,Donald Trump,"January 07, 2021",The day after a mob overran the United States ...,Transcript\nI would like to begin by addressin...,"(Transcript, \n, I, would, like, to, begin, by...","[(PROPN, NNP), (SPACE, _SP), (PRON, PRP), (AUX...","[Transcript, United, States, Capitol, American...","[Transcript, \n, I, would, like, to, begin, by..."
3,"January 6, 2021: Speech Urging Supporters to G...",https://millercenter.org/the-presidency/presid...,Donald Trump,"January 06, 2021","On January 6, 2021, Trump supporters and white...",Transcript\nI know your pain. I know your hurt...,"(Transcript, \n, I, know, your, pain, ., I, kn...","[(PROPN, NNP), (SPACE, _SP), (PRON, PRP), (VER...",[Transcript],"[Transcript, \n, I, know, your, pain, ., I, kn..."
4,"November 5, 2020: Remarks on the 2020 Election",https://millercenter.org/the-presidency/presid...,Donald Trump,"November 05, 2020",President Trump makes remarks at the White Hou...,Transcript\nTHE PRESIDENT: Good evening. I’d l...,"(Transcript, \n, THE, PRESIDENT, :, Good, even...","[(PROPN, NNP), (SPACE, _SP), (DET, DT), (PROPN...","[Transcript, PRESIDENT, Florida, Iowa, Indiana...","[Transcript, \n, THE, PRESIDENT, :, Good, even..."


### Lemmatization

Retrieve the lemma (the dictionary or root form) of each word in the text

In [21]:
def get_lemma(doc):
    """Return the lemma of every token in ``doc``, in order, as a list of strings."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

# Lemmatize every parsed speech and keep the result as its own column.
lemmas_per_speech = data['Doc'].apply(get_lemma)
data['Lemmas'] = lemmas_per_speech

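We can compare counts of the word "write" in the original Tokens column and in the lemmatized Lemmas column. The lemma count is higher because inflected forms such as "wrote" or "written" all reduce to "write".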

In [22]:
# Word whose raw-token and lemma frequencies are compared. Keeping it in one variable
# guarantees that the printed label always names the word that is actually counted.
target_word = 'write'

# Occurrences per speech, summed over the whole corpus.
token_count = data['Tokens'].apply(lambda tokens: tokens.count(target_word)).sum()
lemma_count = data['Lemmas'].apply(lambda lemmas: lemmas.count(target_word)).sum()

print(f'"{target_word}" appears in the text tokens column {token_count} times.')
print(f'"{target_word}" appears in the lemmas column {lemma_count} times.')

"China" appears in the text tokens column 11times.
"China" appears in the lemmas column 41times.


## Text Annotation

### Parts-of-speech

Predict the simple universal part-of-speech of each token in a text

In [11]:
def get_pos(doc):
    """Return one ``(pos_, tag_)`` tuple per token in ``doc``, in order."""
    tagged = []
    for token in doc:
        pair = (token.pos_, token.tag_)
        tagged.append(pair)
    return tagged

# Tag every parsed speech and keep the (coarse, fine-grained) label pairs as a column.
pos_per_speech = data['Doc'].apply(get_pos)
data['POS'] = pos_per_speech

In [12]:
# Display the tag pairs of every speech as a plain Python list.
data['POS'].tolist()

[[('PROPN', 'NNP'),
  ('SPACE', '_SP'),
  ('PRON', 'PRP$'),
  ('ADJ', 'JJ'),
  ('PROPN', 'NNPS'),
  ('PUNCT', ':'),
  ('NUM', 'CD'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PUNCT', ','),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('DET', 'DT'),
  ('ADJ', 'JJ'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NN'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NN'),
  ('PUNCT', ','),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NN'),
  ('PUNCT', ','),
  ('CCONJ', 'CC'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NNS'),
  ('PUNCT', '.'),
  ('ADP', 'IN'),
  ('ADJ', 'JJ'),
  ('PUNCT', ','),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('PROPN', 'NNP'),
  ('ADJ', 'JJ'),
  ('ADV', 'RB'),
  ('PUNCT', ':'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNPS'),
 

In [13]:
# Look up spaCy's human-readable description of a tag seen in the output above.
tag_to_explain = "IN"
spacy.explain(tag_to_explain)

'conjunction, subordinating or preposition'

Get all the proper nouns

In [14]:
def extract_proper_nouns(doc):
    """Return the text of each token in ``doc`` whose ``pos_`` is ``'PROPN'``, in order."""
    proper_nouns = []
    for token in doc:
        if token.pos_ != 'PROPN':
            continue
        proper_nouns.append(token.text)
    return proper_nouns

# Collect the proper nouns of every parsed speech into their own column.
proper_nouns_per_speech = data['Doc'].apply(extract_proper_nouns)
data['Proper_Nouns'] = proper_nouns_per_speech

In [15]:
# Show the proper nouns found in the speeches at index labels 3 and 5.
data.loc[[3, 5], 'Proper_Nouns'].tolist()

[['Transcript'],
 ['Transcript',
  'PRESIDENT',
  'America',
  'United',
  'States',
  'Constitution',
  'Constitution',
  'Republic',
  'President',
  'Supreme',
  'Court',
  'October',
  'First',
  'Lady',
  'White',
  'House',
  'United',
  'States',
  'Supreme',
  'Court',
  'Justice',
  'Amy',
  'Coney',
  'Barrett',
  'Justice',
  'Barrett',
  'Court',
  'Americans',
  'Justice',
  'Clarence',
  'Thomas',
  'Senate',
  'Majority',
  'Leader',
  'Mitch',
  'McConnell',
  'Mitch',
  'Senate',
  'Judiciary',
  'Chairman',
  'Lindsey',
  'Graham',
  'Lindsey',
  'Senators',
  'Marsha',
  'Blackburn',
  'Mike',
  'Braun',
  'Bill',
  'Cassidy',
  'Kevin',
  'Cramer',
  'Ted',
  'Cruz',
  'Steve',
  'Daines',
  'Ron',
  'Johnson',
  'James',
  'Lankford',
  'Mike',
  'Lee',
  'Martha',
  'McSally',
  'Vice',
  'President',
  'Mike',
  'Pence',
  'Mike',
  'White',
  'House',
  'Counsel',
  'Pat',
  'Cipollone',
  'Pat',
  '.',
  'Justice',
  'Barrett',
  'Jesse',
  'Jesse',
  'Indiana'

Save the annotated corpus to a CSV file

In [None]:
# Persist the DataFrame, including every annotation column added above, without the index.
annotated_corpus_path = 'annotated_corpus.csv'
data.to_csv(
    annotated_corpus_path,
    encoding='utf-8',
    index=False,
)

print(f"Annotated corpus has been saved to {annotated_corpus_path}")

Annotated corpus has been saved to annotated_corpus.csv
