# https://www.nltk.org/howto/corpus.html 
# https://github.com/divyanshrai/medium-code/blob/master/Class%201%20medium.ipynb

In [7]:
from nltk.corpus import stopwords 
from nltk.corpus import inaugural

In [5]:
import nltk

nltk.download('inaugural')
nltk.download('stopwords')

[nltk_data] Downloading package inaugural to
[nltk_data]     /Users/gavinreid/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gavinreid/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [43]:
files=inaugural.fileids()
print(len(files))
print(files[0], files[58])

59
1789-Washington.txt 2021-Biden.txt


## Helper Functions

In [28]:
import os
from nltk.tokenize import PunktSentenceTokenizer
from collections import Counter

In [65]:
def read_file(file_name):
 with open(file_name, 'r+', encoding='utf-8') as file:
   file_text = file.read()
 return file_text

In [20]:
def process_speeches(speeches):
 word_tokenized_speeches = list()
 for speech in speeches:
   sentence_tokenizer = PunktSentenceTokenizer()
   sentence_tokenized_speech = sentence_tokenizer.tokenize(speech)
   word_tokenized_sentences = list()
   for sentence in sentence_tokenized_speech:
     word_tokenized_sentence = [word.lower().strip('.').strip('?').strip('!') for word in sentence.replace(",","").replace("-"," ").replace(":","").split()]
     word_tokenized_sentences.append(word_tokenized_sentence)
   word_tokenized_speeches.append(word_tokenized_sentences)
 return word_tokenized_speeches

In [22]:
def merge_speeches(speeches):
 all_sentences = list()
 for speech in speeches:
   for sentence in speech:
     all_sentences.append(sentence)
 return all_sentences

In [24]:
def get_president_sentences(president):
 files = sorted([file for file in os.listdir() if president.lower() in file.lower()])
 speeches = [read_file(file) for file in files]
 processed_speeches = process_speeches(speeches)
 all_sentences = merge_speeches(processed_speeches)
 return all_sentences

In [26]:
def get_presidents_sentences(presidents):
 all_sentences = list()
 for president in presidents:
   files = sorted([file for file in os.listdir() if president.lower() in file.lower()])
   speeches = [read_file(file) for file in files]
   processed_speeches = process_speeches(speeches)
   all_prez_sentences = merge_speeches(processed_speeches)
   all_sentences.extend(all_prez_sentences)
 return all_sentences

In [30]:
def most_frequent_words(list_of_sentences):
 all_words = [word for sentence in list_of_sentences for word in sentence]
 return Counter(all_words).most_common()

## Presidential Speeches

In [39]:
import gensim
import spacy

In [59]:
print(inaugural.raw('1789-Washington.txt'))

Fellow-Citizens of the Senate and of the House of Representatives:

Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could not bu

In [79]:
all_sentences = inaugural.sents()
print(all_sentences[0])

['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':']


In [81]:
all_sentences_processed = process_speeches(all_sentences)

TypeError: expected string or bytes-like object, got 'list'

In [49]:
files = sorted([file for file in os.listdir() if file[-4:] == '.txt'])
print(files)

# read each speech file
speeches = [read_file(speech) for speech in files]

# preprocess each speech
processed_speeches = process_speeches(speeches)

# merge speeches
all_sentences = merge_speeches(processed_speeches)

# view most frequently used words
most_freq_words = most_frequent_words(all_sentences)
print(most_freq_words)

[]
[]


In [45]:



# create gensim model of all speeches
all_prez_embeddings = gensim.models.Word2Vec(all_sentences, size=96, window=5, min_count=1, workers=2, sg=1)


# view words similar to freedom
similar_to_freedom = all_prez_embeddings.most_similar("freedom", topn=20)
print(similar_to_freedom)


similar_to_terror = all_prez_embeddings.most_similar("terror", topn=20)
print(similar_to_terror)


# get President Roosevelt sentences
roosevelt_sentences = get_president_sentences("franklin-d-roosevelt")


# view most frequently used words of Roosevelt
roosevelt_most_freq_words = most_frequent_words(roosevelt_sentences)
#print(roosevelt_most_freq_words)


# create gensim model for Roosevelt
roosevelt_embeddings = gensim.models.Word2Vec(roosevelt_sentences, size=96, window=5, min_count=1, workers=2, sg=1)


# view words similar to freedom for Roosevelt
roosevelt_similar_to_freedom = roosevelt_embeddings.most_similar("freedom", topn=20)
print(roosevelt_similar_to_freedom)


# get sentences of multiple presidents
rushmore_prez_sentences = get_presidents_sentences(["washington","jefferson","lincoln","theodore-roosevelt"])


# view most frequently used words of presidents
rushmore_most_freq_words = most_frequent_words(rushmore_prez_sentences)
#print(rushmore_most_freq_words)


# create gensim model for the presidents
rushmore_embeddings = gensim.models.Word2Vec(rushmore_prez_sentences, size=96, window=5, min_count=1, workers=2, sg=1)


# view words similar to freedom for presidents
rushmore_similar_to_freedom = rushmore_embeddings.most_similar("freedom", topn=20)
print(rushmore_similar_to_freedom)


roosevelt_similar_to_god = roosevelt_embeddings.most_similar("liberty", topn=20)
print(roosevelt_similar_to_god)

[]


TypeError: Word2Vec.__init__() got an unexpected keyword argument 'size'