# Topic Modelling

In [119]:
import string

import numpy as np
import scipy
import pandas as pd
import re

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2csc
from sklearn.feature_extraction.text import CountVectorizer

import html

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [120]:
data = pd.read_json('venv/data/presidential_speeches.json') # load data
data.sort_values(by=['date'], inplace=True) # sort by data
data.reset_index(drop=True, inplace=True) # reset index

## Add relevant metadata

In [121]:
# sort presidents to their parties
president_list = []
for i in range(len(data['president'].unique())):
    president_list.append(data['president'].unique()[i])
parties_list= ['Federalist', 'Democratic-Republican', 'National Republican', 'Democratic', 'Whig', 'Republican', 'Democratic (Union)']
sort_party = [0, 0, 1, 1, 1, 2, 3, 3, 4, 4, 3, 4, 4, 5, 3, 3, 6, 5, 5, 5, 5, 3, 5, 5, 5, 5, 3, 5, 5, 5, 3, 3, 5, 5, 3, 3, 5, 5, 3, 5, 3, 5, 3, 5, 3]
party = []
for i in range(len(sort_party)):
    party.append(parties_list[sort_party[i]])
data['party'] = pd.Series(dtype='string')
for i in range(len(president_list)):
    data['party'][data['president'] == president_list[i]] = party[i]
# assign each speech its respective era
data['era'] = pd.Series(dtype='string')
era_list = ['Early Republic', 'Jacksonian Democracy', 'Sectional Conflict', 'Gilded Age', 'Progressive Era', 'Depression and World Conflict', 'Social Change and Soviet Relations', 'Globalization']
for i in range(len(data)):
    if data['date'][i] < pd.Timestamp('1829-01-01T12'):
        data['era'][i] =  era_list[0]
    if pd.Timestamp('1829-01-01T12') <= data['date'][i] < pd.Timestamp('1853-01-01T12'):
        data['era'][i] = era_list[1]
    elif pd.Timestamp('1853-01-01T12') <= data['date'][i] < pd.Timestamp('1881-01-01T12'):
        data['era'][i] = era_list[2]
    elif pd.Timestamp('1881-01-01T12') <= data['date'][i] < pd.Timestamp('1897-01-01T12'):
        data['era'][i] = era_list[3]
    elif pd.Timestamp('1897-01-01T12') <= data['date'][i] < pd.Timestamp('1921-01-01T12'):
        data['era'][i] = era_list[4]
    elif pd.Timestamp('1921-01-01T12') <= data['date'][i] < pd.Timestamp('1961-01-01T12'):
        data['era'][i] = era_list[5]
    elif pd.Timestamp('1961-01-01T12') <= data['date'][i] < pd.Timestamp('1989-01-01T12'):
        data['era'][i] = era_list[6]
    elif pd.Timestamp('1989-01-01T12') <= data['date'][i]:
        data['era'][i] = era_list[7]

In [122]:
data['transcript'].replace(to_replace='(Applause.)', regex=True, value='', inplace=True)
data['transcript'].replace(to_replace='(Laughter.)', regex=True, value='', inplace=True)

In [123]:
for i in range(len(data)):
    data['transcript'][i] = data['transcript'][i].replace('\'', '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['transcript'][i] = data['transcript'][i].replace('\'', ' ')


## Preprocessing

The Following section is code written by Srikanth Shenoy and can be found under https://towardsdatascience.com/elegant-text-pre-processing-with-nltk-in-sklearn-pipeline-d6fe18b91eb8 (accessed on 02/10/2023)

In [124]:
# remove stopwords, make lowercase, tokenize, lemmatize
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
custom_words = {'q'}
stop_words = set(stopwords.words('english'))
stop_words.update(custom_words)

def preprocess(text):
    text = sent_tokenize(text)
    for sen in text:
        sen = sen.lower()
        
    #words = pos_tag(word_tokenize(text))
    words = word_tokenize(sent_tokenize(text))
    words = [word for word in words if word not in stop_words and word.isalpha()]
    words = pos_tag(words)
    
    #for word, tag in words:
    #print(words)
    #words = [lemmatizer.lemmatize(word) for word in words[0] if word not in stop_words and word.isalpha()]
    return words



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [125]:
import string
import re
import contractions

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup
from textblob import TextBlob
from unidecode import unidecode

In [126]:
def lemmatize_pos_tagged_text(text, lemmatizer, pos_tag_dict):
  sentences = nltk.sent_tokenize(text)
  new_sentences = []

  for sentence in sentences:
    sentence = sentence.lower()
    new_sentence_words = []
    #one pos_tuple for sentence
    pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence)) 

    for word_idx, word in enumerate(nltk.word_tokenize(sentence)):
      nltk_word_pos = pos_tuples[word_idx][1]
      wordnet_word_pos = pos_tag_dict.get(
                          nltk_word_pos[0].upper(), None)
      if wordnet_word_pos is not None:
        new_word = lemmatizer.lemmatize(word, wordnet_word_pos)
      else:
        new_word = lemmatizer.lemmatize(word)

      new_sentence_words.append(new_word)

    new_sentence = " ".join(new_sentence_words)
    new_sentences.append(new_sentence)

  return " ".join(new_sentences)

In [127]:
def download_if_non_existent(res_path, res_name):
  try:
    nltk.data.find(res_path)
  except LookupError:
    print(f'resource {res_path} not found. Downloading now...')
    nltk.download(res_name)
class NltkPreprocessingSteps:
  def __init__(self, X):
    self.X = X
    download_if_non_existent('corpora/stopwords', 'stopwords')
    download_if_non_existent('tokenizers/punkt', 'punkt')
    download_if_non_existent('taggers/averaged_perceptron_tagger',
                             'averaged_perceptron_tagger')
    download_if_non_existent('corpora/wordnet', 'wordnet')
    download_if_non_existent('corpora/omw-1.4', 'omw-1.4')

    self.sw_nltk = stopwords.words('english')
    new_stopwords = ['<*>']
    self.sw_nltk.extend(new_stopwords)
    self.sw_nltk.remove('not')

    self.pos_tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

    # '!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~' 32 punctuations in python
    # we dont want to replace . first time around
    self.remove_punctuations = string.punctuation.replace('.','')

  def remove_html_tags(self):
    self.X = self.X.apply(
            lambda x: BeautifulSoup(x, 'html.parser').get_text())
    return self

  def replace_diacritics(self):
    self.X = self.X.apply(
            lambda x: unidecode(x, errors="preserve"))
    return self

  def to_lower(self):
    self.X = np.apply_along_axis(lambda x: x.lower(), self.X)
    return self

  def expand_contractions(self):
    self.X = self.X.apply(
            lambda x: " ".join([contractions.fix(expanded_word) 
                        for expanded_word in x.split()]))
    return self

  def remove_numbers(self):
    self.X = self.X.apply(lambda x: re.sub(r'\d+', '', x))
    return self

  def replace_dots_with_spaces(self):
    self.X = self.X.apply(lambda x: re.sub("[.]", " ", x))
    return self

  def remove_punctuations_except_periods(self):
    self.X = self.X.apply(
                 lambda x: re.sub('[%s]' %
                  re.escape(self.remove_punctuations), '' , x))
    return self

  def remove_all_punctuations(self):
    self.X = self.X.apply(lambda x: re.sub('[%s]' %
                          re.escape(string.punctuation), '' , x))
    return self

  def remove_double_spaces(self):
    self.X = self.X.apply(lambda x: re.sub(' +', ' ', x))
    return self

  def fix_typos(self):
    self.X = self.X.apply(lambda x: str(TextBlob(x).correct()))
    return self

  def remove_stopwords(self):
    # remove stop words from token list in each column
    self.X = self.X.apply(
            lambda x: " ".join([ word for word in x.split() 
                     if word not in self.sw_nltk]) )
    return self

  def lemmatize(self):
    lemmatizer = WordNetLemmatizer()
    self.X = self.X.apply(lambda x: lemmatize_pos_tagged_text(
                           x, lemmatizer, self.pos_tag_dict))
    return self

  def get_processed_text(self):
    return self.X

In [None]:
txt_preproc = NltkPreprocessingSteps(data['transcript'])
processed_text = \
    txt_preproc \
    .remove_html_tags()\
    .replace_diacritics()\
    .expand_contractions()\
    .remove_numbers()\
    .fix_typos()\
    .remove_punctuations_except_periods()\
    .lemmatize()\
    .remove_double_spaces()\
    .remove_all_punctuations()\
    .remove_stopwords()\
    .get_processed_text()

resource corpora/wordnet not found. Downloading now...
resource corpora/omw-1.4 not found. Downloading now...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Enno\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [178]:
dictionary = Dictionary(preprocessed)
dictionary.filter_extremes(no_below=5, no_above=0.5)

bow_corpus = [dictionary.doc2bow(text) for text in preprocessed] 

In [179]:
num_topics = 10
chunksize = 5000
passes = 1
iterations = 400
eval_every = None

model = gensim.models.LdaModel(
corpus=bow_corpus,
id2word=dictionary,
chunksize=chunksize,
alpha='auto',
eta='auto',
iterations=iterations,
num_topics=num_topics,
passes=passes,
eval_every=eval_every
)
topics = model.show_topics(
num_topics=num_topics,
num_words=10,
log=False,
formatted=False)

In [180]:
for topic_id, topic in topics:
    print('topic: {}'.format(topic_id))
    print('Words: {}'.format([word for word, _ in topic]))

topic: 0
Words: ['going', 'program', 'help', 'applause', 'problem', 'federal', 'get', 'administration', 'south', 'child']
topic: 1
Words: ['applause', 'help', 'energy', 'iraq', 'constitution', 'man', 'family', 'back', 'problem', 'going']
topic: 2
Words: ['subject', 'increase', 'department', 'tax', 'report', 'trade', 'business', 'going', 'federal', 'provision']
topic: 3
Words: ['applause', 'treaty', 'report', 'person', 'secretary', 'job', 'get', 'department', 'going', 'territory']
topic: 4
Words: ['case', 'subject', 'treaty', 'money', 'tax', 'mexico', 'treasury', 'bank', 'amount', 'constitution']
topic: 5
Words: ['going', 'get', 'child', 'bill', 'business', 'man', 'party', 'treaty', 'family', 'federal']
topic: 6
Words: ['constitution', 'subject', 'department', 'officer', 'treaty', 'condition', 'court', 'secretary', 'commerce', 'trade']
topic: 7
Words: ['going', 'bank', 'help', 'job', 'get', 'governor', 'program', 'family', 'thank', 'back']
topic: 8
Words: ['tax', 'federal', 'business', 

In [70]:
pp_test = data.loc[data['president'] == 'Donald Trump']['transcript'].apply(preprocess)

In [81]:
pp_test[995][0][0]

'the'

In [52]:
dictionary = Dictionary(pp_test)
dictionary.filter_extremes(no_below=5, no_above=0.5)

bow_corpus = [dictionary.doc2bow(text) for text in pp_test] 
num_topics = 10
chunksize = 10000
passes = 1
iterations = 400
eval_every = None

model = gensim.models.LdaModel(
corpus=bow_corpus,
id2word=dictionary,
chunksize=chunksize,
alpha='auto',
eta='auto',
iterations=iterations,
num_topics=num_topics,
passes=passes,
eval_every=eval_every
)
topics = model.show_topics(
num_topics=num_topics,
num_words=5,
log=False,
formatted=False)

In [53]:
for topic_id, topic in topics:
    print('topic: {}'.format(topic_id))
    print('Words: {}'.format([word for word, _ in topic]))

topic: 0
Words: ['virus', 'governor', 'talking', 'yeah', 'ahead']
topic: 1
Words: ['tax', 'trump', 'reform', 'tonight', 'governor']
topic: 2
Words: ['immigration', 'tax', 'wall', 'reform', 'guy']
topic: 3
Words: ['biden', 'iran', 'trump', 'money', 'guy']
topic: 4
Words: ['governor', 'ahead', 'test', 'virus', 'question']
topic: 5
Words: ['test', 'okay', 'guy', 'money', 'tax']
topic: 6
Words: ['boy', 'trump', 'tonight', 'usa', 'violence']
topic: 7
Words: ['vote', 'money', 'okay', 'talking', 'wall']
topic: 8
Words: ['governor', 'test', 'ahead', 'virus', 'testing']
topic: 9
Words: ['tonight', 'approved', 'judge', 'death', 'tax']


## Sentiment Analysis

In [48]:
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis",model="j-hartmann/sentiment-roberta-large-english-3-classes", top_k=1)

Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
pp_test[986]

['chief',
 'justice',
 'robert',
 'president',
 'carter',
 'president',
 'clinton',
 'president',
 'bush',
 'president',
 'obama',
 'fellow',
 'american',
 'people',
 'world',
 'thank',
 'citizen',
 'america',
 'joined',
 'great',
 'national',
 'effort',
 'rebuild',
 'country',
 'restore',
 'promise',
 'people',
 'together',
 'determine',
 'course',
 'america',
 'world',
 'year',
 'come',
 'face',
 'challenge',
 'confront',
 'hardship',
 'get',
 'job',
 'done',
 'every',
 'four',
 'year',
 'gather',
 'step',
 'carry',
 'orderly',
 'peaceful',
 'transfer',
 'power',
 'grateful',
 'president',
 'obama',
 'first',
 'lady',
 'michelle',
 'obama',
 'gracious',
 'aid',
 'throughout',
 'transition',
 'magnificent',
 'today',
 'ceremony',
 'however',
 'special',
 'meaning',
 'today',
 'merely',
 'transferring',
 'power',
 'one',
 'administration',
 'another',
 'one',
 'party',
 'another',
 'transferring',
 'power',
 'washington',
 'giving',
 'back',
 'american',
 'people',
 'long',
 'small',
 

In [96]:
sentiment_analysis('You are cool')

[[{'label': 'positive', 'score': 0.9995902180671692}]]

In [102]:
from nltk.tokenize import sent_tokenize
tokenized_text=sent_tokenize(data['transcript'][950])
word_tokenize(tokenized_text)

TypeError: expected string or bytes-like object

In [23]:
data['transcript'][950]

'Thank you very much. Everybody, please have a seat. Thank you very much. Well, thank you. It is good to be back. () It is good to be back in New York, it is good to be back in the Great Hall at Cooper Union. () We’ve got some special guests here that I want to acknowledge. Congresswoman Carolyn Maloney is here in the house. () Governor David Paterson is here. () Attorney General Andrew Cuomo. () State Comptroller Thomas DiNapoli is here. () The Mayor of New York City, Michael Bloomberg. () Dr. George Campbell, Jr., president of Cooper Union. () And all the citywide elected officials who are here. Thank you very much for your attendance. It is wonderful to be back in Cooper Union, where generations of leaders and citizens have come to defend their ideas and contest their differences. It’s also good to be back in Lower Manhattan, a few blocks from Wall Street. () It really is good to be back, because Wall Street is the heart of our nation’s financial sector. Now, since I last spoke here