In [204]:
import pandas as pd
import numpy as np
import re

from string import punctuation
import spacy
from gensim.models.phrases import Phrases
from gensim import corpora
from gensim.models import CoherenceModel, ldamodel, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from tqdm import tqdm

from plotnine import ggplot, aes, geoms, theme
%matplotlib inline

In [93]:
# read raw data
df = pd.read_csv('Data/116th_nonLegislative.csv')

# omit procedural speeches
# str.match anchors at the start of the title, so each pattern acts as a
# title prefix. na=False treats a missing title as "no match", which keeps
# the mask strictly boolean (rows without a title are therefore kept --
# NOTE(review): confirm that is the desired handling).
is_procedural = (
    df.title.str.match("TRIBUTE", na=False)
    | df.title.str.match('The SPEAKER pro tempore.*', na=False)
    | df.title.str.match('SPECIAL ORDERS.*', na=False)
)
# `~` is the boolean-negation operator for a mask; unary `-` is arithmetic
# negation and is not a reliable way to invert booleans.
df = df.loc[~is_procedural]

In [94]:
def matched(tx):
    """
    Label a speech by the procedural phrase found in its text.

    Newlines are removed from `tx` before searching. The patterns are tried
    in order and the label of the first one that matches is returned:
    "one_minute", "special_order" or "morning_debate". When no pattern
    matches, the string "None" (not the None object) is returned.
    """
    flat = tx.replace('\n','')

    # (pattern, label) pairs, checked top to bottom; first hit wins.
    rules = (
        ('(asked and was given permission to address the House for 1 minute and to revise and extend (her|his) remarks)', "one_minute"),
        ("The SPEAKER pro tempore.* is recognized for 5 minutes", 'special_order'),
        ('The SPEAKER pro tempore.* morning hour debates for 5 minutes', 'morning_debate'),
    )
    for pattern, label in rules:
        if re.search(pattern, flat):
            return label
    return "None"
    
# Tag every speech with its category label, then keep a second frame that
# holds only the speeches that matched one of the known categories.
df['category'] = df.text.map(matched)
df_ = df.loc[df.category.ne('None')]

In [95]:
# Parse Speeches
# Raw string: `\]`, `\(` and `\)` are regex escapes, not Python string
# escapes, so the pattern must not be written as a plain string literal.
# Groups used below: 4 = text inside the parentheses before " asked"
# (the speaker), 6 = everything after the closing parenthesis (the speech).
speech_pattern = re.compile(r"(www.gpo.gov\] *)(.*)( *\()(.*)( asked .*\) *)(.*)")

one_minutes = []
for _, record in df.iterrows():
    # only one-minute speeches are parsed here
    if record['category'] != 'one_minute':
        continue

    text = record['text'].replace('\n','') # remove newlines

    # Named `hit` rather than `matched` so the matched() function defined
    # in an earlier cell is not overwritten and that cell stays re-runnable.
    hit = speech_pattern.search(text)
    if hit is None:
        continue

    speaker = hit.group(4)
    # drop the "<speaker>. " prefix(es) that introduce the spoken text
    speech = hit.group(6).replace(speaker + '. ','')
    one_minutes.append({
        "Title": record['title'].split(';')[0],
        "Speaker": speaker,
        "Speech": speech,
        'Category': record['category'],
        'date': record['date'],
    })

# Explicit columns keep the frame well-formed (and the later Category
# filter working) even when no speech could be parsed.
one_minutes = pd.DataFrame(
    one_minutes,
    columns=["Title", "Speaker", "Speech", "Category", "date"],
)

In [99]:
# Keep only the rows whose Category is 'one_minute'.
one_minutes = one_minutes.loc[one_minutes.Category.eq('one_minute')]

### Pre-processing

In [117]:
nlp = spacy.load("en_core_web_sm")

# Only tokens tagged with one of these parts of speech are kept; each kept
# token is replaced by its lemma.
content_pos = {'NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'}

# nlp.pipe processes the texts as a stream in batches and yields one Doc per
# input text, in order -- same result as calling nlp() per speech, but faster.
speeches = []
for doc in tqdm(nlp.pipe(one_minutes.Speech), total=len(one_minutes)):
    speeches.append([term.lemma_ for term in doc if term.pos_ in content_pos])


# TF-IDF over the lemmatised speeches: ignore terms found in fewer than
# 2 documents or in more than 35% of documents.
vectorizer = TfidfVectorizer(min_df=2,max_df=0.35,use_idf=True)
DF = vectorizer.fit_transform([' '.join(speech) for speech in speeches])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name so
# the cell still runs on older installs. list() keeps `vocab` a plain list,
# as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

100%|██████████| 2500/2500 [01:14<00:00, 33.60it/s]


## Get started with TC-W2V

In [206]:
# Load the word embeddings from the word2vec text-format file
# (binary=False selects the plain-text reader).
word_vectors = KeyedVectors.load_word2vec_format(
    'wiki_200/model.txt',
    binary=False,
)

### Run model with different numbers of topics

In [None]:
def run_model(components,ntop=10):
    model = NMF(n_components=25,init='nndsvda',max_iter=1000)
    W = model.fit_transform(DF)
    H = model.components_
    
    term_rankings = []
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index,:])[::-1]
        term_ranking = [terms[i] for i in top_indices[:ntop]]
        term_rankings.append(term_ranking)
    
    
    # COMPUTE W2V