In [33]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from collections import namedtuple
import logging
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm

In [2]:
# Resolve input/output folders relative to the notebook's working directory:
# both live next to it, under the parent directory.
project_root = Path.cwd().parent
data_dir = project_root / "data"
model_dir = project_root / "models"

In [27]:
# Load the complete tab-separated table; later cells filter it by its
# "congress" column.
source_path = data_dir / "cleaned_all_house.txt"
full_df = pd.read_csv(source_path, sep="\t")

In [74]:
class phraseIterator(object):
    """Re-iterable stream of token lists read from a tab-separated file.

    Each pass reopens ``inpath``, skips the first (header) line and yields,
    for every remaining line, the whitespace-split tokens of the second
    tab-separated field.  Because the file is reopened on every pass, one
    instance can be consumed multiple times.

    Parameters
    ----------
    inpath : str or path-like
        File to read.
    encoding : str, optional
        Text encoding used to open the file.  Defaults to ``"utf-8"`` (the
        default encoding written by ``DataFrame.to_csv``) rather than the
        platform-dependent locale encoding.

    Yields
    ------
    list of str
        Tokens of one line's second field (an empty list if the field is
        empty).

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two tab-separated fields.
    """

    def __init__(self, inpath, encoding="utf-8"):
        self.inpath = inpath
        self.encoding = encoding

    def __iter__(self):
        with open(self.inpath, 'r', encoding=self.encoding) as file:
            next(file, None)  # discard the header row (no-op on an empty file)
            for line in file:
                # A completely blank line carries no fields; skip it instead
                # of failing on the field lookup below.
                if not line.rstrip('\r\n'):
                    continue
                fields = line.split('\t')
                # str.split() with no argument also drops the trailing newline
                # when the text happens to be the last field on the line.
                yield fields[1].split()


In [75]:
class corpusIterator(object):
    """Re-iterable stream of ``TaggedDocument`` objects read from a TSV file.

    Each pass reopens ``inpath``, skips the first (header) line and, for
    every remaining line, yields one ``TaggedDocument`` whose words are the
    whitespace-split tokens of the second tab-separated field (optionally run
    through the phrase models) and whose single tag is the fourth field
    (the party label).

    Parameters
    ----------
    inpath : str or path-like
        File to read.
    bigram : optional
        Object supporting ``bigram[tokens]``; applied to the raw tokens.
    trigram : optional
        Object supporting ``trigram[tokens]``; applied on top of ``bigram``.
        It is ignored when ``bigram`` is not supplied.
    encoding : str, optional
        Text encoding used to open the file.  Defaults to ``"utf-8"`` (the
        default encoding written by ``DataFrame.to_csv``) rather than the
        platform-dependent locale encoding.

    Yields
    ------
    TaggedDocument
        ``TaggedDocument(words, [party])`` for each non-blank data line.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four tab-separated fields.
    """

    def __init__(self, inpath, bigram=None, trigram=None, encoding="utf-8"):
        self.inpath = inpath
        # Falsy phrase models are normalised to None, as before.
        self.bigram = bigram or None
        self.trigram = trigram or None
        self.encoding = encoding

    def __iter__(self):
        # Per-row values are kept in locals rather than on ``self`` so that
        # iterating does not mutate the iterator object.
        with open(self.inpath, 'r', encoding=self.encoding) as file:
            next(file, None)  # discard the header row (no-op on an empty file)
            for line in file:
                # A completely blank line carries no fields; skip it.
                if not line.rstrip('\r\n'):
                    continue
                fields = line.split('\t')
                tokens = fields[1].split()
                if self.bigram and self.trigram:
                    words = self.trigram[self.bigram[tokens]]
                elif self.bigram:
                    words = self.bigram[tokens]
                else:
                    # Also reached when only ``trigram`` was given.
                    words = tokens
                # Strip the newline in case the party is the last field.
                party = fields[3].replace('\n', '')
                yield TaggedDocument(words, [f"{party}"])


In [76]:
# Congress numbers to process, 97 through 114 inclusive.
congresses = list(range(97, 115))

In [77]:
# Accumulators appended to once per congress:
#   df_list    - per-word principal-component coordinates
#   year_means - per-party mean principal-component coordinates
df_list, year_means = [], []

In [78]:
# For each congress: learn phrase models, train a Doc2Vec model, fit a
# 2-component PCA on the document-tag vectors, then record (a) the mean PCA
# coordinates per party and (b) the PCA coordinates of the filtered vocabulary.
#
# NOTE(review): the sign of a principal component is arbitrary for each PCA
# fit, and Doc2Vec is trained here without a fixed seed and with several
# workers, so the direction of pc1 is not guaranteed to be comparable across
# congresses or across re-runs.
for congress in tqdm(congresses):
    temp_df = full_df[full_df["congress"] == congress]
    if temp_df.empty:
        # Nothing to train on; skip explicitly rather than letting the
        # vocabulary build fail on an empty corpus.
        logging.warning("No rows for congress %s; skipping.", congress)
        continue

    # The iterators stream from disk, so write this congress's rows out first.
    temp_df.to_csv(data_dir / "temp.txt", sep="\t", index=False)

    # Bigram phrases first, then a second pass over the bigrammed text.
    phrases = Phrases(phraseIterator(data_dir / "temp.txt"))
    bigram = Phraser(phrases)
    tphrases = Phrases(bigram[phraseIterator(data_dir / "temp.txt")])
    trigram = Phraser(tphrases)

    model0 = Doc2Vec(vector_size=200, window=20, min_count=50, workers=8, epochs=20)
    model0.build_vocab(corpusIterator(data_dir / "temp.txt", bigram=bigram, trigram=trigram), min_count=50)
    model0.train(corpusIterator(data_dir / "temp.txt", bigram=bigram, trigram=trigram),
                 total_examples=model0.corpus_count, epochs=model0.epochs)

    speaker_tags = model0.dv.index_to_key
    embeds = np.array([model0.dv[tag] for tag in speaker_tags])

    # Fit once, then project; the fitted ``pca`` is reused for the words below.
    pca = PCA(n_components=2)
    pca.fit(embeds)
    pca_df = pd.DataFrame(pca.transform(embeds), columns=["pc1", "pc2"])
    pca_df["tag"] = speaker_tags

    # Tags are either "<unique_id>_<party>_<congress>" or a bare party label
    # (which is what corpusIterator currently emits).  Only unpack three
    # columns when the split actually produced three.
    tag_parts = pca_df["tag"].str.split('_', n=2, expand=True)
    if tag_parts.shape[1] == 3:
        pca_df[["unique_id", "party", "congress"]] = tag_parts
    else:
        pca_df["party"] = pca_df["tag"]
        pca_df["congress"] = congress

    # Select the coordinate columns explicitly: the first positional argument
    # of GroupBy.mean is ``numeric_only``, not a column list.
    year_means.append(pca_df.groupby("party")[["pc1", "pc2"]].mean())

    # Vocabulary sorted by descending corpus frequency.
    wordlist = sorted(
        ((word, model0.wv.get_vecattr(word, "count")) for word in model0.wv.key_to_index),
        key=lambda tup: tup[1],
        reverse=True,
    )
    # Keep words with 100 < count < 1,000,000 and fewer than three underscores.
    sorted_vocab = [w for w, c in wordlist if 100 < c < 1000000 and w.count('_') < 3]

    # Project all kept word vectors in one call; guard the empty case because
    # there is nothing to stack or transform then.
    S = np.zeros((len(sorted_vocab), 2))
    if sorted_vocab:
        S[:] = pca.transform(np.vstack([model0.wv[w] for w in sorted_vocab]))

    temp = pd.DataFrame({'word': sorted_vocab, 'pc1': S[:, 0], 'pc2': S[:, 1]})
    temp["congress"] = congress

    df_list.append(temp)

  0%|          | 0/18 [04:38<?, ?it/s]


KeyboardInterrupt: 

In [65]:
# For every per-congress frame, keep the ten words with the lowest pc1
# ("left") and the ten with the highest pc1 ("right"), keyed by congress.
right_top_words = {}
left_top_words = {}

for df in df_list:
    if df.empty:
        # No words survived the vocabulary filter, so there is no congress
        # value to read and nothing to rank.
        continue
    # ``.iloc`` makes the positional intent explicit: after sorting, the
    # Series keeps its original integer labels, so a plain ``[:10]`` / ``[0]``
    # is ambiguous between labels and positions.
    l_top = df.sort_values("pc1")["word"].iloc[:10]
    r_top = df.sort_values("pc1", ascending=False)["word"].iloc[:10]
    congress = df["congress"].iloc[0]
    right_top_words[congress] = r_top
    left_top_words[congress] = l_top

  l_top = df.sort_values("pc1")["word"][:10]
  r_top = df.sort_values("pc1", ascending=False)["word"][:10]


In [107]:
# Inspect congress 114: the low-pc1 words are printed, the high-pc1 words are
# shown as the cell's result.
congress_to_show = 114
print(left_top_words[congress_to_show])
right_top_words[congress_to_show]

2444            bureaucracy
3255            bureaucrats
3106      rules_regulations
722               obamacare
2819              overreach
4041               red_tape
1823          west_virginia
3417        new_regulations
1784                   farm
4140    federal_regulations
Name: word, dtype: object


2482                         flint
1567             voting_rights_act
1602    congressional_black_caucus
1677                    zika_virus
877                  public_health
2291                 voting_rights
3257                    head_start
1334             republican_budget
2748                    fast_track
1608                  civil_rights
Name: word, dtype: object