In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from collections import namedtuple
import logging
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Input data and saved models live in sibling folders of the notebook's
# working directory (i.e. one level above the current working directory).
data_dir = Path.cwd().parent.joinpath("data")
model_dir = Path.cwd().parent.joinpath("models")

In [6]:
class phraseIterator(object):
    """Re-iterable stream of token lists read from a tab-separated file.

    Every call to ``__iter__`` reopens ``file_path``, skips the first line
    (the header with column names) and yields, for each remaining line, the
    whitespace-split tokens of its second tab-separated column.

    Lines without a second column (for example a blank line at the end of
    the file) are skipped rather than raising ``IndexError``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated input file.

    Yields
    ------
    list of str
        Tokens of the text column of one line.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def __iter__(self):
        with open(self.file_path, 'r') as file:
            # The first line is the header with column names, not data.
            next(file, None)
            for line in file:
                columns = line.split('\t')
                if len(columns) < 2:
                    # Blank or truncated line: there is no text column.
                    continue
                text = columns[1].replace('\n', '')
                yield text.split()


In [4]:
class corpusIterator(object):
    """Re-iterable stream of ``TaggedDocument`` objects read from a tab-separated file.

    Every call to ``__iter__`` reopens ``inpath``, skips the first line
    (treated as a header) and turns each remaining line into one
    ``TaggedDocument``:

    * words -- the whitespace-split tokens of the second column, optionally
      passed through ``bigram`` and then ``trigram``;
    * tags  -- ``[party, congress]``, taken from the fourth and fifth columns.

    The third column (the speaker) is not used. Lines with fewer than five
    columns (for example a blank line at the end of the file) are skipped
    rather than raising ``IndexError``.

    Parameters
    ----------
    inpath : str or path-like
        Location of the tab-separated input file.
    bigram : optional
        Object that transforms a token list via ``bigram[tokens]``.
    trigram : optional
        Object that transforms a token list via ``trigram[tokens]``. It is
        only applied on top of ``bigram``; a trigram given without a bigram
        is ignored.
    """

    def __init__(self, inpath, bigram=None, trigram=None):
        # Any falsy value is normalised to None, so __iter__ only needs a
        # truthiness check.
        self.bigram = bigram or None
        self.trigram = trigram or None
        self.inpath = inpath

    def __iter__(self):
        with open(self.inpath, 'r') as file:
            # The first line is the header, not data.
            next(file, None)
            for line in file:
                columns = line.split('\t')
                if len(columns) < 5:
                    # Blank or truncated line: text and/or tag columns are missing.
                    continue
                tokens = columns[1].replace('\n', '').split()
                if self.bigram and self.trigram:
                    words = self.trigram[self.bigram[tokens]]
                elif self.bigram:
                    words = self.bigram[tokens]
                else:
                    words = tokens
                party = columns[3]
                # The fifth column may be the last one on the line, so drop
                # its newline.
                congress = columns[4].replace("\n", "")
                yield TaggedDocument(words, [party, congress])


In [7]:
# First pass over the corpus: fit a phrase model on the raw token lists and
# wrap it in a Phraser so it can be applied to token lists.
phrases = Phrases(phraseIterator(data_dir.joinpath("cleaned_all_house.txt")))
bigram = Phraser(phrases)

# Second pass: fit another phrase model on the bigram-transformed corpus, so
# already-merged pairs can be joined with a further token.
tphrases = Phrases(bigram[phraseIterator(data_dir.joinpath("cleaned_all_house.txt"))])
trigram = Phraser(tphrases)

In [8]:
# Configure the Doc2Vec model; no corpus is passed here, so vocabulary
# building and training are run explicitly below.
model0 = Doc2Vec(
    vector_size=200,
    window=20,
    min_count=50,
    workers=8,
    epochs=20,
)

# Vocabulary pass over the phrase-transformed corpus. min_count is passed
# again here with the same value as in the constructor.
model0.build_vocab(
    corpusIterator(
        data_dir.joinpath("cleaned_all_house.txt"),
        bigram=bigram,
        trigram=trigram,
    ),
    min_count=50,
)

# Training pass: a fresh corpusIterator over the same file, with the document
# count and epoch count taken from the model itself.
model0.train(
    corpusIterator(
        data_dir.joinpath("cleaned_all_house.txt"),
        bigram=bigram,
        trigram=trigram,
    ),
    total_examples=model0.corpus_count,
    epochs=model0.epochs,
)

In [7]:
# Saving is disabled; uncomment the next line to write the trained model to the models folder.
#model0.save("../models/main_model_102_108")

In [23]:
# Sanity check: the ten vocabulary entries closest to 'democrat'.
model0.wv.most_similar(positive=['democrat'], topn=10)

[('republican', 0.6449437737464905),
 ('democrats', 0.6413997411727905),
 ('democratic', 0.608285665512085),
 ('liberal_democrat', 0.5163456201553345),
 ('conservative_republican', 0.5129528641700745),
 ('liberal_wing', 0.5101121664047241),
 ('liberal_democratic', 0.4927789568901062),
 ('publican', 0.4866924285888672),
 ('moderate_conservative', 0.4702413082122803),
 ('democrat_democrat', 0.46612730622291565)]