In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from functools import reduce
import re

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from collections import namedtuple
import logging

from sklearn.decomposition import PCA

In [None]:
# define function to read in speech files line-by-line to deal with corrupted lines
def file_load(file_path, encoding="cp1252"):
    """Read a pipe-delimited ``speech_id|speech`` text file into a DataFrame.

    The file is parsed manually, one line at a time, instead of with
    ``pd.read_csv``. The first line is treated as a header and skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the speeches file.
    encoding : str, default "cp1252"
        Text encoding of the file. The previous hard-coded value "ansi" is a
        codec alias that only exists in Python on Windows; "cp1252" is the
        portable name of the Western-European Windows code page.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``speech_id`` (int) and ``speech`` (str, with newline and
        tab characters removed).

    Raises
    ------
    ValueError
        If a non-blank line has no ``|`` separator or its id is not an integer.
    """
    speech_ids = []
    speeches = []
    # errors="replace": a stray undecodable byte should not abort the whole load
    with open(file_path, "r", encoding=encoding, errors="replace") as file:
        for line_no, line in enumerate(file):
            # line 0 is the header; blank lines (e.g. at end of file) carry no record
            if line_no == 0 or not line.strip():
                continue
            # split on the FIRST pipe only, so a pipe inside the speech text is kept
            speech_id, sep, speech = line.partition("|")
            if not sep:
                raise ValueError(f"{file_path}: line {line_no + 1} has no '|' separator")
            speech_ids.append(int(speech_id))
            # tabs are removed so the text can later be written to a tab-separated file
            speeches.append(speech.replace("\n", "").replace("\t", ""))
    return pd.DataFrame({"speech_id": speech_ids, "speech": speeches})


In [None]:
# define list of congresses we want to retrieve speeches for
congresses = [102, 103, 104, 105, 106, 107, 108]

# NOTE(review): `data_dir` is used below but is not defined in any cell of this notebook as shown;
# it must be a pathlib.Path pointing at the data folder and be defined before this cell runs.

# initialise empty lists to hold dfs of speeches and speakers
df_list = []
speakers_df_list = []

for congress in congresses:
    # read in speeches and speakers for each congress in congresses
    speeches = file_load(data_dir / f"hein-daily/speeches_{congress}.txt")
    # "cp1252" replaces the Windows-only codec alias "ANSI" so the cell also runs on Linux/macOS
    speakers = pd.read_csv(
        data_dir / f"hein-daily/{congress}_SpeakerMap.txt", delimiter="|", encoding="cp1252"
    )

    # merge speeches and speakers together (so we have a speaker for each speech);
    # rows with any missing value are dropped
    merged = speeches.merge(speakers, how="inner", on="speech_id").dropna()

    # filter to only speeches in the house and only voting representatives
    house = merged[(merged["chamber"] == "H") & (merged["nonvoting"] == "voting")].copy()
    house["congress"] = str(congress)

    # get speech-invariant speaker characteristics and drop duplicates to get a dataframe
    # with one observation of each speaker per Congress
    house_speakers = (
        speakers.loc[
            (speakers["chamber"] == "H") & (speakers["nonvoting"] == "voting"),
            ["speakerid", "lastname", "firstname", "state", "district"],
        ]
        .assign(speakerid=lambda frame: frame["speakerid"].map(str))
        .drop_duplicates(ignore_index=True)
    )

    # add to list of dfs; the deduplicated House speaker table is stored here
    # (previously the raw per-speech `speakers` frame was appended, discarding house_speakers)
    df_list.append(house)
    speakers_df_list.append(house_speakers)

# concatenate the lists so we have one dataframe with all speeches of interest and one with all speakers
final_df = pd.concat(df_list, ignore_index=True)
speakers_df = pd.concat(speakers_df_list, ignore_index=True)

In [None]:
# write the combined House speeches as tab-separated text; file_load removes tab characters from
# the speech text, so the speech column itself cannot introduce stray delimiters
final_df.to_csv(data_dir / "house_102_to_108.txt", sep="\t", index=False)
# write the combined speaker table as a comma-separated file
speakers_df.to_csv(data_dir / "speakers_102_to_108.csv", index=False)

In [None]:
# CONTRACTIONS AND PROCEDURAL WORDS LISTS DRAWN FROM RHEAULT AND COCHRANE (2020)

# Define dictionary of contractions to map to whole words.
# Keys are lower-case contractions, values are their expansions. clean_text applies the entries one
# after another with str.replace, in the dictionary's insertion order, so the order of entries matters.
contractions_map = {"you'd": 'you would', "he'd": 'he would', "she's": 'she is', "where'd": 'where did', "might've": 'might have', \
                "he'll": 'he will', "they'll": 'they will',  "mightn't": 'might not', "you'd've": 'you would have', "shan't": 'shall not', \
                "it'll": 'it will', "mayn't": 'may not', "couldn't": 'could not', "they'd": 'they would', "so've": 'so have', \
                "needn't've": 'need not have', "they'll've": 'they will have', "it's": 'it is', "haven't": 'have not', "didn't": 'did not', \
                "y'all'd": 'you all would', "needn't": 'need not', "who'll": 'who will', "wouldn't've": 'would not have', "when's": 'when is', \
                "will've": 'will have', "it'd've": 'it would have', "what'll": 'what will', "that'd've": 'that would have', \
                "y'all're": 'you all are', "let's": 'let us', "where've": 'where have', "o'clock": 'oclock', "when've": 'when have', \
                "what're": 'what are', "should've": 'should have', "you've": 'you have', "they're": 'they are', "aren't": 'are not', \
                "they've": 'they have', "it'd": 'it would', "i'll've": 'i will have', "they'd've": 'they would have', "you'll've": 'you will have', \
                "wouldn't": 'would not', "we'd": 'we would', "hadn't've": 'had not have', "weren't": 'were not', "i'd": 'i would', \
                "must've": 'must have', "what's": 'what is', "mustn't've": 'must not have', "what'll've": 'what will have', "ain't": 'aint', \
                "doesn't": 'does not', "we'll": 'we will', "i'd've": 'i would have', "we've": 'we have', "oughtn't": 'ought not', \
                "you're": 'you are', "who'll've": 'who will have', "shouldn't": 'should not', "can't've": 'cannot have', "i've": 'i have', \
                "couldn't've": 'could not have', "why've": 'why have', "what've": 'what have', "can't": 'cannot', "don't": 'do not', \
                "that'd": 'that would', "who's": 'who is', "would've": 'would have', "there'd": 'there would', "shouldn't've": 'should not have', \
                "y'all": 'you all', "mustn't": 'must not', "she'll": 'she will', "hadn't": 'had not', "won't've": 'will not have', \
                "why's": 'why is', "'cause": 'because', "wasn't": 'was not', "shan't've": 'shall not have', "ma'am": 'madam', "hasn't": 'has not', \
                "to've": 'to have', "how'll": 'how will', "oughtn't've": 'ought not have', "he'll've": 'he will have', "we'd've": 'we would have', \
                "won't": 'will not', "could've": 'could have', "isn't": 'is not', "she'll've": 'she will have', "we'll've": 'we will have', \
                "you'll": 'you will', "who've": 'who have', "there's": 'there is', "y'all've": 'you all have', "we're": 'we are', "i'll": 'i will', \
                "i'm": 'i am', "how's": 'how is', "she'd've": 'she would have', "sha'n't": 'shall not', "there'd've": 'there would have', \
                "he's": 'he is', "it'll've": 'it will have', "that's": 'that is', "y'all'd've": 'you all would have', "he'd've": 'he would have', \
                "how'd": 'how did', "where's": 'where is', "so's": 'so as', "she'd": 'she would', "mightn't've": 'might not have'}

# define list of procedural words to remove as stop words (in addition to the generic English stop words below)
procedural_words = ["member","members","president",
    "hon","parliament","house","ask","asked","asks","question","questioned","questions","bills","bill",
    "party","parties","mp","mps","sir","madam","mr","gentleman","gentlemen","lady","ladies",
    "speaker","chair","motion","motions","vote","votes","order","yes","deputy","secretary",
    "chairman","chairwoman",
    "america","usa","american","americans",
    "pursuant","supply","supplementary","please","friend","s",
    "clause","amendment","i","ii","iii","section","sections", "colleague", "colleagues"]

# generic English stop words from NLTK (the NLTK "stopwords" corpus must be available locally)
stop_words = stopwords.words("english")

In [None]:
# define function to clean speech text; cleaning functions drawn from Rheault and Cochrane
def clean_text(text):
    """Normalise one speech and return it as a single space-separated string.

    Steps, in order: lower-case, expand contractions using ``contractions_map``,
    turn tab/newline/carriage-return characters into spaces, delete ASCII
    punctuation, tokenize with ``nltk.word_tokenize``, then drop stop words,
    procedural words, tokens of two characters or fewer, and all-digit tokens.

    Parameters
    ----------
    text : str
        Raw speech text. Any non-string value (e.g. the NaN that pandas
        produces when an empty speech is read back from a CSV file, or None)
        is treated as an empty speech.

    Returns
    -------
    str
        The retained tokens joined by single spaces; "" when nothing remains.
    """
    # guard: .lower() below would raise AttributeError on NaN / None
    if not isinstance(text, str):
        return ""

    # convert to lower case
    text = text.lower()
    # replace contractions with whole words (applied sequentially, in dictionary order)
    text = reduce(lambda c, k_v: c.replace(*k_v), contractions_map.items(), text)
    # replace tab / newline / carriage-return characters with spaces
    text = re.sub(r"[\t\n\r]", " ", text)
    # remove punctuation (characters are deleted, not replaced by spaces)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # tokenize using nltk.word_tokenize
    tokens = word_tokenize(text)

    # one set for both word lists: constant-time membership instead of scanning two lists per token
    excluded = set(stop_words).union(procedural_words)

    # remove stop words, procedural words, words of two characters or fewer, and digits
    # (the length filter also removes any blank token)
    tokens = [
        word for word in tokens
        if word not in excluded and len(word) > 2 and not word.isdigit()
    ]

    # return cleaned text joined together as a string
    return " ".join(tokens)

In [None]:
# define function to extract just unique_id component of speakerid (from identifiers for Congress and chamber)
def extract_speaker(speakerid):
    """Return characters 3-7 (zero-based, inclusive) of ``speakerid`` as a string.

    The argument is converted with ``str`` first, so integer ids are accepted.
    If the id has fewer than eight characters the result is correspondingly
    shorter (possibly empty).
    """
    # a plain slice yields the same characters as joining the sliced character list
    return str(speakerid)[3:8]

In [None]:
# reload the merged House speeches from the tab-separated file written by the export cell above,
# so the cleaning steps below start from the saved file rather than from in-memory state
house_102_to_108 = pd.read_csv(data_dir / "house_102_to_108.txt", sep="\t")

In [None]:
# descriptive statistics on speeches per congress: count speeches for every (congress, speaker)
# pair, then summarise those per-speaker counts within each congress and save the table
(
    house_102_to_108
    .groupby(["congress", "speakerid"])
    .size()
    .groupby("congress")
    .describe()
    .to_csv(data_dir / "speech_stats.csv")
)

In [None]:
# add two derived columns to every speech: the cleaned text and the speaker's unique id
house_102_to_108 = house_102_to_108.assign(
    cleaned_speech=house_102_to_108["speech"].map(clean_text),
    cleaned_speaker=house_102_to_108["speakerid"].map(extract_speaker),
)

In [None]:
# keep only the columns needed for modelling and write them out tab-separated;
# get_phrases / get_document read this file by column position, so the column order matters
house_102_to_108 = house_102_to_108.loc[
    :, ["speech_id", "cleaned_speech", "cleaned_speaker", "party", "congress"]
]
house_102_to_108.to_csv(data_dir / "cleaned_house_102_to_108.txt", index=False, sep="\t")

In [None]:
# code drawn from Rheault and Cochrane (2020)
class get_phrases(object):
    """Re-iterable stream of token lists read from a tab-separated speech file.

    Every iteration re-opens the file, skips the header line, and yields the
    second column (the speech text) of each following line split on whitespace.
    Because ``__iter__`` starts from the top of the file each time, one
    instance can be iterated repeatedly.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated file whose second column holds the speech text.
    encoding : str, default "utf-8"
        Encoding used to read the file. It was previously left to the platform
        default, which is not UTF-8 on every system, while pandas ``to_csv``
        writes UTF-8 unless told otherwise.
    """

    def __init__(self, file_path, encoding="utf-8"):
        self.file_path = file_path
        self.encoding = encoding

    def __iter__(self):
        with open(self.file_path, "r", encoding=self.encoding) as file:
            for i, line in enumerate(file):
                # skip first line - is header with column names
                if i == 0:
                    continue

                # split line by tab character
                line_split = line.rstrip("\n").split("\t")

                # yield speech text as a list of words (an empty speech yields [])
                yield line_split[1].split()

                
class get_document(object):
    """Re-iterable stream of ``TaggedDocument`` objects for Doc2Vec.

    Every iteration re-opens the tab-separated file, skips the header line, and
    for each following line applies the bigram and then the trigram phrase
    model to the speech tokens (column 1), tagging the result with
    ``"{speaker}_{party}_{congress}"`` built from columns 2, 3 and 4.

    Parameters
    ----------
    file_path : str or path-like
        Tab-separated file; columns are read by position as described above.
    bigram, trigram
        Phrase models applied via indexing (``model[tokens]``), bigram first.
    encoding : str, default "utf-8"
        Encoding used to read the file. It was previously left to the platform
        default, which is not UTF-8 on every system, while pandas ``to_csv``
        writes UTF-8 unless told otherwise.
    """

    def __init__(self, file_path, bigram, trigram, encoding="utf-8"):
        self.bigram = bigram
        self.trigram = trigram
        self.file_path = file_path
        self.encoding = encoding

    def __iter__(self):
        with open(self.file_path, "r", encoding=self.encoding) as file:
            for i, line in enumerate(file):

                # skip first line - is header with column names
                if i == 0:
                    continue

                # split line by tab character (trailing newline removed first)
                ls = line.rstrip("\n").split("\t")

                # get speech text and merge detected bigrams, then trigrams
                tokens = ls[1].split()
                # the most recent words/tags are kept on the instance, as in the original code
                self.words = self.trigram[self.bigram[tokens]]

                # create legislator-congress tag, party is included for later analysis but has no effect
                speaker, party, congress = ls[2], ls[3], ls[4]
                self.tags = [f"{speaker}_{party}_{congress}"]

                # yield the tagged document
                yield TaggedDocument(self.words, self.tags)
               
            

In [None]:
# first pass: fit a phrase (collocation) model on the cleaned speech tokens
phrases = Phrases(get_phrases(data_dir / "cleaned_house_102_to_108.txt"))
# wrap the fitted model in a Phraser, which is what gets applied to token lists
bigram = Phraser(phrases)
# second pass: fit another phrase model on the bigram-transformed stream, so an already merged
# pair can combine with a neighbouring token
tphrases = Phrases(bigram[get_phrases(data_dir / "cleaned_house_102_to_108.txt")])
trigram = Phraser(tphrases)

In [None]:
# configure the Doc2Vec model, build its vocabulary from the tagged speeches, then train it.
# A fresh get_document stream is created for each pass over the corpus.
model = Doc2Vec(
    vector_size=200,
    window=20,
    min_count=50,
    alpha=0.025,
    epochs=20,
    workers=8,
)
model.build_vocab(
    get_document(data_dir / "cleaned_house_102_to_108.txt", bigram=bigram, trigram=trigram),
    min_count=50,
)
model.train(
    get_document(data_dir / "cleaned_house_102_to_108.txt", bigram=bigram, trigram=trigram),
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# persist the trained Doc2Vec model to disk (the path is relative to the current working directory)
model.save("../models/main_model_102_108")

In [None]:
# reload the saved model from the path written by the save cell above, so the analysis cells
# below can run without retraining
model = Doc2Vec.load("../models/main_model_102_108")

In [None]:
# all document tags known to the model (one per legislator-party-congress combination)
speaker_tags = model.dv.index_to_key

# look up the embedding of every tag, in the same order, and stack them into one array
# (these tag embeddings are the main parameters of interest)
embeds = np.array(list(map(model.dv.__getitem__, speaker_tags)))

In [None]:
# reduce the legislator-session embeddings to two principal components
pca = PCA(n_components=2)

# fit and project in one step, label the two components, and attach the
# legislator-session tag belonging to each row
pca_df = pd.DataFrame(
    pca.fit_transform(embeds),
    columns=["pc1", "pc2"],
).assign(tag=speaker_tags)

In [None]:
# split tag into ID, party, congress components (tags have the form "<unique_id>_<party>_<congress>")
pca_df[["unique_id","party","congress"]] = pca_df["tag"].str.split('_', n=2, expand=True)

# remake speaker id column (a string: congress followed by the unique id)
# NOTE(review): extract_speaker keeps only characters 3-7 of the original speakerid; if the
# original ids are longer than congress + 5 characters, this rebuilt key will not match them
# and the merge below will return only missing values - verify against the speaker file.
pca_df["speakerid"] = pca_df["congress"] + pca_df["unique_id"]

# read speakerid as text: by default it is parsed as an integer, and pandas refuses to merge
# a string key with an integer key
speakers = pd.read_csv(data_dir / "speakers_102_to_108.csv", dtype={"speakerid": str})

# keep a single row per speakerid so the left merge cannot multiply the rows of pca_df
speakers = speakers.drop_duplicates(subset="speakerid", ignore_index=True)

# merge on speakerid to get speaker information
pca_df = pca_df.merge(speakers, how="left", on="speakerid")

# save
pca_df.to_csv(data_dir / "pca_102_108.csv", index=False)

In [None]:
# pair every word in the model vocabulary with the number of times it appears
wordlist = [
    (token, model.wv.get_vecattr(token, "count"))
    for token in model.wv.key_to_index.keys()
]

# keep the words that appear more than 100 times in the corpus
vocab = [token for token, freq in wordlist if freq > 100]

# project each retained word vector with the PCA fitted on the legislator embeddings;
# row k of word_pca holds (pc1, pc2) for vocab[k]
word_pca = np.zeros((len(vocab), 2))

for row_idx, token in enumerate(vocab):
    word_pca[row_idx, :] = pca.transform(model.wv[token].reshape(1, -1))


In [None]:
# build one table of word scores, sorted from lowest to highest pc1.
# The previous code referenced `sorted_vocab` and `S`, which are not defined in this notebook;
# `vocab` and `word_pca` from the preceding cell hold the words and their (pc1, pc2) projections.
# NOTE(review): confirm these are the objects the earlier `sorted_vocab` / `S` referred to.
word_scores = pd.DataFrame(
    {'word': vocab, 'pc1': word_pca[:, 0], 'pc2': word_pca[:, 1]}
).sort_values("pc1")

# get and save 20 words that map as lowest (most liberal) on pc1 (ideology axis)
# NOTE(review): the sign of a principal component is arbitrary - check that low pc1 really
# corresponds to the liberal end for this fitted model.
word_scores.head(20).to_csv(data_dir/"most_liberal_words.csv")

# get and save 20 words that map as highest (most conservative) on pc1 (ideology axis);
# previously this repeated the ascending head(20) and so saved the same words as the file above
word_scores.sort_values("pc1", ascending=False).head(20).to_csv(data_dir/"most_conservative_words.csv")