In [23]:
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from pathlib import Path
from sklearn.decomposition import PCA

In [24]:
# Data lives in a "data" folder that is a sibling of the current working directory.
data_dir = Path.cwd().parent / "data"

In [25]:
# Path is relative to the working directory the notebook is run from.
model_path = "../models/main_model_102_108"
model0 = Doc2Vec.load(model_path)

In [26]:
# Tags of every document vector stored in the model, in index order.
speaker_tags = model0.dv.index_to_key

# One row per tag, stacked in the same order as `speaker_tags`.
embeds = np.array([*map(model0.dv.get_vector, speaker_tags)])

In [5]:
# Project the document embeddings onto their first two principal components.
# `random_state` is pinned because, depending on the input shape and the
# scikit-learn version, the default solver may be the randomized SVD; without
# a seed the coordinates (and everything derived from them) can differ
# between runs.
pca = PCA(n_components=2, random_state=0)

# One row per entry of `embeds`, in the same order; columns are the two components.
pca_df = pd.DataFrame(pca.fit_transform(embeds), columns=["pc1", "pc2"])

In [6]:
# Label each projected row with its document tag (same order as the embeddings).
pca_df["tag"] = speaker_tags

# Tags are split on "_" into at most three pieces, e.g. "11983_D_102"
# -> unique_id="11983", party="D", congress="102".
tag_parts = pca_df["tag"].str.split("_", n=2, expand=True)
pca_df[["unique_id", "party", "congress"]] = tag_parts

# Raw merge key: congress followed by unique_id (string concatenation).
pca_df["speakerid"] = pca_df["congress"] + pca_df["unique_id"]

def make_id(x):
    """Adjust a raw ``congress + unique_id`` string by its length.

    * 8 characters: a trailing ``"0"`` is appended (result has 9 characters).
    * 7 characters: a ``"1"`` is inserted at index 2 (result has 8 characters).
    * any other length: returned unchanged.

    NOTE(review): the 7-character branch yields an 8-character ID that is not
    subsequently padded with "0" -- confirm this matches the key format used
    in the speakers table.

    Parameters
    ----------
    x : str
        Raw identifier string.

    Returns
    -------
    str
        The adjusted identifier.
    """
    length = len(x)
    if length == 8:
        return str(x + "0")
    if length == 7:
        # Equivalent to inserting "1" before the third character.
        return "".join([*x[:2], "1", *x[2:]])
    return str(x)

# Rewrite every raw key with make_id so it can be matched against the speakers table.
pca_df["speakerid"] = pca_df["speakerid"].apply(make_id)

In [7]:
# Speaker metadata. The key column is converted to str because the
# `speakerid` built on `pca_df` is a string and the merge is on that column.
speakers = pd.read_csv(data_dir / "speakers_all.csv").assign(
    speakerid=lambda frame: frame["speakerid"].map(str)
)

In [8]:
# Left join: every projected row is kept, speaker metadata is attached where
# the `speakerid` keys match.
pca_df = pd.merge(pca_df, speakers, how="left", on="speakerid")

In [9]:
# Show the merged frame (pandas abbreviates the middle rows in the rendered output).
pca_df

Unnamed: 0,pc1,pc2,tag,unique_id,party,congress,speakerid,lastname,firstname,state,district
0,-5.286919,2.025922,11983_D_102,11983,D,102,102119830,HOYER,STENY,MD,5.0
1,-6.194284,-2.247623,12090_D_102,12090,D,102,102120900,PRICE,DAVID,NC,4.0
2,5.520672,11.096998,11184_R_102,11184,R,102,102111840,GINGRICH,NEWTON,GA,6.0
3,-2.183232,4.750772,11061_R_102,11061,R,102,102110610,MICHEL,ROBERT,IL,18.0
4,6.345341,3.366552,11089_D_102,11089,D,102,102110890,WHITTEN,JAMIE,MS,1.0
...,...,...,...,...,...,...,...,...,...,...,...
3068,0.707791,-1.641502,11673_D_108,11673,D,108,108116730,CHANDLER,BEN,KY,6.0
3069,2.407817,0.359756,11550_R_108,11550,R,108,108115500,DEAL,NATHAN,GA,10.0
3070,-0.112780,-0.107643,11426_R_108,11426,R,108,108114260,JENKINS,WILLIAM,TN,1.0
3071,-0.673378,0.469160,11636_D_108,11636,D,108,108116360,WU,DAVID,OR,1.0


In [10]:
# Persist the projected speakers plus metadata; the row index is not written.
pca_df.to_csv(data_dir.joinpath("pca_102_108.csv"), index=False)

In [6]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity


In [7]:
# Pair every vocabulary entry with its stored "count" attribute, highest count first.
wordlist = sorted(
    ((word, model0.wv.get_vecattr(word, "count")) for word in model0.wv.key_to_index),
    key=lambda tup: tup[1],
    reverse=True,
)

# Keep entries whose count is strictly between 100 and 1,000,000 and that
# contain fewer than three underscores.
# NOTE(review): underscores presumably join the words of a detected phrase,
# so this keeps unigrams up to trigrams -- confirm against the preprocessing.
sorted_vocab = [w for w, c in wordlist if 100 < c < 1000000 and w.count("_") < 3]

# Word coordinates in the PCA space fitted on the document embeddings
# (two columns, matching the two components of `pca`).
S = np.zeros((len(sorted_vocab), 2))

# Project all word vectors in a single call instead of one `pca.transform`
# call per word. The guard keeps the empty-vocabulary case at shape (0, 2)
# rather than passing an empty selection to the model / PCA.
if sorted_vocab:
    S[:] = pca.transform(model0.wv[sorted_vocab])


In [12]:
# The 20 vocabulary entries with the lowest pc1 coordinate.
(
    pd.DataFrame({"word": sorted_vocab, "pc1": S[:, 0], "pc2": S[:, 1]})
    .sort_values(by="pc1")
    .head(20)
)

Unnamed: 0,word,pc1,pc2
11242,matters_worse,-8.232989,-4.309378
9488,deep_cuts,-7.938724,-1.610411
9233,civil_rights_laws,-7.630497,-1.334454
4451,poorest,-7.60169,1.535496
7731,domestic_programs,-7.53679,-3.108415
13320,social_justice,-7.511666,-3.32638
6814,child_labor,-7.388353,-0.58063
17484,legal_representation,-7.285821,0.019019
2960,congressional_black_caucus,-7.191643,-3.361098
11327,disenfranchised,-7.123498,-0.007255


In [13]:
# The 20 vocabulary entries with the highest pc1 coordinate (ascending order).
(
    pd.DataFrame({"word": sorted_vocab, "pc1": S[:, 0], "pc2": S[:, 1]})
    .sort_values(by="pc1")
    .tail(20)
)


Unnamed: 0,word,pc1,pc2
3438,paperwork,5.383968,-0.211211
10956,federal_bureaucrats,5.403413,-5.189444
6790,parents_teachers,5.405427,-2.828275
4327,rules_regulations,5.459419,1.071771
4748,border_patrol,5.508176,3.447003
4753,south_dakota,5.550333,2.146436
9489,dollars_classroom,5.681424,-0.640943
2509,bureaucrats,5.69416,-2.181439
13038,overtaxed,5.762327,-1.244048
10529,going_broke,5.801149,2.160942
