In [16]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import normalize, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

%run procedural_stop_words.py
procedural_stop_words.extend(['do','be','mr_speaker','have','time','other'])

from tqdm import tqdm
from plotnine import ggplot, aes, geoms

# R package import 
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

R = ro.r
pandas2ri.activate()

from p_tqdm import p_map

from statsmodels.stats.multitest import fdrcorrection
import time

In [2]:
# Load the persisted model object(s) -- presumably the fitted topic model,
# judging by the file name (TODO confirm).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts from a trusted source.
with open('Results/Official_TopicModel_80k.pkl','rb') as File:
    models = joblib.load(File)
    
# Speech-level table. Later cells read its year_y, dynamic_label, speaker,
# party_y and speech_processed columns.
all_df = pd.read_csv('Results/All_speeches_labelled.csv')


## For Speakers

A mass-univariate technique in which every phrase is tested against the null hypothesis that its frequency distributions for Democrats and Republicans come from the same underlying distribution. This analysis is akin to the mass-univariate analysis undertaken in basic neuroimaging research, where each voxel is analyzed independently under the same model. Results are, of course, corrected for multiple comparisons using FDR correction.



In [42]:
def select_data(year, topic, min_df=0.01):
    """Build a speaker-level term-count table for one year/topic slice.

    Rows of the notebook-level ``all_df`` whose ``year_y`` equals ``year`` and
    whose ``dynamic_label`` equals ``topic`` are vectorized with a *binary*
    ``CountVectorizer`` over ``speech_processed``. The per-speech indicators
    are then summed per speaker, so each cell is the number of that speaker's
    speeches in the slice that contain the term.

    Parameters
    ----------
    year
        Value compared against ``all_df.year_y``.
    topic
        Value compared against ``all_df.dynamic_label``.
    min_df : float or int, default 0.01
        Forwarded to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per speaker. Term columns are named ``x_0 ... x_{k-1}`` after
        their column position in the vectorizer output (not after the term
        itself; the vectorizer is not returned, so the position-to-term
        mapping is not recoverable from the result). The last column,
        ``x_party_y``, holds the speaker's party.

    Raises
    ------
    ValueError
        If no speech matches ``year`` and ``topic``.
    """
    in_slice = (all_df.year_y == year) & (all_df.dynamic_label == topic)
    sub_df = all_df.loc[in_slice]
    if sub_df.empty:
        # Fail early with a readable message instead of CountVectorizer's
        # "empty vocabulary" error.
        raise ValueError(f"No speeches found for year={year!r}, topic={topic!r}")

    # Linker for speaker party to speaker (first party value seen per speaker)
    name_party_link = (sub_df[['speaker', 'party_y']]
                       .groupby('speaker')
                       .first()
                       .reset_index())

    # term DTM: one row per speech, 0/1 indicator per vocabulary term
    vectorizer = CountVectorizer(stop_words=procedural_stop_words,
                                 min_df=min_df,
                                 binary=True)
    speech_terms = vectorizer.fit_transform(sub_df.speech_processed)
    term_counts = pd.DataFrame(speech_terms.toarray())

    # sum term occurrence by speaker and merge with party
    term_counts['speaker'] = list(sub_df['speaker'])
    term_counts = (term_counts
                   .groupby('speaker')
                   .sum()
                   .reset_index()
                   .merge(name_party_link, on='speaker', how='left')
                   # keyword form: positional `axis` is rejected by pandas >= 2.0
                   .drop(columns='speaker')
                  )

    # Prefix every label so the names are valid in an R formula (x_0, ..., x_party_y)
    term_counts.columns = [f'x_{i}' for i in term_counts.columns]
    print("DTM of size - ",term_counts.shape)
    return term_counts

def run_poisson(col, data=None):
    """Fit a Poisson GLM of one term column on party through R's ``glm``.

    Parameters
    ----------
    col : int
        Term index; the response variable is the column named ``x_{col}``.
    data : pandas.DataFrame, optional
        Table containing ``x_{col}`` and ``x_party_y``. When omitted, the
        notebook-level ``DTM`` is used, which keeps existing
        ``run_poisson(i)`` calls working.

    Returns
    -------
    dict
        ``{'col': col, 'est': ..., 'pval': ...}`` where ``est`` and ``pval``
        are the first and last entries of row 1 of the model's coefficient
        table.
    """
    if data is None:
        # Backward-compatible fallback to the table built in the driver cell.
        data = DTM
    mod = R.glm(f'x_{col}~x_party_y',family='poisson',data=data)
    effects = R.summary(mod).rx2('coefficients')
    # NOTE(review): R lists the intercept first, so row 1 should be the first
    # x_party_y contrast, with the estimate in column 0 and the p-value in the
    # last column. If party has more than two levels only that first contrast
    # is reported; if it has a single level there is no row 1 -- confirm.
    return {'col':col,'est':effects[1,0],'pval':effects[1,-1]}



In [None]:
# Speaker-level table for one year/topic slice; run_poisson reads this
# notebook-level `DTM`.
DTM = select_data(1999,'abortion')

# perf_counter is a monotonic clock meant for measuring elapsed time, so the
# reading cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
# Every column except the last one (x_party_y) is a term to test.
estimates = [run_poisson(i) for i in range(len(DTM.columns) - 1)]
print(time.perf_counter() - start)
df= pd.DataFrame(estimates)


DTM of size -  (96, 1499)


In [40]:
# fdrcorrection returns a tuple; element 0 is the boolean "null rejected" flag
# per test (element 1, the adjusted p-values, is not kept here).
reject_null = fdrcorrection(df.pval)[0]
df['fdr_p'] = reject_null
# Show only the terms whose party effect survives the correction.
df.loc[df['fdr_p']]

Unnamed: 0,col,est,pval,fdr_p
10,10,-1.734601,0.000105,True
767,767,-1.201303,5.9e-05,True
