In [1]:
import os
# Point R_HOME at a specific conda environment's R installation. This runs
# before rpy2 is imported in the next cell — presumably so rpy2 binds to this
# R build; verify the path exists on the machine running the notebook.
os.environ['R_HOME'] = '/home/ec2-user/anaconda3/envs/R/lib/R'
# Set the NUMEXPR_MAX_THREADS environment variable (numexpr's thread cap).
os.environ["NUMEXPR_MAX_THREADS"] = '36'

In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import normalize, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

# Execute the helper script inside this notebook's namespace; it is expected
# to define the `procedural_stop_words` list used below — TODO confirm.
%run procedural_stop_words.py
# Add a few more tokens to the stop-word list. Note this mutates the list in
# place, so re-running the cell without re-running %run would append again.
procedural_stop_words.extend(['do','be','mr_speaker','have','time','other'])

from tqdm import tqdm
from plotnine import ggplot, aes, geoms

# R package import 
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

# Short alias for rpy2's embedded R instance; R functions are called as R.<name>(...).
R = ro.r
# Enable rpy2's pandas conversion — presumably so pandas DataFrames can be
# handed directly to R calls (e.g. as the `data=` argument of glm).
pandas2ri.activate()

from statsmodels.stats.multitest import fdrcorrection
import time

from multiprocess import Pool
import logging

# Write log records of level DEBUG and above to Results/Frame_log.log.
logging.basicConfig(filename='Results/Frame_log.log', level=logging.DEBUG)



In [3]:
# Load the labelled speeches. The modelling code below reads the columns
# year_y, dynamic_label, speaker, party_y and speech_processed from this frame.
all_df = pd.read_csv('Results/All_speeches_labelled.csv')


## For Speakers

A mass-univariate technique in which every phrase is tested against the null hypothesis that its frequency distributions for Democrats and Republicans come from the same underlying distribution. This analysis is akin to the mass-univariate analysis undertaken in basic neuroimaging research, where each voxel is analyzed independently under the same model. Results are, of course, corrected for multiple comparisons using FDR correction.



In [4]:
def run_poisson(info):
    """
    Fit a single-term Poisson GLM of term counts on speaker party.

    `info` is a `(col, DTM)` pair: `col` identifies the term column (named
    `x_<col>` in the frame) and `DTM` is the data frame handed to R's `glm`,
    which must also contain the predictor column `x_party_y`.

    Returns a dict with the term index (`col`), the estimate (`est`) and the
    p-value (`pval`) read from the second row of the coefficient table.
    """
    term_idx, counts = info
    formula = f'x_{term_idx}~x_party_y'

    fit = R.glm(formula, family='poisson', data=counts)
    coef_table = R.summary(fit).rx2('coefficients')

    # Row 0 is presumably the intercept, so row 1 is the first party
    # coefficient; column 0 is the estimate and the last column the p-value
    # (layout of R's summary.glm coefficient matrix).
    party_row = 1
    return {
        'col': term_idx,
        'est': coef_table[party_row, 0],
        'pval': coef_table[party_row, -1],
    }


def run_model(year,topic,min_df=0.05,num_cpu=30):
    """
    Run the mass-univariate Poisson GLM for one (year, topic) slice.

    Speeches in `all_df` matching `year` and `topic` are vectorised into a
    binary document-term matrix, summed per speaker, and each term column is
    then regressed on speaker party via `run_poisson` in a worker pool.

    Parameters
    ----------
    year : value matched against `all_df.year_y`.
    topic : value matched against `all_df.dynamic_label`.
    min_df : forwarded to `CountVectorizer(min_df=...)`. The previous code
        ignored this argument and always used 0.05, so 0.05 is now the
        default to keep results of existing calls unchanged.
    num_cpu : number of worker processes in the pool (previously ignored in
        favour of a hard-coded 30, which remains the default).

    Returns
    -------
    pandas.DataFrame with columns `est`, `pval`, `term`, `year`, `topic` and
    `fdr_p`, one row per term; or None when no speeches match the slice.
    """
    # subset dataframe for year and topic
    sub_df = all_df.loc[(all_df.year_y == year) & (all_df.dynamic_label == topic)]
    if len(sub_df) == 0:
        # Explicit "nothing to model" result; pd.concat drops None entries.
        return None

    # Linker for speaker party to speaker (first party seen per speaker)
    name_party_link = sub_df[['speaker','party_y']].groupby('speaker').first().reset_index()

    # term DTM (binary: a term counts at most once per speech)
    vectorizer = CountVectorizer(stop_words=procedural_stop_words,min_df=min_df,binary=True)
    DTM = vectorizer.fit_transform(sub_df.speech_processed)
    DTM = pd.DataFrame(DTM.toarray())
    n_terms = DTM.shape[1]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when present and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = list(vectorizer.get_feature_names())

    # sum term occurrence by speaker and merge with party
    DTM['speaker'] = list(sub_df['speaker'])
    DTM = (DTM
           .groupby('speaker')
           .sum()
           .reset_index()
           .merge(name_party_link,on='speaker',how='left')
           .drop(columns='speaker')  # keyword form: positional axis was removed in pandas 2.0
          )

    # give columns names compatible with R (x_0 ... x_{n-1}, x_party_y)
    DTM.columns = [f'x_{i}' for i in DTM.columns]

    # Run Massive Univariate Poisson GLM.
    # Each task carries only the term column and the party column: that is
    # all the model formula references, and it avoids shipping the full
    # matrix to a worker for every single term.
    iter_list = [(i, DTM[[f'x_{i}', 'x_party_y']]) for i in range(n_terms)]

    with Pool(num_cpu) as p:
        estimates = p.map(run_poisson,iter_list)

    # Make to DataFrame
    frame = pd.DataFrame(estimates)
    frame['term'] = terms # add terms (same order as the x_<i> columns)
    frame['year'] = year
    frame['topic'] = topic
    frame = frame.drop(columns='col')

    # perform FDR correction for multiple comparisons, alpha = 0.05.
    # NOTE(review): fdrcorrection returns (rejected, pvalue_corrected); index 0
    # is the boolean rejection mask, so despite its name `fdr_p` holds
    # True/False flags, not adjusted p-values. Kept as-is so existing outputs
    # do not change — confirm which one downstream analysis expects.
    frame['fdr_p'] = fdrcorrection(frame.pval)[0]

    return frame

In [5]:
# Fit the per-term models topic by topic, collecting one result per
# (topic, year) pair and logging the wall-clock time spent on each topic.
Frames = []
for topic in all_df.dynamic_label.unique():
    start_time = time.time()

    # years 1983 through 2016 (the range end is exclusive)
    for year in range(1983, 2017):
        Frames += [run_model(year, topic)]

    end_time = time.time()
    logging.debug(f'  topic - {topic} completed in {(end_time-start_time)/60} minutes.')

In [6]:
# Stack the per-(topic, year) result frames into one table. pd.concat silently
# drops None entries (slices for which run_model returned nothing) and raises
# ValueError only if every entry is None.
All_terms = pd.concat(Frames)
# Persist the combined results. The index is written as the first CSV column
# because index=False is not passed; it repeats across the concatenated frames.
All_terms.to_csv('Results/Univariate_Frame_analysis.csv')