In [None]:
import os  # not imported anywhere else in the notebook; needed on a fresh kernel

# Cap the number of threads NumExpr may use.
# NOTE(review): presumably this has to run before pandas/numexpr are imported
# for the setting to take effect - keep this cell first.
os.environ["NUMEXPR_MAX_THREADS"] = '36'

In [4]:
import joblib
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import pearsonr

%run procedural_stop_words.py

from plotnine import ggplot, aes, geoms, theme, ggtitle, ylim, xlim, ylab
from plotnine import element_blank, element_line, element_text, scales, annotate,facet_wrap

from scipy.spatial.distance import cosine
import p_tqdm

In [5]:
# Load the labelled speeches and discard every row whose party is 'I'
all_df = pd.read_csv('Results/All_speeches_labelled.csv')
all_df = all_df.loc[all_df.party_y != 'I']

# Every (year, topic) pair to analyse: years 1983-2016 crossed with each topic label,
# ordered year-major (all topics of 1983 first, then 1984, ...)
combinations = list(itertools.product(range(1983,2017), all_df.dynamic_label.unique()))

In [8]:
def make_DTM(sub,binary=True):
    """
    Make a speaker-level Document Term Matrix from topic subset speeches
    
    args:
        - sub: pandas dataframe with speeches and metadata; the columns
               'speaker', 'party_y' and 'speech_processed' are read
        - binary: if True a speech contributes 1 for every term it contains,
                  otherwise the term's count within the speech
    returns:
        - DataFrame with one row per speaker, one column per retained term
          (summed over that speaker's speeches) and 'party_y' as last column
    
    """
    # speaker and party ID (first party recorded for each speaker)
    features = sub.groupby('speaker',as_index=False).party_y.first()
    
    # document term matrix: one row per speech, terms kept only if they
    # occur in at least 5 speeches of this subset (min_df=5)
    vectorizer = CountVectorizer(min_df=5,binary=binary,stop_words=procedural_stop_words)
    DTM = vectorizer.fit_transform(sub.speech_processed)
    DTM = pd.DataFrame(DTM.toarray())

    # associate DTM with speaker and get term-speech frequency by speaker
    DTM['speaker'] = list(sub['speaker'])
    DTM = (DTM
           .groupby('speaker',as_index=False)
           .sum()
           .merge(features,on='speaker',how='left')
           .drop(columns='speaker')  # keyword form: positional axis was removed in pandas 2.0
          )

    # assign terms to DTM; get_feature_names() was removed in scikit-learn 1.2,
    # so prefer its replacement and fall back only on old installations
    if hasattr(vectorizer,'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = list(vectorizer.get_feature_names())
    DTM.columns = terms + ['party_y']
    
    return DTM


def chi_sq(x):
    """
    Run chi-squared test from Gentzkow et al. 2010.
    to be run on each term in frequency frame
    
    args:
        - x: mapping / row / frame exposing 'R', 'D', 'Rn' and 'Dn'
             ('Rn' and 'Dn' are the party totals minus the term's own count)
    returns:
        - chi-squared statistic as float (an array when x holds columns);
          a zero denominator gives nan/inf instead of raising
    """
    # Work in floating point: the squared numerator and the four-factor
    # denominator overflow int64 silently once party totals get large.
    R = np.asarray(x['R'],dtype=float)
    D = np.asarray(x['D'],dtype=float)
    Rn = np.asarray(x['Rn'],dtype=float)
    Dn = np.asarray(x['Dn'],dtype=float)
    
    numer = ((R*Dn) - (D*Rn))**2
    denom = (R + D) * (R + Rn) * (D + Dn) * (Dn + Rn)
    return numer/denom


def chiSq_df(dtm,permute=False):
    """
    sets up dataframe containing term frequencies and
    expected frequencies for chi-square test
    
    args:
        - dtm: Document Term Matrix with one column per term plus a
               'party_y' column; it is read but never modified
        - permute: if True shuffle party labels (default False)
    returns:
        - term_frequencies dataframe (one row per term) with the per-party
          counts, 'Dn'/'Rn', the 'chi2' statistic and a 'terms' column
    raises:
        - KeyError if no row is labelled 'D' or no row is labelled 'R'
    """
    
    party = dtm['party_y'].to_numpy()
    if permute: # shuffle a copy of the labels; the caller's frame stays untouched
        party = np.random.permutation(party)
        
    # term frequency by party (rows: terms, columns: party labels)
    term_frequencies = (dtm
                        .drop(columns='party_y')
                        .groupby(party)
                        .sum()
                        .rename_axis('party_y')
                        .T
                       )
    total_frequencies = term_frequencies.sum()  # total frequencies

    # set up for chi-square test: party total minus the term's own count
    term_frequencies['Dn'] =  total_frequencies['D'] - term_frequencies['D'] 
    term_frequencies['Rn'] = total_frequencies['R'] - term_frequencies['R']
    # chi_sq only does column arithmetic, so one vectorised call replaces
    # the row-by-row apply
    term_frequencies['chi2'] = chi_sq(term_frequencies)

    # the index already holds the term names, wherever 'party_y' sat in dtm
    term_frequencies['terms'] = term_frequencies.index
    
    return term_frequencies

def perform_correlations(dtm,permute=False):
    """
    runs correlation analysis on every term with party ID
    method from Jensen et al. 2012
    
    args:
        - dtm: Document Term matrix containing speech-term frequencies
               and a 'party_y' column (the caller's frame is not modified)
        - permute: if True, shuffle party labels (default False)
    returns:
        - dataframe indexed by term with the columns 'term', 'correlation'
          (Pearson r with the party code) and 'freq' (column total);
          terms whose correlation is undefined (e.g. constant columns)
          are dropped
    """
    
    # contrast code for party: 'D' -> -1, any other label -> +1
    party_ID = np.array([-1 if party == 'D' else 1 for party in dtm.party_y],dtype=float)
    dtm = dtm.drop(columns='party_y')  # keyword form: positional axis was removed in pandas 2.0
    
    if permute:
        party_ID = np.random.permutation(party_ID)
        
    # Pearson r for all terms at once: the mean product of z-scores
    # (population std on both sides). Zero-variance inputs give 0/0 = nan,
    # which dropna() removes below.
    counts = dtm.to_numpy(dtype=float)
    with np.errstate(divide='ignore',invalid='ignore'):
        z_terms = (counts - counts.mean(axis=0))/counts.std(axis=0)
        z_party = (party_ID - party_ID.mean())/party_ID.std()
        corrs = (z_terms * z_party[:,None]).mean(axis=0)
    corrs = np.clip(corrs,-1.0,1.0)  # guard against rounding just outside [-1, 1]
    
    df = pd.DataFrame({
            "term":dtm.columns,
            "correlation":corrs,
            'freq':dtm.sum(axis=0)
        }).dropna()
    
    return df

In [35]:
def run_similarity(year,topic,binary=True):
    """
    run analysis for a given year and topic
    
    args:
        - year: year to subset speeches
        - topic: topic to subset speeches
        - binary: if True, 1 for speech containing word else frequency within speech
    returns:
        - dictionary containing summary statistics ('results'), the
          correlation dataframe ('correlations'), 'topic' and 'year';
          None when there are no speeches for this year and topic
    """
    
    # subset speeches (all_df is the notebook-level speech frame)
    sub_df = all_df.loc[(all_df.year_y == year) & (all_df.dynamic_label == topic)]

    if len(sub_df) == 0:
        return None

    DTM = make_DTM(sub_df,binary=binary) # Make DTM
    term_df = chiSq_df(DTM) # Make Chi_square frequency table

    pre_drop_terms = DTM.columns[:-1] # record keeping

    # remove low value chi square terms from the DTM. term_df itself is left
    # intact (the old code assigned a whole DataFrame to its 'chi2' column,
    # which is an error on current pandas), so the distance below is still
    # taken over every term.
    drop_cols = list(term_df.loc[term_df.chi2 <= 0,'terms'].values)
    DTM = DTM.drop(columns=drop_cols)

    corr_df = perform_correlations(DTM) # make correlation DF

    # calculate metrics: frequency-weighted mean of r (signed) and of |r|
    corr_df['weighted'] = corr_df['correlation']*corr_df['freq']
    partisanship = corr_df['weighted'].sum()/corr_df['freq'].sum()
    polarization = corr_df['weighted'].abs().sum()/corr_df['freq'].sum()
    # cosine distance between the two parties' term-frequency vectors
    distance = cosine(term_df['D'],term_df['R'])

    return {"results":{'distance':distance,
                'polarization':polarization,
                'partisanship':partisanship,
                "pre_termlength":len(pre_drop_terms),
                'post_termlength':len(corr_df)},
            'correlations':corr_df,
            'topic':topic,
            'year':year
           }

# Run analysis

In [35]:
# Analyse every (year, topic) pair; pairs without speeches yield None and are skipped
Results = []
pbar = tqdm(combinations)
for year, topic in pbar:
    pbar.set_description("%s %s" % (year, topic))
    outcome = run_similarity(year, topic, binary=True)
    if outcome:
        Results.append(outcome)




2016 healthcare: 100%|██████████| 2142/2142 [11:06<00:00,  3.21it/s]               


# Permutation analysis for null models

In [24]:
def run_similarity_perm(year,topic,binary=True,perms=200):
    """
    null model for a given year and topic: recompute the metrics of
    run_similarity on randomly shuffled party labels
    
    args:
        - year: year to subset speeches
        - topic: topic to subset speeches
        - binary: if True, 1 for speech containing word else frequency within speech
        - perms: number of label permutations to draw
    returns:
        - dataframe with one row per permutation ('iter') holding distance,
          polarization, partisanship and term counts; None when there are
          no speeches, or no entry in Results, for this year and topic
    """
    sub_df = all_df.loc[(all_df.year_y == year) & (all_df.dynamic_label == topic)]
    if len(sub_df) == 0:
        return None
        
    # get terms maintained from true results (the notebook-level Results list;
    # the previously referenced Binary_Results is not defined in this notebook)
    true_corrs = [b['correlations'] for b in Results 
               if b['topic'] == topic and b['year'] == year]
    if not true_corrs:
        return None

    keep_terms = set(true_corrs[0].term.values) # set: constant-time membership test
    
    DTM = make_DTM(sub_df,binary=binary) # Make DTM
    pre_drop_terms = DTM.columns[:-1] # record keeping
    drop_cols = [i for i in pre_drop_terms if i not in keep_terms]
    DTM = DTM.drop(columns=drop_cols)
    
    true_labels = DTM['party_y'].to_numpy().copy()
    
    Nulls = []
    for perm in range(perms):
        # shuffle the labels once per iteration so that the distance and the
        # correlation metrics of a row describe the same null draw
        DTM['party_y'] = np.random.permutation(true_labels)
        
        term_df = chiSq_df(DTM) # Make Chi_square frequency table
        corr_df = perform_correlations(DTM) # make correlation DF

        # calculate metrics
        corr_df['weighted'] = corr_df['correlation']*corr_df['freq']
        partisanship = corr_df['weighted'].sum()/corr_df['freq'].sum()
        polarization = corr_df['weighted'].abs().sum()/corr_df['freq'].sum()
        distance = cosine(term_df['D'],term_df['R'])

        Nulls.append({'distance':distance,
                    'polarization':polarization,
                    'partisanship':partisanship,
                    "pre_termlength":len(pre_drop_terms),
                    'post_termlength':len(corr_df),
                    'topic':topic,
                    'year':year,
                    'iter':perm})
    return pd.DataFrame(Nulls)

In [None]:
def run_(x):
    """
    worker for the parallel permutation analysis
    
    args:
        - x: (year, topic) tuple
    returns:
        - null-model dataframe from run_similarity_perm, or None when it
          produced no dataframe
    """
    import zlib  # local import: only needed to derive the per-task seed
    
    year,topic = x
    # Seed NumPy's global RNG from the task itself. Without this the run is
    # not reproducible, and forked workers start from identical RNG state.
    np.random.seed(zlib.crc32(("%s|%s"%(year,topic)).encode("utf-8")))
    f = run_similarity_perm(year,topic,binary=True)
    if isinstance(f,pd.DataFrame):
        return f
    return None

# Run the permutation analysis for every (year, topic) pair in parallel
Null_Results = p_tqdm.p_map(run_, combinations, num_cpus=30)

# Pairs that produced no dataframe come back as None and are left out
null_frames = [frame for frame in Null_Results if frame is not None]
Null_DF = pd.concat(null_frames)
Null_DF.to_csv('Results/Null_Frame_Results.csv')

  0%|          | 0/2142 [00:00<?, ?it/s]

### Saving results to disk (pickle) and as CSV for visualization in R

In [None]:
# Persist the complete Results list (summary statistics and correlation frames)
with open('Results/Frame_results.pkl', 'wb') as pkl_handle:
    joblib.dump(Results, pkl_handle)

In [None]:
# z-score the null metrics across (year, topic) within each permutation iteration
Null_DF['polarization_st'] = Null_DF.groupby('iter').polarization.transform(lambda x: (x-x.mean())/x.std())
Null_DF['partisanship_st'] = Null_DF.groupby('iter').partisanship.transform(lambda x: (x-x.mean())/x.std())

# run_similarity stores 'year' and 'topic' next to (not inside) the 'results'
# dict, so merge them in here - the column selection below needs both
Result_DF = pd.DataFrame([
    {**res['results'], 'year': res['year'], 'topic': res['topic']}
    for res in Results
])
# z-score the true metrics across all (year, topic) pairs
Result_DF['polarization_st'] = Result_DF.polarization.transform(lambda x: (x-x.mean())/x.std())
Result_DF['partisanship_st'] = Result_DF.partisanship.transform(lambda x: (x-x.mean())/x.std())

Null_DF['type'] = 'null'
# NOTE(review): 99 is also a valid null iteration number when perms > 99;
# rows are told apart by 'type', not by 'iter'
Result_DF['iter'] = 99
Result_DF['type'] = 'true'

# same columns, same order, so the two frames stack cleanly
Null_DF = Null_DF[['type','iter','year','topic','distance','polarization','polarization_st','partisanship','partisanship_st']]
Result_DF = Result_DF[['type','iter','year','topic','distance','polarization','polarization_st','partisanship','partisanship_st']]

combined = pd.concat([Result_DF,Null_DF])

combined.to_csv('Results/True_and_Nulls_Frame.csv')

In [None]:
# Stack the per-(year, topic) term correlations into one long frame.
# assign() returns a new frame, so the dataframes stored in Results are not
# modified as a side effect of exporting them.
corr_df = pd.concat([
    r['correlations'].assign(year=r['year'], topic=r['topic'])
    for r in Results
])

corr_df.to_csv('Results/term_correlations_frames.csv')