In [113]:
import numpy as np
import pandas as pd
import os, joblib

from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from statsmodels.stats.multitest import fdrcorrection

from sklearn.preprocessing import normalize,StandardScaler, minmax_scale
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from plotnine import ggplot, aes, geoms, scales
from pyvis.network import Network
from scipy.stats import pearsonr

import networkx as nx

In [2]:
# Load the labelled speeches table (presumably one row per speech — TODO confirm).
# run_count() reads this module-level frame and uses its `year`, `dynamic_label`,
# `unique_ID`, `party`, `topic_id` and `speaker` columns.
all_df = pd.read_csv('Results/All_speeches_labelled.csv')

In [None]:
def make_adj(mat,perm=False):
    """Cosine-similarity adjacency matrix between the columns of ``mat``.

    Parameters
    ----------
    mat : 2-D array-like
        Each column is one node; rows are the observations compared.
    perm : bool, default False
        If True, row ``i`` of the result is computed from a shuffled copy of
        column ``i`` against the *unshuffled* columns, i.e. one null draw.
        Shuffling uses the global ``np.random`` state, one shuffle per column
        in column order.

    Returns
    -------
    numpy.ndarray
        ``(n_cols, n_cols)`` matrix of cosine similarities with a zero
        diagonal. A pair involving an all-zero column has no defined cosine
        similarity and is reported as 0 (no edge) instead of NaN.
    """
    mat = np.asarray(mat, dtype=float)
    mat_cols = mat.shape[1]
    adj = np.zeros([mat_cols,mat_cols])

    # Squared column norms; shuffling a column does not change its norm, so
    # these are valid for both the observed and the permuted case.
    sq_norms = (mat * mat).sum(axis=0)

    for i in range(mat_cols):
        mi = mat[:,i]
        if perm:
            mi = mi.copy()  # never shuffle the caller's data in place
            np.random.shuffle(mi)

        # Dot product of column i with every column in a single product.
        dots = mi @ mat
        denom = np.sqrt(sq_norms[i] * sq_norms)

        # `!= 0` (rather than `> 0`) keeps NaN inputs propagating as NaN while
        # still skipping the 0/0 case produced by all-zero columns.
        defined = denom != 0
        adj[i,defined] = dots[defined] / denom[defined]
        adj[i,i] = 0  # no self-loops

    # Guard against rounding pushing a similarity marginally outside [-1, 1].
    return np.clip(adj, -1.0, 1.0)

def make_threshed_mat(mat, n_perm=200, alpha=0.05):
    """Cosine-similarity adjacency matrix with non-significant edges zeroed.

    Each edge's p-value is the fraction of ``n_perm`` permuted adjacency
    matrices (``make_adj(mat, perm=True)``) whose entry is greater than or
    equal to the observed one. The upper-triangle p-values are passed to
    ``fdrcorrection`` and every edge that is not rejected at level ``alpha``
    is set to 0 symmetrically.

    Parameters
    ----------
    mat : 2-D array
        Passed unchanged to ``make_adj``; columns are the nodes.
    n_perm : int, default 200
        Number of permuted adjacency matrices used for the null distribution.
    alpha : float, default 0.05
        FDR level handed to ``fdrcorrection``.

    Returns
    -------
    numpy.ndarray
        The observed adjacency matrix with non-significant (and undefined)
        edges set to 0.

    Raises
    ------
    ValueError
        If ``n_perm`` is smaller than 1.
    """
    if n_perm < 1:
        raise ValueError("n_perm must be a positive integer")

    true = make_adj(mat)

    # A NaN similarity compares False against every null value, which would
    # give it p = 0 and keep it as a "significant" edge. Track those entries
    # so they are treated as non-edges instead.
    undefined = np.isnan(true)

    # Accumulate exceedance counts instead of keeping every null matrix.
    exceed_counts = np.zeros(true.shape, dtype=int)
    for _ in range(n_perm):
        exceed_counts += np.greater_equal(make_adj(mat,perm=True), true)

    # NOTE(review): this estimator can be exactly 0; (count + 1) / (n_perm + 1)
    # is a common alternative — left unchanged to preserve existing results.
    Pvals = exceed_counts / n_perm
    Pvals[undefined] = 1.0

    # Only the upper triangle is tested; the result is mirrored below.
    rows, cols = np.triu_indices_from(Pvals,1)
    if rows.size:
        rejected = fdrcorrection(Pvals[rows,cols], alpha=alpha)[0]
        drop_r, drop_c = rows[~rejected], cols[~rejected]
        true[drop_r,drop_c] = 0
        true[drop_c,drop_r] = 0

    true[undefined] = 0
    return true

In [127]:
def run_count(year, perm=False):
    """Build thresholded topic networks for each party in ``year`` and compare them.

    Rows of the module-level ``all_df`` for ``year`` are kept unless their
    ``dynamic_label`` is 'procedural', 'tribute', 'NA' or missing. Speeches are
    counted per (``unique_ID``, ``topic_id``) into a speaker-by-topic matrix,
    which is split by ``party`` ('D' / 'R') and passed to
    ``make_threshed_mat``.

    Parameters
    ----------
    year : value compared against ``all_df.year``
        Year to analyse.
    perm : bool, default False
        If True, party labels are randomly permuted across speakers before the
        split (null model). Uses the global ``np.random`` state.

    Returns
    -------
    tuple
        ``(dem_adj, rep_adj, abs_diff, similarity, names)`` where ``abs_diff``
        is ``|dem_adj - rep_adj|``, ``similarity`` is the Pearson correlation
        between the upper triangles of the two adjacency matrices, and
        ``names`` holds the topic column labels.
    """
    sub_df = all_df.loc[all_df.year == year]
    # `~` is the boolean negation operator; unary minus on a boolean mask is
    # not supported by numpy and only works on pandas objects by special-casing.
    sub_df = sub_df.loc[~sub_df.dynamic_label.isin(['procedural','tribute','NA'])]
    sub_df = sub_df.loc[sub_df.dynamic_label.notnull()]
    sub_parties = sub_df[['unique_ID','party']].groupby('unique_ID').first()

    speaker_counts = (sub_df
                      .groupby(['unique_ID','topic_id'])
                      .speaker.count()
                      .reset_index()
                      .pivot(index='unique_ID',columns='topic_id',values='speaker')
                      .fillna(0)
                      .reset_index()
                      .merge(sub_parties,on='unique_ID',how='left')
                     )

    if perm: # for null modelling
        speaker_counts['party'] = np.random.permutation(speaker_counts['party'])

    # Everything between the leading `unique_ID` and trailing `party` columns
    # is a topic column.
    names = speaker_counts.columns[1:-1]

    # `columns=` replaces the positional axis argument, which pandas >= 2.0
    # no longer accepts.
    non_topic_cols = ['unique_ID','party']
    Dem_matrix = speaker_counts.loc[speaker_counts.party == 'D'].drop(columns=non_topic_cols).to_numpy()
    Rep_matrix = speaker_counts.loc[speaker_counts.party == 'R'].drop(columns=non_topic_cols).to_numpy()

    dem_adj = make_threshed_mat(Dem_matrix)
    rep_adj = make_threshed_mat(Rep_matrix)

    # Adjacency matrices are symmetric, so correlate the upper triangles only.
    dem_triu = dem_adj[np.triu_indices_from(dem_adj,1)]
    rep_triu = rep_adj[np.triu_indices_from(rep_adj,1)]

    return dem_adj, rep_adj, np.abs(dem_adj - rep_adj), pearsonr(dem_triu,rep_triu)[0],names
    
    

In [128]:
def run_year(year,iters=200):
    """Observed party difference for ``year`` plus a permutation null for it.

    Calls ``run_count(year)`` once for the observed values, then
    ``run_count(year, True)`` ``iters`` times to collect null values.

    Returns
    -------
    tuple
        ``(true_diff, true_sim, null_diffs, null_pearson)`` where
        ``null_diffs`` has shape ``(iters, *true_diff.shape)`` and
        ``null_pearson`` is a list of ``iters`` correlations.
    """
    _dem, _rep, observed_diff, observed_sim, _names = run_count(year)

    n_rows, n_cols = observed_diff.shape[0], observed_diff.shape[1]
    null_diffs = np.zeros([iters, n_rows, n_cols])
    null_pearson = []

    for draw in tqdm(range(iters)):
        _dem, _rep, shuffled_diff, shuffled_sim, _names = run_count(year,True)
        null_diffs[draw,:,:] = shuffled_diff
        null_pearson.append(shuffled_sim)

    return observed_diff, observed_sim, null_diffs, null_pearson

In [140]:
def run_nulls(year):
    """Run ``run_count`` for ``year`` and return ``(abs_diff, similarity, year)``.

    The year is echoed back in the result tuple alongside the third and fourth
    values returned by ``run_count``.
    """
    # NOTE(review): the name suggests a null draw, but run_count is called with
    # its default perm=False — confirm whether perm=True was intended.
    _dem, _rep, abs_diff, similarity, _names = run_count(year)
    result = (abs_diff, similarity, year)
    return result