In [3]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

import os
import joblib

from tqdm import tqdm
# import boto3
# client = boto3.client('s3')

# Code from https://github.com/derekgreene/dynamic-nmf
%run Greene_dnmf.py

# get procedural stop words
%run procedural_stop_words.py


In [5]:
# Load the fitted topic-model bundle and the labelled speech table from Results/.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load this from a trusted source.
with open('Results/Official_TopicModel_80k.pkl','rb') as File:
    models = joblib.load(File)

# select_data() later filters this table by `year_x` and reads `party_y` and `speaker`.
all_df = pd.read_csv('Results/All_speeches_labelled.csv')

KeyboardInterrupt: 

In [7]:
def term_rankings(H,terms,ntop):
    """Return, for every row of H, the `ntop` highest-weighted terms.

    Parameters
    ----------
    H : 2-D array indexed as H[topic, term]
        One row of weights per topic.
    terms : sequence
        Term labels; position j labels column j of H.
    ntop : int
        Number of terms to keep per topic (fewer are returned if H has
        fewer columns).

    Returns
    -------
    list of lists, one per row of H, ordered from highest to lowest weight.
    """
    def _top_terms(row_idx):
        # argsort is ascending, so reverse it to put the largest weights first
        descending = np.argsort(H[row_idx,:])[::-1]
        return [terms[col] for col in descending[:ntop]]

    return [_top_terms(row_idx) for row_idx in range(H.shape[0])]

In [8]:
def select_data(year):
    """Collect the features, labels and speaker metadata for one year.

    Reads the notebook-level `all_df` and `models` objects.

    Returns
    -------
    X : the 'W' entry of the first window model whose 'year' equals `year`
        (IndexError if there is none).
    Y : list of int, 1 where `party_y` is 'D' and 0 otherwise, for the rows
        of `all_df` whose `year_x` equals `year`.
    frame : the `party_y` and `speaker` columns of those rows after
        `reset_index()`, so the previous index is kept as an 'index' column.
    """
    year_rows = all_df.loc[all_df.year_x == year]
    matching_windows = [window for window in models['window_models'] if window['year'] == year]
    window_model = matching_windows[0]

    Y = [int(party == 'D') for party in year_rows.party_y]
    X = window_model['W']

    speaker_frame = year_rows[['party_y','speaker']].reset_index()
    return X,Y,speaker_frame

In [28]:
def fit_model(X,Y,df):
    """Fit L1-penalised logistic regressions over a grid of C values and score each.

    For every C in np.arange(0.1, 1, 0.1) a balanced, liblinear-solved model is
    fitted on (X, Y) and scored on that same data at two levels:

    * speech accuracy  -- share of rows whose predicted label equals Y;
    * speaker accuracy -- each speaker's mean predicted probability of class 1
      is thresholded at 0.5 ('D' above, 'R' otherwise) and compared with the
      first `party_y` recorded for that speaker in `df`.

    Side effect: `df` gains/overwrites a `prob_party` column on every iteration.

    Returns
    -------
    (results, partisan_assigned)
        results : DataFrame with columns C, speech_accuracy, speaker_accuracy, coefs.
        partisan_assigned : per-speaker table (speaker, prob_party,
            predicted_party) from the LAST C value of the grid only.
    """
    grid_rows = []
    for C in np.arange(0.1,1,0.1):
        clf = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=C)
        clf.fit(X,Y)

        # Speech level: count predictions that agree with the supplied labels.
        n_hits = 0
        for predicted, actual in zip(clf.predict(X), Y):
            if predicted == actual:
                n_hits += 1
        speech_accuracy = n_hits/len(Y)

        # Column 1 of predict_proba is the probability of the positive class.
        df['prob_party'] = clf.predict_proba(X)[:, 1]

        # Speaker level: average the speech probabilities, then threshold.
        partisan_assigned = df.groupby('speaker').prob_party.mean().reset_index()
        partisan_assigned['predicted_party'] = [
            'D' if mean_prob > 0.5 else 'R' for mean_prob in partisan_assigned.prob_party
        ]

        speaker_truth = df.groupby('speaker').party_y.first().reset_index()
        scored = speaker_truth.merge(partisan_assigned,on='speaker',how='inner')
        correct_speaker = scored.party_y.eq(scored.predicted_party).sum()
        speaker_accuracy = correct_speaker/len(partisan_assigned)

        grid_rows.append({"C":C,
                          'speech_accuracy':speech_accuracy,
                          'speaker_accuracy':speaker_accuracy,
                          'coefs':clf.coef_})

    return pd.DataFrame(grid_rows), partisan_assigned

In [29]:
# Fit the C grid for every year and keep, per year, the best speaker-level and
# the best speech-level result together with the C and coefficients behind it.
records = []
for year in tqdm(range(1983,2017)):
    X,Y,df = select_data(year)
    # fit_model returns (per-C results frame, speaker assignments); binding the
    # whole tuple to one name made `.speaker_accuracy` fail with AttributeError.
    mod_df, _speaker_assignments = fit_model(X,Y,df)

    # idxmax picks the first row attaining the maximum, i.e. the same row the
    # previous `== max` mask followed by `.values[0]` selected.
    best_speaker = mod_df.loc[mod_df.speaker_accuracy.idxmax()]
    best_speech = mod_df.loc[mod_df.speech_accuracy.idxmax()]

    records.append({"year":year,
                    "speaker_accuracy":best_speaker['speaker_accuracy'],
                    "speaker_acc_C":best_speaker['C'],
                    "speaker_acc_coef":best_speaker['coefs'],
                    "speech_accuracy":best_speech['speech_accuracy'],
                    "speech_acc_C":best_speech['C'],
                    "speech_acc_coef":best_speech['coefs']})

true_records = pd.DataFrame(records)

100%|██████████| 34/34 [00:44<00:00,  1.32s/it]


In [30]:
# Preview the first rows of the per-year best-model summary built above.
true_records.head()

Unnamed: 0,year,speaker_accuracy,speaker_acc_C,speaker_acc_coef,speech_accuracy,speech_acc_C,speech_acc_coef
0,1983,0.699234,0.8,"[[0.0, 0.859087542023104, -0.5206892923239224,...",0.599486,0.3,"[[0.0, 0.0, 0.0, -1.4959521475964013, 0.0, 0.0..."
1,1984,0.708333,0.3,"[[0.0, 0.0, -11.282313476002717, 0.0, -5.86440...",0.631702,0.7,"[[0.0, 0.0, -14.03327331974852, 0.069685493535..."
2,1985,0.708191,0.9,"[[0.0, -19.82308523381375, -5.754092998809567,...",0.607525,0.9,"[[0.0, -19.82308523381375, -5.754092998809567,..."
3,1986,0.685714,0.6,"[[0.0, 8.559600746455118, -0.7342625371268254,...",0.601663,0.8,"[[0.0, 9.482713844706906, -0.9557675274864119,..."
4,1987,0.669329,0.7,"[[-5.261858005583338, -13.995375689957324, 0.0...",0.577853,0.6,"[[-2.5032975594148708, -13.483195354287819, 0...."


In [31]:
# Persist the per-year summary.
# NOTE(review): the *_coef columns hold coefficient arrays, which a CSV can only
# store as text -- confirm downstream readers parse them back as intended.
true_records.to_csv('Results/classification_results_with_coefs.csv')

### Chance models

In [None]:
# Bootstrap scoring for a single year, using that year's selected C values.
# NOTE(review): this cell does not define `year`, `X`, `Y`, `df` or `records`;
# it uses whatever an earlier cell left in the kernel, so it is not reproducible
# on a fresh Restart & Run All. Run_boot() below wraps the same steps in a function.
speaker_C = true_records.loc[true_records.year == year,'speaker_acc_C'].values[0]
speech_C = true_records.loc[true_records.year == year,'speech_acc_C'].values[0]
for i in tqdm(range(200),desc=f"{year}: "): # bootstrap sample
    # Resample rows with replacement; the sampled index labels are reused as
    # positions into X and Y (df comes from reset_index() in select_data).
    boot_df = df.sample(len(df),replace=True)
    boot_x = X[boot_df.index]
    boot_y = [Y[_] for _ in boot_df.index]

    # Model for speaker accuracy
    mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speaker_C)
    mod.fit(boot_x,boot_y)

    # Probability of the positive class for every resampled speech.
    speech_probs = [i[1] for i in mod.predict_proba(boot_x)]
    boot_df['prob_party'] = speech_probs

    # Average per speaker and threshold at 0.5 to assign a party.
    partisan_assigned = boot_df.groupby('speaker').prob_party.mean().reset_index()
    partisan_assigned['predicted_party'] = partisan_assigned.prob_party.apply(lambda x: 'D' if x > 0.5 else 'R')
    # NOTE(review): the reference party is taken from `df` here, whereas
    # Run_boot() takes it from `boot_df` -- confirm which one is intended.
    correct_speaker = sum(df.groupby('speaker').party_y.first().reset_index()
                         .merge(partisan_assigned,on='speaker',how='inner')
                         .apply(lambda x: 1 if x.party_y == x.predicted_party else 0,1))

    speaker_accuracy = correct_speaker/len(partisan_assigned)

    # Model for Speech accuracy
    mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speech_C)
    mod.fit(boot_x,boot_y)
    predictions = mod.predict(boot_x)
    speech_accuracy = sum([1 for ix,i in enumerate(predictions) if i == boot_y[ix]])/len(boot_y)

    # NOTE(review): `records` is not reset in this cell, so these rows are
    # appended to whatever list is already bound to that name.
    records.append({"year":year,
                "speaker_accuracy":speaker_accuracy,
                "speech_accuracy":speech_accuracy,
                "iteration":i})

In [None]:
# Permutation ("chance") baseline: for every year, refit the two selected models
# 200 times on shuffled labels and record both accuracies.
NULL_SEED = 0  # fixed seed so the null distribution is the same on every run
null_rng = np.random.default_rng(NULL_SEED)  # local generator; global numpy state untouched

records = []
for year in range(1983,2017):
    speaker_C = true_records.loc[true_records.year == year,'speaker_acc_C'].values[0]
    speech_C = true_records.loc[true_records.year == year,'speech_acc_C'].values[0]

    X,Y,df = select_data(year)

    # Reference party per speaker. It only reads `party_y`, which the loop below
    # never changes, so it is computed once per year instead of once per shuffle.
    speaker_truth = df.groupby('speaker').party_y.first().reset_index()

    for i in tqdm(range(200),desc=f"{year}: "):
        # In-place permutation of the speech labels; shuffles accumulate across iterations.
        null_rng.shuffle(Y)

        # Model for speaker accuracy
        mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speaker_C)
        mod.fit(X,Y)

        # Column 1 of predict_proba is the probability of the positive class.
        df['prob_party'] = mod.predict_proba(X)[:, 1]

        partisan_assigned = df.groupby('speaker').prob_party.mean().reset_index()
        partisan_assigned['predicted_party'] = [
            'D' if mean_prob > 0.5 else 'R' for mean_prob in partisan_assigned.prob_party
        ]
        # NOTE(review): speaker accuracy is scored against the unshuffled
        # `party_y`, while speech accuracy below is scored against the shuffled
        # Y -- confirm this asymmetry is intended.
        scored = speaker_truth.merge(partisan_assigned,on='speaker',how='inner')
        correct_speaker = scored.party_y.eq(scored.predicted_party).sum()

        speaker_accuracy = correct_speaker/len(partisan_assigned)

        # Model for Speech accuracy
        mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speech_C)
        mod.fit(X,Y)
        predictions = mod.predict(X)
        n_hits = sum(1 for predicted, shuffled in zip(predictions, Y) if predicted == shuffled)
        speech_accuracy = n_hits/len(Y)

        records.append({"year":year,
                    "speaker_accuracy":speaker_accuracy,
                    "speech_accuracy":speech_accuracy,
                    "iteration":i})

null_records = pd.DataFrame(records)

1983: 100%|██████████| 200/200 [00:39<00:00,  5.12it/s]
1984: 100%|██████████| 200/200 [00:35<00:00,  5.56it/s]
1985: 100%|██████████| 200/200 [00:46<00:00,  4.29it/s]
1986: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
1987: 100%|██████████| 200/200 [00:40<00:00,  4.99it/s]
1988: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
1989: 100%|██████████| 200/200 [00:34<00:00,  5.88it/s]
1990: 100%|██████████| 200/200 [00:40<00:00,  4.88it/s]
1991: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
1992: 100%|██████████| 200/200 [00:32<00:00,  6.09it/s]
1993: 100%|██████████| 200/200 [00:36<00:00,  5.54it/s]
1994: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
1995: 100%|██████████| 200/200 [00:54<00:00,  3.69it/s]
1996: 100%|██████████| 200/200 [00:33<00:00,  5.90it/s]
1997: 100%|██████████| 200/200 [00:39<00:00,  5.09it/s]
1998: 100%|██████████| 200/200 [00:39<00:00,  5.09it/s]
1999: 100%|██████████| 200/200 [00:38<00:00,  5.13it/s]
2000: 100%|██████████| 200/200 [00:36<00:00,  5.

In [89]:
# Stack the permutation results on top of the real per-year accuracies and tag
# each row with its origin before writing everything to one CSV.
actual_columns = ['year','speaker_accuracy','speech_accuracy']
combo = pd.concat([null_records,true_records[actual_columns]])

# Rows from true_records carry no `iteration`, so a missing value marks them as real.
is_actual = combo.iteration.isnull()
combo['type'] = ['actual' if flag else 'null' for flag in is_actual]

# Placeholder iteration number for the real rows; `type` is what tells them apart.
combo.loc[is_actual,'iteration']  = 11
combo.to_csv('results_with_null.csv')

### Bootstrapped models

In [92]:
def Run_boot(year, n_boot=1000, random_state=None):
    """Bootstrap the speaker- and speech-level accuracies for one year.

    Each iteration resamples the year's speeches with replacement, refits the
    two models selected for that year (C values read from `true_records`) on
    the resample, and scores them on that same resample.

    Parameters
    ----------
    year : int
        Year passed to select_data() and looked up in `true_records`.
    n_boot : int, default 1000
        Number of bootstrap resamples.
    random_state : int or None, default None
        Seed for the resampling. None keeps the previous behaviour
        (pandas draws from numpy's global random state); an int makes the
        sequence of resamples reproducible.

    Returns
    -------
    DataFrame with one row per resample and columns
    year, speaker_accuracy, speech_accuracy.
    """
    records = []
    X,Y,df = select_data(year)
    speaker_C = true_records.loc[true_records.year == year,'speaker_acc_C'].values[0]
    speech_C = true_records.loc[true_records.year == year,'speech_acc_C'].values[0]

    # One RandomState shared by all iterations, so each draw advances it and
    # the resamples differ from one another while staying reproducible.
    sampler = None if random_state is None else np.random.RandomState(random_state)

    for _ in tqdm(range(n_boot),desc=f"{year}: "): # bootstrap sample
        boot_df = df.sample(len(df),replace=True,random_state=sampler)
        # df comes from reset_index() in select_data, so the sampled index
        # labels double as row positions into X and Y.
        boot_x = X[boot_df.index]
        boot_y = [Y[row] for row in boot_df.index]

        # Model for speaker accuracy
        mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speaker_C)
        mod.fit(boot_x,boot_y)

        # Column 1 of predict_proba is the probability of the positive class.
        boot_df['prob_party'] = mod.predict_proba(boot_x)[:, 1]

        partisan_assigned = boot_df.groupby('speaker').prob_party.mean().reset_index()
        partisan_assigned['predicted_party'] = [
            'D' if mean_prob > 0.5 else 'R' for mean_prob in partisan_assigned.prob_party
        ]
        speaker_truth = boot_df.groupby('speaker').party_y.first().reset_index()
        scored = speaker_truth.merge(partisan_assigned,on='speaker',how='inner')
        correct_speaker = scored.party_y.eq(scored.predicted_party).sum()

        speaker_accuracy = correct_speaker/len(partisan_assigned)

        # Model for Speech accuracy
        mod = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=speech_C)
        mod.fit(boot_x,boot_y)
        predictions = mod.predict(boot_x)
        n_hits = sum(1 for predicted, actual in zip(predictions, boot_y) if predicted == actual)
        speech_accuracy = n_hits/len(boot_y)

        records.append({"year":year,
                    "speaker_accuracy":speaker_accuracy,
                    "speech_accuracy":speech_accuracy})
    return pd.DataFrame(records)

In [None]:
# Run the bootstrap for every year, stack the per-year frames and save them.
boot_results = []
for year in range(1983,2017):
    yearly_boot = Run_boot(year)
    boot_results.append(yearly_boot)

# Frames are stacked as-is, so each year's rows keep their own index labels.
boot_df = pd.concat(boot_results)
boot_df.to_csv('bootstrap_results.csv')

1983: 100%|██████████| 1000/1000 [04:26<00:00,  3.76it/s]
1984: 100%|██████████| 1000/1000 [04:31<00:00,  3.69it/s]
1985: 100%|██████████| 1000/1000 [05:12<00:00,  3.20it/s]
1986: 100%|██████████| 1000/1000 [04:20<00:00,  3.85it/s]
1987: 100%|██████████| 1000/1000 [04:17<00:00,  3.89it/s]
1988: 100%|██████████| 1000/1000 [03:54<00:00,  4.26it/s]
1989: 100%|██████████| 1000/1000 [03:50<00:00,  4.34it/s]
1990: 100%|██████████| 1000/1000 [04:20<00:00,  3.84it/s]
1991: 100%|██████████| 1000/1000 [04:33<00:00,  3.66it/s]
1992: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s]
1993: 100%|██████████| 1000/1000 [04:50<00:00,  3.44it/s]
2006:  76%|███████▌  | 762/1000 [03:30<01:03,  3.75it/s]

## Cross validation approach

This analysis is only appropriate for speech classification.