In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

import os
import joblib

from tqdm import tqdm

from plotnine import ggplot, aes, geoms

# get procedural stop words
%run procedural_stop_words.py

In [2]:
# Labelled speeches: one row per speech (columns used below include year_x, party_y, speaker).
all_df = pd.read_csv('Results/All_speeches_labelled.csv')

# Fitted topic models. joblib files are pickles, so only load files from a trusted source.
with open('Results/Official_TopicModel_80k.pkl', mode='rb') as File:
    models = joblib.load(File)

In [3]:
def select_data(year):
    """
    Convenience function for extracting X, Y data and the speaker dataframe for one year.

    Reads the module-level `all_df` (speeches) and `models` (fitted topic models).

    Parameters
    ----------
    year : value matched against `all_df.year_x` and against the 'year' entry
        of each element of `models['window_models']`.

    Returns
    -------
    X : the 'W' entry of the window model for `year`
        (presumably the document-topic matrix, row-aligned with the speeches
        of that year -- TODO confirm against the code that built the pickle).
    Y : numpy array of 0/1 labels, 1 where `party_y == 'D'`, 0 otherwise.
    df : DataFrame with columns 'index', 'party_y' and 'speaker' for the
        speeches of `year` (the old index is kept as the 'index' column).

    Raises
    ------
    IndexError
        If no window model exists for `year`.
    """
    sub_df = all_df.loc[all_df.year_x == year]

    # Take the first window model for this year; fail with a message that names
    # the year instead of a bare "list index out of range".
    model = next((mod for mod in models['window_models'] if mod['year'] == year), None)
    if model is None:
        raise IndexError("no window model found for year %r" % (year,))

    # 1 for Democrats, 0 for everyone else.
    Y = (sub_df.party_y == 'D').astype(int).values
    X = model['W']

    return X,Y,sub_df[['party_y','speaker']].reset_index()

In [4]:
# set kfold split
# A fixed random_state makes the shuffled folds reproducible across kernel restarts.
# It also makes every kfold.split(X, Y) call return the same partition for the same
# labels, so the C values compared inside run_model are scored on identical folds
# (with random_state=None each call reshuffles using the global numpy RNG).
kfold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

In [5]:
def _build_classifier(kind, C, calibrated=True):
    """
    Build the L1-penalised classifier for `kind` ('lasso' or 'svc') at strength C.

    With calibrated=True the LinearSVC is wrapped in CalibratedClassifierCV so that
    predict_proba is available; with calibrated=False the bare LinearSVC is returned
    so that coef_ can be read after fitting.
    """
    if kind == 'lasso':
        return LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced',C=C)
    if kind == 'svc':
        svc = LinearSVC(penalty='l1',class_weight='balanced',C=C,dual=False)
        return CalibratedClassifierCV(svc) if calibrated else svc
    raise ValueError("type must be 'lasso' or 'svc', got %r" % (kind,))


def run_model(X,Y,df,year,type='lasso',Cs=np.arange(0.1,1,0.1)):
    """
    Cross-validate an L1-penalised classifier over a grid of C values and
    summarise the best-scoring setting.

    Uses the module-level `kfold` splitter.

    Parameters
    ----------
    X : array indexable by integer index arrays, one row per speech.
    Y : numpy array of 0/1 labels (1 = Democrat), same length as X.
    df : DataFrame with 'party_y' and 'speaker' columns, row-aligned with X and Y.
        Side effect: a 'prob_party' column holding the out-of-fold Democrat
        probabilities of the best C is written to it.
    year : stored under 'year' in the result.
    type : 'lasso' (LogisticRegression) or 'svc' (calibrated LinearSVC).
    Cs : iterable of inverse regularisation strengths to try.

    Returns
    -------
    dict with the best row of the C grid ('index', 'C', 'mean_acc', 'std',
    'Dem_probs') plus 'year', 'speaker_acc', the per-party mean/std of the
    speaker-level probabilities, and 'coefs' from a refit on all data.

    Raises
    ------
    ValueError
        If `type` is not 'lasso' or 'svc', or if `Cs` is empty.
    """
    Results = []

    # for values of C
    for C in Cs:

        mod = _build_classifier(type, C)

        predict_probs = np.zeros(len(Y)) # out-of-fold Democrat probability per speech
        k_fold_vals = []

        # fit and test model on all cross validation folds
        for train_index, test_index in kfold.split(X, Y):
            X_train,X_test = X[train_index],X[test_index]
            Y_train,Y_test = Y[train_index],Y[test_index]

            # fit model, predict test set
            mod.fit(X_train,Y_train)
            predictions = mod.predict(X_test)

            # speech-level accuracy on this fold
            k_fold_vals.append(np.mean(predictions == Y_test))

            # probability of being Democrat for every held-out speech
            predict_probs[test_index] = mod.predict_proba(X_test)[:, 1]

        Results.append({"C":C,'mean_acc':np.mean(k_fold_vals),'std':np.std(k_fold_vals),'Dem_probs':predict_probs})

    if not Results:
        raise ValueError("Cs must contain at least one value")

    C_frame = pd.DataFrame(Results)

    # select the best performing model
    best_row = C_frame.sort_values(by='mean_acc',ascending=False).reset_index().loc[0].to_dict()
    best_row['year'] = year

    # Speaker accuracy. Use the probabilities of the *best* C; the loop variable
    # predict_probs only holds the probabilities of the last C that was tried.
    df['prob_party'] = best_row['Dem_probs']
    partisan_assigned = df.groupby('speaker').prob_party.mean().reset_index()
    partisan_assigned['predicted_party'] = np.where(partisan_assigned.prob_party > 0.5, 'D', 'R')

    speaker_party = (df.groupby('speaker').party_y.first().reset_index()
                     .merge(partisan_assigned,on='speaker',how='inner'))
    correct_speaker = (speaker_party.party_y == speaker_party.predicted_party).sum()

    # speaker accuracy overall
    best_row['speaker_acc'] = correct_speaker/len(partisan_assigned)

    # mean probability and std for Dem and Rep speakers (independents excluded).
    # Look the groups up by label so a missing or extra party cannot break or
    # silently mis-assign the values.
    by_party = speaker_party.loc[speaker_party.party_y != 'I'].groupby('party_y').prob_party
    party_mean, party_std = by_party.mean(), by_party.std()
    best_row['Dem_speaker_mean'] = party_mean.get('D', np.nan)
    best_row['Rep_speaker_mean'] = party_mean.get('R', np.nan)
    # The misspelled key is kept so existing consumers of the saved CSVs still work.
    best_row['Rep_speaaker_mean'] = best_row['Rep_speaker_mean']
    best_row['Dem_speaker_std'] = party_std.get('D', np.nan)
    best_row['Rep_speaker_std'] = party_std.get('R', np.nan)

    # refit on all data to get coefficients, using the same model family that was
    # cross-validated (the uncalibrated LinearSVC for type='svc')
    best_row['coefs'] = _build_classifier(type, best_row['C'], calibrated=False).fit(X,Y).coef_

    return best_row


## Run LASSO

In [6]:
# Fit the cross-validated LASSO classifier for every year and collect one summary row per year.
Years = []
for year in tqdm(range(1983,2017)):
    X,Y,df = select_data(year)
    Years.append(run_model(X,Y,df,year))
LASSO_Results = pd.DataFrame(Years)
# Drop the per-speech probability arrays before saving. The axis is named explicitly
# because passing it positionally (drop('Dem_probs',1)) was removed in pandas 2.0.
LASSO_Results = LASSO_Results.drop(columns='Dem_probs')

100%|██████████| 34/34 [04:18<00:00,  7.61s/it]


In [7]:
# Persist the per-year LASSO summary to CSV.
LASSO_Results.to_csv('Results/Lasso.csv')

## Null Models

In [8]:
def run_null_model(X,Y,df,year,n=200):
    """
    Permutation baseline: rerun `run_model` on shuffled labels `n` times.

    The C value is taken from the module-level `LASSO_Results` row(s) for `year`,
    so the LASSO results must have been computed first.

    Parameters
    ----------
    X, Y, df : as returned by `select_data(year)`. Y is not modified.
    year : year whose C value is looked up and which is stored in each result row.
    n : number of label permutations.

    Returns
    -------
    DataFrame with one `run_model` result row per permutation, an 'iter' column
    holding the permutation number, and the 'Dem_probs' column dropped.

    Raises
    ------
    ValueError
        If `LASSO_Results` has no row for `year`.
    """
    C = LASSO_Results.loc[LASSO_Results.year == year,'C'].values
    if len(C) == 0:
        raise ValueError("no LASSO result found for year %r" % (year,))

    iterations = []
    for r in range(n):
        # Shuffle a copy: np.random.shuffle(Y) would permute the caller's array in place.
        Y_shuffled = np.random.permutation(Y)
        null = run_model(X,Y_shuffled,df,year,Cs=C)
        null['iter'] = r
        iterations.append(null)
    # The axis is named explicitly; the positional form drop('Dem_probs',1) was removed in pandas 2.0.
    return pd.DataFrame(iterations).drop(columns='Dem_probs')

In [10]:
# Permutation baseline for every year, stacked into a single frame.
# NOTE(review): this loop stops at 2015 while the LASSO loop runs through 2016 -- confirm which is intended.
Years_null = []
for year in tqdm(range(1983,2016)):
    Years_null.append(run_null_model(*select_data(year), year))
NULL_LASSO_Results = pd.concat(Years_null)

100%|██████████| 1/1 [01:10<00:00, 70.19s/it]


In [15]:
 # Persist the permutation (null-model) results to CSV.
 NULL_LASSO_Results.to_csv('Results/Null_Results_Lasso.csv')

## With SVC

In [16]:
# Same per-year evaluation as the LASSO run, with a calibrated linear SVC and a wider C grid.
Years_SVC = []
for year in tqdm(range(1983,2016)):
    X,Y,df = select_data(year)
    Years_SVC.append(run_model(X,Y,df,year,type='svc',Cs=[0.1, 1, 10, 100, 1000]))
SVC_Results = pd.DataFrame(Years_SVC)
# The axis is named explicitly; the positional form drop('Dem_probs',1) was removed in pandas 2.0.
SVC_Results = SVC_Results.drop(columns='Dem_probs')

100%|██████████| 1/1 [00:17<00:00, 17.83s/it]


In [18]:
# Persist the per-year SVC summary to CSV.
SVC_Results.to_csv("Results/SVC.csv")

## Classification result robustness

To ensure that the classification results are robust to the types of features entered into the models, the analysis is repeated with alternative topic models. More specifically, for each year four topic models are built with alternative K (20, 60, 80, 100).

In [19]:
def run_NMF(year,k):
    """
    Fit a fresh k-topic NMF model on the speeches of one year.

    Reads the module-level `all_df` and `procedural_stop_words`.

    Parameters
    ----------
    year : value matched against `all_df.year_x`.
    k : number of NMF components (topics).

    Returns
    -------
    X : document-topic matrix returned by `NMF.fit_transform` on the TF-IDF matrix.
    Y : numpy array of 0/1 labels, 1 where `party_y == 'D'`, 0 otherwise.
    df : DataFrame with columns 'index', 'party_y' and 'speaker' for the
        speeches of `year`.
    """
    sub_df = all_df.loc[all_df.year_x == year]
    vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.3,stop_words=procedural_stop_words,use_idf=True,)
    dtm = vectorizer.fit_transform(sub_df.speech_processed)
    # The vocabulary is not needed here. The former unused call to
    # vectorizer.get_feature_names() was dropped because that method was
    # removed in scikit-learn 1.2 (replaced by get_feature_names_out()).

    model = NMF(n_components=k,max_iter=5000,init='nndsvd')
    X = model.fit_transform(dtm)
    Y = np.array([1 if i == 'D' else 0 for i in sub_df.party_y])
    return X,Y,sub_df[['party_y','speaker']].reset_index()

In [20]:
def robustness_check(k, years=range(1983,2016)):
    """
    Rerun the LASSO evaluation on k-topic NMF features rebuilt for every year.

    Parameters
    ----------
    k : number of topics passed to `run_NMF`.
    years : iterable of years to evaluate; defaults to 1983-2015.

    Returns
    -------
    DataFrame with one `run_model` result row per year, the 'Dem_probs'
    column dropped, and a 'k' column holding `k`.
    """
    Years = []
    for year in tqdm(years):
        X,Y,df = run_NMF(year,k)
        Years.append(run_model(X,Y,df,year))
    Results = pd.DataFrame(Years)
    # The axis is named explicitly; the positional form drop('Dem_probs',1) was removed in pandas 2.0.
    Results = Results.drop(columns='Dem_probs')
    Results['k'] = k
    return Results

In [21]:
# Robustness run with 20 topics.
Results_20 = robustness_check(k=20)

100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


In [28]:
# Robustness run with 60 topics.
Results_60 = robustness_check(k=60)

100%|██████████| 1/1 [00:17<00:00, 17.61s/it]


In [29]:
# Robustness run with 80 topics.
Results_80 = robustness_check(k=80)

100%|██████████| 1/1 [00:30<00:00, 30.41s/it]


In [30]:
# Robustness run with 100 topics.
Results_100 = robustness_check(k=100)

100%|██████████| 1/1 [01:31<00:00, 91.98s/it]


In [52]:
# Tag the main LASSO results with their topic count, then stack them with the robustness runs.
# NOTE(review): this adds a 'k' column to LASSO_Results in place, after Lasso.csv has been written.
LASSO_Results['k'] = 45
robustness_frames = [LASSO_Results, Results_20, Results_60, Results_80, Results_100]
All_models = pd.concat(robustness_frames)
# Store k as a string.
All_models['k'] = All_models['k'].astype(str)

In [36]:
# Persist the combined main and robustness results to CSV.
All_models.to_csv('Results/Robustness.csv')