In [7]:
#STEP 1: Load modules
%pylab inline
import pandas as pd
import contributions
import votes
import bills
import sqlCommands
from peoplefinder import PeopleFinder
from operator import attrgetter
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
from tqdm import tqdm
from sklearn import linear_model
from sklearn import svm
import random
import scipy as sc

#define function that generates plot of F1-score with varying parameters.
def F1plot(f,x,xtitle, islog):
    plt.plot(x,f*100.)
    if islog:
        plt.xscale('log')
    plt.title('F1 Score', fontsize=20)
    plt.xlabel(xtitle, fontsize=16)
    plt.ylabel('F1 Score (%)', fontsize=16)
    
def accplot(y_accs,x,xtitle, islog):
    plt.plot(x,y_accs*100.)
    if islog:
        plt.xscale('log')
    plt.title('Accuracy', fontsize=20)
    plt.xlabel(xtitle, fontsize=16)
    plt.ylabel('Accuracy (%)', fontsize=16)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [8]:
#set up postgresql engine
dbname = 'legislatr'
engine = sqlCommands.get_engine(dbname)

In [9]:
#STEP 2: Load features
#read in the subject feature array
query = "SELECT * FROM features_subs WHERE final_result = 0 OR final_result = 1;"
feat_subs = pd.read_sql_query(query,engine)
#read in the legislator feature array
query = "SELECT * FROM features_legis WHERE final_result = 0 OR final_result = 1;"
feat_legis = pd.read_sql_query(query,engine)
#read in the committee feature array
query = "SELECT * FROM features_comms WHERE final_result = 0 OR final_result = 1;"
feat_comms = pd.read_sql_query(query,engine)

#fuse dataframes
feat = pd.concat([feat_subs,feat_legis.drop(['bill_number','bill_type','index','result','status','final_result','num_amends','congress'],axis=1),
                 feat_comms.drop(['bill_number','bill_type','index','result','status','final_result','num_amends','congress'],axis=1)],axis=1)

In [10]:
#STEP 3: Create train (60%), CV (20%), and test (20%) sets.

#randomize input
data = feat.iloc[np.random.permutation(len(feat))]

indlen = feat['index'].size
trlen = int(indlen*0.6) #training set length
print(trlen)
cvlen = int((indlen-trlen)*0.5) #CV set length
print(cvlen)
testlen = indlen - trlen - cvlen #test set length
print(testlen)

#create training set
tr_data = data.iloc[0:trlen]
tr_data_Y = (tr_data.as_matrix(columns=['final_result'])).flatten()
tr_data = tr_data.drop(['bill_number','bill_type','index','result','status','final_result','congress'],axis=1)
tr_data_X = tr_data.as_matrix()
print(tr_data_X.shape)
print(tr_data_Y.shape)

#create CV set
CV_data = data.iloc[trlen:cvlen+trlen]
CV_data_Y = (CV_data.as_matrix(columns=['final_result'])).flatten()
CV_data = CV_data.drop(['bill_number','bill_type','index','result','status','final_result','congress'],axis=1)
CV_data_X = CV_data.as_matrix()
print(CV_data_X.shape)
print(CV_data_Y.shape)

#create testing set
test_data = data.iloc[cvlen+trlen:-1]
test_data_Y = (test_data.as_matrix(columns=['final_result'])).flatten()
test_data = test_data.drop(['bill_number','bill_type','index','result','status','final_result','congress'],axis=1)
test_data_X = test_data.as_matrix()
print(test_data_X.shape)
print(test_data_Y.shape)

18094
6032
6032
(18094, 1317)
(18094,)
(6032, 1317)
(6032,)
(6031, 1317)
(6031,)


# Optimize SVM

In [11]:
#optimize number of splits
base = 2. #base for numbers
exp = np.arange(10)+1. #exponent for base
val = base**exp #values to be tested
recs = list() #all recalls
prec = list() #all precisions
accs = list() #all accuracies
fscr = list() #all F1-scores

#loop through forests
#for v in val:
#    print('----------------------')
#    print('Min Samples Split = ',v)
svc = svm.LinearSVC(class_weight='balanced')
svc = svc.fit(tr_data_X,tr_data_Y)
Z =svc.predict(CV_data_X)

#set up variables
acc = 0 #accuracy
true_pos = 0.
false_pos = 0.
true_neg = 0.
false_neg = 0.
final = 0.

#get parameters for statistics
for i in range(0,len(Z)):
    if Z[i] == CV_data_Y[i]: #mark correct answers
        acc = acc + 1.
    if (Z[i] == 1) and (CV_data_Y[i] == 1): #mark true positives
        true_pos = true_pos + 1.
    if (Z[i] == 1) and (CV_data_Y[i] == 0): #mark false positives
        false_pos = false_pos + 1.
    if (Z[i] == 0) and (CV_data_Y[i] == 1): #mark false negatives
        false_neg = false_neg + 1.
    if (Z[i] == 0) and (CV_data_Y[i] == 0): #mark true negatives
        true_neg = true_neg + 1.
#calculate Accuracy
final = acc/len(Z)
print('Accuracy = ',final)
accs.append(final)

#calculate Precision
pre = true_pos/(true_pos+false_pos)
print('Precision = ',pre)
prec.append(pre)
    
#calculate Recall
rec = true_pos/(true_pos+false_neg)
print('Recall = ',rec)
recs.append(rec)
    
#calculate F1-Score
f1 = 2*((rec*pre)/(rec+pre))
print('F1-Score = ',f1)
fscr.append(f1)

#y_accs = np.asarray(accs)
#y_prec = np.asarray(prec)
#y_recs = np.asarray(recs)
#y_fscr = np.asarray(fscr)

#F1plot(y_fscr,val,'Min Sample Split',True)


Accuracy =  0.8978779840848806
Precision =  0.2078125
Recall =  0.5495867768595041
F1-Score =  0.30158730158730157
