In [1]:
import codecs
import os
import urllib
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from matplotlib import gridspec

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [14]:
# Download zipped dataset (skipped when a cached copy already exists).
url = 'http://congressionalbills.org/billfiles/bills93-114.zip'
filename = 'bills93-114.csv'
filepath = 'data/'+filename+'.zip'
if not os.path.isfile(filepath):
    # open() does not create parent directories, so make sure 'data/' exists.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Fetch the whole payload BEFORE opening the output file: if the download
    # fails, no empty/partial archive is left behind for later runs to reuse.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    with open(filepath, 'wb') as f:
        f.write(payload)

# Load dataframe straight from the CSV member inside the zip archive.
with zipfile.ZipFile(filepath) as zf:
    with zf.open(filename) as f:
        old_df = pd.read_csv(f, sep=';', encoding='latin-1')

text_cols = ['Title']
bool_cols = ['Gender', 'Party', 'Majority']
cat_cols = ['Major']
tgt_cols = ['PLaw']

## FEATURE PROCESSING
# Drop 50k bills that have missing data in one of the cols.
df = old_df[text_cols+bool_cols+cat_cols+tgt_cols].dropna()
# Remove 8 non-party-affiliated bills.
# .copy() makes df an independent frame so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
df = df[df['Party'] != 328.0].copy()
# Process categorical columns.
# Party code 100.0 -> 1 and 200.0 -> 0; any other code raises KeyError.
df['Party'] = df['Party'].map(lambda k: {100.0: 1, 200.0: 0}[k])
# Turn the numeric 'Major' code into a string label such as 'm12'.
df['Major'] = df['Major'].map(lambda k: 'm'+str(int(k)))
# Fit tf-idf vectorizer on all data.
vec = TfidfVectorizer(sublinear_tf=True, smooth_idf=False)
vec.fit(df[text_cols].values.reshape(-1))
# Fit categorical encoder (trailing ';' suppresses the cell's repr output).
enc = preprocessing.OneHotEncoder()
enc.fit(df[cat_cols]);

In [115]:
## Helper functions

def logistic(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)).

    `z` may be a scalar or a NumPy array; the computation goes through
    `np.exp`, so arrays are handled elementwise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [6]:
def process_df(df,
               cat_cols=['Major'],
               bool_cols=['Gender', 'Party', 'Majority'],
               text_col='Title',
               tgt_col='PLaw'):
    """Turn a dataframe into a sparse feature matrix and a target vector.

    Relies on the module-level `enc` (categorical encoder) and `vec`
    (text vectorizer), which must already be fitted.

    Parameters
    ----------
    df : dataframe holding every column named by the other arguments.
    cat_cols : columns passed through `enc.transform`; an empty list
        skips the categorical features (and `enc`) entirely.
    bool_cols : columns rescaled as ``2 * x - 1`` (so 0/1 values become
        -1/+1 -- assumes the columns are 0/1 coded).
    text_col : column passed through `vec.transform`, or None to leave
        text features out (and not touch `vec`).
    tgt_col : column returned as the target.

    Returns
    -------
    (dset, target) : a CSR matrix whose columns are ordered
        [text features, categorical features, boolean features], and the
        flattened 1-D array of target values.
    """
    # Boolean features, rescaled via 2*x - 1.
    signed_bools = df[bool_cols].values * 2 - 1

    # Prepend the encoded categorical features when any were requested.
    if len(cat_cols) == 0:
        tabular = signed_bools
    else:
        encoded_cats = enc.transform(df[cat_cols]).toarray()
        tabular = np.concatenate([encoded_cats, signed_bools], axis=1)
    tabular_sparse = sp.csr_matrix(tabular)

    # Text features (if requested) are placed in front of the tabular ones.
    if text_col is None:
        features = tabular_sparse
    else:
        text_feats = vec.transform(df[text_col].values.reshape(-1))
        features = sp.csr_matrix(sp.hstack([text_feats, tabular_sparse]))

    labels = df[tgt_col].values.reshape(-1)
    return features, labels

In [16]:
## For class balancing, subsample an even number of bills that did and did not become law.
# Fixed seed so the balanced subsample (and everything computed from it)
# is the same on every Restart & Run All.
SUBSAMPLE_SEED = 0
# All bills that became law.
df_p = df[df['PLaw'] == True]
n_pos = len(df_p)
# An equally sized random draw from the bills that did not become law.
df_n = df[df['PLaw'] == False].sample(n_pos, random_state=SUBSAMPLE_SEED)
small_df = pd.concat([df_p, df_n])
# small_df = df.sample(50000) # alternative: simply subsample.

In [95]:
# Split into train / dev set (80% / 20%).
# Fixed seed so the split, and the metrics reported below, are reproducible.
SPLIT_SEED = 0
train_df, dev_df = train_test_split(small_df, test_size=0.2, random_state=SPLIT_SEED)
# Process data.
# 'Gender' is deliberately left out of bool_cols here, so the base model
# is trained without the gender feature.
trnX, trnY = process_df(train_df, bool_cols=['Party', 'Majority'])
devX, devY = process_df(dev_df, bool_cols=['Party', 'Majority'])

In [353]:
# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the estimator, so construction and fitting are chained).
cls = LogisticRegression(C=3, solver='lbfgs', max_iter=10000).fit(trnX, trnY)

# Score the model's hard predictions on the dev split.
yp = cls.predict(devX)
acc, f1 = metrics.accuracy_score(devY, yp), metrics.f1_score(devY, yp)
print('acc: %.3f, f1: %.3f' % (acc, f1))

acc: 0.777, f1: 0.781


In [359]:
# Dev-set accuracy computed by hand from the fitted weights:
# score = features . coef + intercept, predicted positive when score > 0.
Zs = devX @ cls.coef_[0] + cls.intercept_[0]
pred = Zs > 0
(pred == devY).mean()

0.776888444222111

In [355]:
# Decision scores of the already-fitted model on the training split.
Zs = trnX @ cls.coef_[0] + cls.intercept_[0]
feat_gender = train_df['Gender'].values

w_gender = 0

## Fit only the gender weight: every other coefficient stays frozen inside Zs.
## Four passes of 100 steps each, with the step size shrinking from 10 to 0.01.
for eta in (10.0, 1.0, 0.1, 0.01):
    for _ in range(100):
        # Step along mean((y - p) * x), with p = logistic(Zs + w * x).
        w_gender = w_gender + eta * ((trnY - logistic(Zs + w_gender * feat_gender)) * feat_gender).mean()

print(w_gender)

-0.24494890976050218


In [383]:
# Evaluate on devset
Zs = (devX * cls.coef_[0]) + cls.intercept_[0]
feat_gender = dev_df['Gender'].values

pred = ((Zs + w_gender * feat_gender) > 0.0)
(pred == devY).mean()

0.776888444222111

In [388]:
# Count the dev examples whose predicted label differs once the gender term is added.
np.logical_xor(Zs > 0, (Zs + w_gender * feat_gender) > 0.0).sum()

8

In [121]:
# Boolean mask over the dev rows: True where 'Gender' is non-zero.
# NOTE(review): the XX / XY names suggest Gender == 1 encodes female -- confirm against the dataset codebook.
mask = dev_df['Gender'].values.astype(bool)
dev_XX, dev_XY = devX[mask], devX[~mask]

# Mean predicted probability (from the current Zs scores) within each group.
prob_XX = logistic(Zs[mask]).mean()
prob_XY = logistic(Zs[~mask]).mean()