# Gender

Investigating the effect of gender on bill-passing.

In [1]:
import codecs
import os
import urllib
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from matplotlib import gridspec

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Download, load, and preprocess dataset

In [2]:
# Download zipped dataset.
url = 'http://congressionalbills.org/billfiles/bills93-114.zip'
filename = 'bills93-114.csv'
filepath = 'data/'+filename+'.zip'
if not os.path.isfile(filepath):
    # Make sure the target directory exists; open() does not create it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Fetch the full payload *before* creating the file: if the download fails,
    # no empty archive is left behind to be mistaken for a cached copy on the
    # next run. The context manager also closes the HTTP response.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    with open(filepath, 'wb') as f:
        f.write(payload)

# Load dataframe (the archive contains a single ';'-separated, latin-1 CSV).
with zipfile.ZipFile(filepath) as zf:
    with zf.open(filename) as f:
        old_df = pd.read_csv(f, sep=';', encoding='latin-1')

In [61]:
TEXT_COL = 'Title'
BOOL_COLS = ['Party', 'Majority']
CAT_COLS = ['Major']
TGT_COLS = ['PLaw', 'Gender']
ALL_COLS = [TEXT_COL]+BOOL_COLS+CAT_COLS+TGT_COLS

# Raw party code -> 1/0 flag. Any other code raises KeyError in the mapping below,
# so unexpected values fail loudly instead of silently becoming NaN.
# NOTE(review): presumably 100 = Democrat and 200 = Republican — confirm against the codebook.
PARTY_CODES = {100.0: 1, 200.0: 0}

## FEATURE PROCESSING
# Drop 50k bills that have missing data in one of the cols.
df = old_df[ALL_COLS].dropna()
# Remove 8 non-party-affiliated bills (party code 328). Take an explicit copy so the
# column assignments below write to an independent frame rather than to a slice of
# the previous one (avoids pandas' chained-assignment warning / silent no-op).
df = df[df['Party'] != 328.0].copy()
df['Party'] = df['Party'].map(lambda k: PARTY_CODES[k])
# Process categorical columns: turn the numeric topic code into a string label ('m<code>').
df['Major'] = df['Major'].map(lambda k: 'm'+str(int(k)))
# Fit tf-idf vectorizer on all data.
vec = TfidfVectorizer(sublinear_tf=True, smooth_idf=False)
vec.fit(df[TEXT_COL].values.reshape(-1))
# Fit categorical encoder.
enc = preprocessing.OneHotEncoder()
enc.fit(df[CAT_COLS])
# Encode 1/0 cols as boolean.
df[BOOL_COLS+TGT_COLS] = df[BOOL_COLS+TGT_COLS].astype(bool)

In [62]:
## Helper functions

def logistic(z):
    """Apply the logistic sigmoid 1 / (1 + e^(-z)) element-wise to `z`."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def enc_bool(seq):
    """Map the values of a pandas object from {0/False, 1/True} to {-1, +1}.

    Returns the underlying array (`seq.values`) after the affine map 2*v - 1.
    """
    raw = seq.values
    doubled = 2 * raw
    return doubled - 1

In [85]:
def process_df(df):
    """Turn a bill dataframe into a sparse feature matrix plus target arrays.

    Only the configured columns that are actually present in `df` are used, so
    callers may pass a frame with some feature or target columns removed.

    Feature layout (left to right): tf-idf of TEXT_COL (if present), one-hot of
    CAT_COLS (if present), then BOOL_COLS encoded as -1/+1.

    Returns:
        A list `[X, *targets]` where `X` is a CSR matrix and `targets` holds one
        flat array per column of TGT_COLS that exists in `df`, in TGT_COLS order.
        With every target column present this is `[X, PLaw, Gender]`.
    """
    avail_cols = set(df.keys())
    bool_cols = [c for c in BOOL_COLS if c in avail_cols]
    cat_cols = [c for c in CAT_COLS if c in avail_cols]
    # Targets are filtered like the features: a frame without e.g. 'Gender' used to
    # raise KeyError here even though the feature columns were handled gracefully.
    tgt_cols = [c for c in TGT_COLS if c in avail_cols]
    text_col = TEXT_COL if TEXT_COL in avail_cols else None

    # get categorical and boolean features.
    bool_dset = enc_bool(df[bool_cols]) # assume that we have some boolean cols at least.
    if cat_cols:
        # toarray() yields a plain ndarray directly (no np.matrix round-trip).
        cat_dset = enc.transform(df[cat_cols]).toarray()
        var_dset = np.concatenate([cat_dset, bool_dset], axis=1)
    else:
        var_dset = bool_dset
    sp_dset = sp.csr_matrix(var_dset)

    # get text features.
    if text_col:
        text_dset = vec.transform(df[text_col].values.reshape(-1))
        dset = sp.csr_matrix(sp.hstack([text_dset, sp_dset]))
    else:
        dset = sp_dset

    targets = [df[tgt_col].values.reshape(-1) for tgt_col in tgt_cols]
    return [dset, *targets]

## Modeling

In [86]:
def get_probs(df):
    """Return the mean of `PLaw` for rows where `Gender` is True, then where it is False.

    The result is the tuple `(rate_gender_true, rate_gender_false)`; the notebook
    reads these as the success rates for women and men respectively.
    """
    is_xx = df['Gender']
    bills_xx = df[is_xx]
    bills_xy = df[~is_xx]
    return bills_xx['PLaw'].mean(), bills_xy['PLaw'].mean()

## Question 1: Empirical Facts

In [87]:
# Fraction of rows with Gender == True, reported as a percentage.
female_share = df['Gender'].mean()
perc_xx = 100 * female_share
# Raw passage rates by gender over the whole cleaned dataset.
plaw_xx, plaw_xy = get_probs(df)
print('Of bill proposals, %.3f%% were by women.' % perc_xx)
print('Success rates: %.3f (women), %.3f (men)' % (plaw_xx, plaw_xy))

Of bill proposals, 8.053% were by women.
Success rates: 0.028 (women), 0.043 (men)


## Question 2: Is this explained by women {choosing harder topics, being in the wrong party}?

Answer: No. Across most topic/party combinations, women continue to have roughly half the success rate of men. Interestingly, the gap is worse for Democratic women than for Republican women.

In [88]:
# Restrict to one topic/party combination: topic label 'm21' and Party == True.
# Change the two conditions to inspect other slices.
rest_df = df[(df['Major'] == 'm21') & df['Party']]
df_xx, df_xy = rest_df[rest_df['Gender']], rest_df[~rest_df['Gender']]
# Group sizes, so the rates below can be judged against their sample size.
print(len(df_xx), len(df_xy))
# Reuse the shared helper instead of re-deriving the per-gender rates inline.
plaw_xx, plaw_xy = get_probs(rest_df)
print('Success rates: %.3f (women), %.3f (men)' % (plaw_xx, plaw_xy))

967 10069
Success rates: 0.054 (women), 0.096 (men)


## Question 3: What if we factor in the bill's title?

For bill $i$, let $x_i$ be a representation of the bill built from its `Title`, `Major` (topic), `Party`, and `Majority` columns. Furthermore, let $z_i$ be the gender of the proposer (1 for female, 0 for male), and $y_i$ be whether the bill became a law.

To determine the effect of gender on bill-passing, we wish to compute the average log-ratio between the probability of the bill passing given gender=male vs gender=female. This is expressed by

$$\mathbb{E}_{x \sim \mathcal D}\bigg[\log\dfrac{P(Y=1 \mid X=x, Z=0)}{P(Y=1 \mid X=x, Z=1)}\bigg]$$

where $\mathcal D$ is the distribution over $X$, in this case simply the empirical distribution over the provided dataset.

To compute $P(Y=1 \mid X=x, Z=z)$, split into $\dfrac{P(Y=1, Z=z \mid X=x)}{P(Z=z \mid X=x)}$.

So, the model we need is for $P(Y, Z \mid X)$.

Alternate approach: compute the difference between modeled $P(Y \mid X, Z)$ and $P(Y \mid X)$?

The relationship is $\dfrac{P(Y \mid X, Z)}{P(Y \mid X)} = \dfrac{P(Y, Z \mid X)}{P(Y \mid X)P(Z \mid X)} = \dfrac{P(Z \mid X, Y)}{P(Z \mid X)}$

What is the issue with simply setting up a linear model to compute $P(Y=1 \mid X, Z)$? The issue is that the linear model will learn $P(X \mid Z)$

In [89]:
## For class balancing, subsample an even number of bills that did and did not become law.
# df_p = df[df['PLaw'] == True]
# df_n = df[df['PLaw'] == False].sample(5*len(df_p))
# small_df = pd.concat([df_p, df_n])

# Fixed seed: without it every run draws a different subsample, so none of the
# numbers reported further down can be reproduced.
SAMPLE_SEED = 0
# Cap at the number of available rows so the cell also runs on a smaller frame.
small_df = df.sample(min(40000, len(df)), random_state=SAMPLE_SEED) # alternative: simply subsample.

## Also subsample by gender
# df_xx = small_df[small_df['Gender'] == True]
# df_xy = small_df[small_df['Gender'] == False].sample(2*len(df_xx))
# small_df = pd.concat([df_xx, df_xy])

In [90]:
# Split into train / dev set (80/20). The seed pins the partition so that model
# metrics are comparable between runs.
SPLIT_SEED = 0
train_df, dev_df = train_test_split(small_df, test_size=0.2, random_state=SPLIT_SEED)
# Process data: X = features, Y = PLaw (became law), Z = Gender.
trnX, trnY, trnZ = process_df(train_df)
devX, devY, devZ = process_df(dev_df)

In [94]:
# Construct logistic regression on Y (PLaw).
clsY = LogisticRegression(solver='lbfgs', max_iter=10000, C=3)
# Pre-sigmoid score (logit) for each row of x. `@` is used rather than `*`:
# `*` is a matrix product only for scipy.sparse *matrix* inputs and would be an
# element-wise product for sparse arrays or dense ndarrays.
clsY.predict_unnorm = lambda x: (x @ clsY.coef_[0]) + clsY.intercept_[0]
clsY.fit(trnX, trnY)

# evaluate the model.
pred_pr = clsY.predict_proba(devX)
pred_y = clsY.predict(devX)
acc = metrics.accuracy_score(devY, pred_y)
loss = metrics.log_loss(devY, pred_pr)
print('acc: %.3f, loss: %.3f' % (acc, loss))

# Sanity check: evaluate alternate computation of output on devset
Zs = clsY.predict_unnorm(devX)
pred = (Zs > 0)
assert np.abs((pred == devY).mean() - acc) < 1e-5
# The hand-rolled logit must agree with scikit-learn's own decision function.
assert np.allclose(Zs, clsY.decision_function(devX))

acc: 0.965, loss: 0.124


In [None]:
# Construct logistic regression on Z (Gender), i.e. a model of P(Z | X).
clsZ = LogisticRegression(solver='lbfgs', max_iter=10000, C=3)
# Pre-sigmoid score (logit); `@` keeps this a matrix-vector product for any input type.
clsZ.predict_unnorm = lambda x: (x @ clsZ.coef_[0]) + clsZ.intercept_[0]
# Fit against the gender targets (previously this re-fit on trnY, duplicating clsY).
clsZ.fit(trnX, trnZ)

# evaluate the model (this model, against the gender labels of the dev set).
pred_pr = clsZ.predict_proba(devX)
pred_y = clsZ.predict(devX)
acc = metrics.accuracy_score(devZ, pred_y)
loss = metrics.log_loss(devZ, pred_pr)
print('acc: %.3f, loss: %.3f' % (acc, loss))

# Sanity check: evaluate alternate computation of output on devset
Zs = clsZ.predict_unnorm(devX)
pred = (Zs > 0)
assert np.abs((pred == devZ).mean() - acc) < 1e-5

In [18]:
# Compute optimal weight on gender feature, having frozen other weights/features.
# The frozen model is the PLaw classifier `clsY`; the bare name `cls` is not
# defined anywhere in this notebook and only worked from stale kernel state.
Zs = clsY.predict_unnorm(trnX)
# Gender as a -1/+1 feature (True -> +1, False -> -1).
feat_gender = enc_bool(train_df['Gender'])

w_gender = 0

## run gradient descent only for the gender-weight.
# Each step moves w_gender along the mean of (y - sigmoid(logit)) * feature, i.e.
# the gradient of the mean log-likelihood w.r.t. this single weight. The step size
# is annealed over four stages of 100 iterations each.
for eta in [10.0, 1.0, 0.1, 0.01]:
    for _ in range(100):
        w_gender += eta * np.mean((trnY - logistic(Zs + w_gender * feat_gender)) * feat_gender)

print(w_gender)

def without_gender(df):
    """Return the frozen PLaw model's logits for the bills in `df`, ignoring gender.

    Gender is not among the feature columns built by `process_df`, so the feature
    matrix is already gender-free; only the first returned element (the features)
    is used and any target arrays are discarded.
    """
    # `X, *_` tolerates any number of returned target arrays; the previous
    # two-value unpack did not match process_df's return value.
    X, *_ = process_df(df)
    # `clsY` is the fitted PLaw classifier (the bare name `cls` is undefined here).
    return clsY.predict_unnorm(X)

def with_gender(df):
    """Return the frozen PLaw model's logits for `df` plus the learned gender term.

    The gender term is `w_gender * g`, where `g` is +1 for rows with Gender == True
    and -1 otherwise (see `enc_bool`), and `w_gender` is the module-level weight
    fitted separately from the base model.
    """
    # `X, *_` tolerates any number of returned target arrays; the previous
    # two-value unpack did not match process_df's return value.
    X, *_ = process_df(df)
    feat_gender = enc_bool(df['Gender'])
    # `clsY` is the fitted PLaw classifier (the bare name `cls` is undefined here).
    return clsY.predict_unnorm(X) + w_gender * feat_gender

-0.04396989301719478


In [19]:
# Evaluate on devset
# Logits from the frozen model alone, and with the separately fitted gender term added.
Zs = without_gender(dev_df)
Zs_gender = with_gender(dev_df)

# A positive logit corresponds to a predicted probability above 0.5.
pred_y = (Zs_gender > 0)
pred_pr = logistic(Zs_gender)
loss = metrics.log_loss(devY, pred_pr)
acc = metrics.accuracy_score(devY, pred_y)
print('acc: %.3f, loss: %.3f' % (acc, loss))

acc: 0.963, loss: 0.131


In [20]:
# Boolean row selector over the dev set: True where Gender is True.
mask = dev_df['Gender'].values.astype(bool)

# Gender-blind predicted probabilities, computed separately for each group.
dev_xx = dev_df[mask]
dev_xy = dev_df[~mask]
prob_XX = logistic(without_gender(dev_xx))
prob_XY = logistic(without_gender(dev_xy))
print('%.3f %.3f' % (prob_XX.mean(), prob_XY.mean()))

0.033 0.041


In [22]:
# Mean predicted pass probability for the Gender == True rows of the dev set:
# first from the gender-blind logits, then with the fitted gender term added.
prob_xx1 = prob_XX.mean()
prob_xx2 = logistic(with_gender(dev_df[mask])).mean()
# The printed value is the ratio of the two means (a multiplicative factor), not
# an absolute decrease, so the message says so.
print('With an independent linear expert, unk->female scales prob by a factor of %.3f' % (prob_xx2/prob_xx1))

With an independent linear expert, unk->female decreases prob by 0.963
