## Use states as voters instead of congressmen

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import re

import glob, os
import itertools
from tqdm import tqdm

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from shapely.geometry import LineString

import shapely

In [2]:
mmb = pd.read_csv('../dataset/HSall_members.csv')
pc = pd.read_csv('../dataset/HSall_parties.csv')

In [3]:
def create_members_df():
    temp_congress = mmb.groupby('icpsr', as_index=False)[['congress']].agg(lambda x: list(x))
    temp_party = mmb.groupby('icpsr', as_index=False)[['party_code']].agg(lambda x: list(set(x)))
    temp_congress = temp_congress.merge(temp_party)
    temp_congress['bioname'] = temp_congress['icpsr'].map(mmb[['icpsr', 'bioname']].set_index('icpsr').to_dict()['bioname'])
    temp_congress['state_abbrev'] = temp_congress['icpsr'].map(mmb[['icpsr', 'state_abbrev']].set_index('icpsr').to_dict()['state_abbrev'])
    temp_congress['district_code'] = temp_congress['icpsr'].map(mmb[['icpsr', 'district_code']].set_index('icpsr').to_dict()['district_code'])
    party_codes_dic = pc[['party_name', 'party_code']].set_index('party_code').to_dict()['party_name']
    temp_congress['party_name'] = temp_congress['party_code'].apply(lambda x: [party_codes_dic[y] for y in x])
    return temp_congress

members = create_members_df()

_________________________________________________________________________________________________

In [4]:
n_congress = '100'

In [23]:
members2state_dic = members.set_index('icpsr')['state_abbrev'].to_dict()
members2district_dic = members.set_index('icpsr')['district_code'].to_dict()
members_party_dict = members.set_index('icpsr')['party_name'].to_dict()

# try for congress 110
congress_prova = pd.read_csv('../dataset/download_votes_merged/congress_'+n_congress+'.csv')

# add the column of state_abbrev translating the icpsr
congress_prova['state_abbrev'] = congress_prova['icpsr'].map(members2state_dic)
congress_prova['code_district'] = congress_prova['icpsr'].map(members2state_dic)
congress_prova['member_party_name'] = congress_prova['icpsr'].map(members_party_dict)

In [22]:
congress_prova.head()

Unnamed: 0,bill_number,id,description,chamber,date,vote_desc,vote_result,congress,rollnumber,Peltzman,Clausen,name,party_short_name,icpsr,vote,state_abbrev,code_district,member_party_name
0,,RH1000444,"TO APPROVE THE HOUSE JOURNAL OF WEDNESDAY, DEC...",House,1987-12-10,,,100,444,Internal Organization,Miscellaneous Policy,"REAGAN, Ronald Wilson",Republican,99907,Abs,USA,USA,[Republican]
1,,RH1000444,"TO APPROVE THE HOUSE JOURNAL OF WEDNESDAY, DEC...",House,1987-12-10,,,100,444,Internal Organization,Miscellaneous Policy,"CALLAHAN, Herbert Leon (Sonny)",Republican,15090,Nay,AL,AL,[Republican]
2,,RH1000444,"TO APPROVE THE HOUSE JOURNAL OF WEDNESDAY, DEC...",House,1987-12-10,,,100,444,Internal Organization,Miscellaneous Policy,"DICKINSON, William Louis",Republican,10717,Nay,AL,AL,[Republican]
3,,RH1000444,"TO APPROVE THE HOUSE JOURNAL OF WEDNESDAY, DEC...",House,1987-12-10,,,100,444,Internal Organization,Miscellaneous Policy,"NICHOLS, William Flynt",Democrat,11037,Yea,AL,AL,[Democrat]
4,,RH1000444,"TO APPROVE THE HOUSE JOURNAL OF WEDNESDAY, DEC...",House,1987-12-10,,,100,444,Internal Organization,Miscellaneous Policy,"BEVILL, Tom",Democrat,11000,Yea,AL,AL,[Democrat]


Create the member-based edgelist (if you didn't do it already)

In [52]:
frac = 1.0

In [53]:
def create_state_edgelist_from_congress(congress, frac):
	edgelist = pd.DataFrame()

	def state_party_rnd_func(state, congress):
		# select all the lines of the congress where the column state_abbrev is equal to state
		congress_state = congress[congress['state_abbrev'] == state]
		#randomly select 75% of the lines
		congress_state_rnd = congress_state.sample(frac=frac)
		# return the party which has the majority of the randomly selected lines
		return congress_state_rnd['member_party_name'].value_counts().index[0]

	for voteid in tqdm(set(congress['id'])):                 # iterate over all votes id (ids are unique for each vote)

		temp = congress[congress['id'] == voteid]            # select the rows where the vote id is equal to the current vote id            

		yy = temp[temp['vote']=='Yea']['state_abbrev']              # select the icpsr of the members that voted "Yea"
		nn = temp[temp['vote']=='Nay']['state_abbrev']                        

		y = itertools.combinations(yy, 2)                    # all possible combinations of 2 members that voted "Yea"
		n = itertools.combinations(nn, 2)                
		o = itertools.product(yy, nn)                        # cartesian product of the 2 series

		y = pd.DataFrame(y, columns = ['source', 'target'])  # create a dataframe from the combinations of "Yea" voters
		y['weight'] = 1                                      # add a column with the weight of the edge
		y['count'] = 1                                         

		n = pd.DataFrame(n, columns = ['source', 'target'])     
		n['weight'] = 1                                         
		n['count'] = 1                                          

		o = pd.DataFrame(o, columns = ['source', 'target'])     
		o['weight'] = -1                                     # same but the link is negative                    
		o['count'] = 1                                          

		edgelist = pd.concat([edgelist, y, n, o])
		edgelist = edgelist.groupby(['source', 'target', 'weight']).sum().reset_index()     # group by source, target and weight and sum the count                  

	edgelist = pd.concat([edgelist, pd.DataFrame({
		'source': edgelist['target'],                        # new columns based on old columns: 
		'target': edgelist['source'],                        #   'newcolumn': dataframe['oldcolumn']
		'weight': edgelist['weight'],
		'count': edgelist['count']})])

	edgelist = edgelist.loc[edgelist['source'] < edgelist['target']]                    # remove duplicates
	edgelist = edgelist.groupby(['source', 'target', 'weight']).sum().reset_index()     # group by source, target and weight and sum the count
	edgelist['party'] = edgelist.apply(lambda row: 'in' if state_party_rnd_func(row['source'], congress) == state_party_rnd_func(row['target'], congress) else 'out', axis=1)   # create a column with the party of the edge

	map_votes = edgelist.groupby(['source', 'target'])['count'].sum().to_dict()                                                                         # create a dictionary with the number of votes togheter for each pair of nodes                               

	edgelist['votes_togheter'] = edgelist[['source', 'target']].apply(lambda x: map_votes[(x['source'], x['target'])], axis=1)
	edgelist['perc'] = edgelist['count']/edgelist['votes_togheter']

	return edgelist

In [54]:
def compute_threshold_intersx(df, weight):	#df is the edgelist, weight is the weight of the edge

	def _midpoint(p1, p2):
		return {'x': (p1['x']+p2['x'])/2, 'y': (p1['y']+p2['y'])/2}

	def line_intersection(in_party, out_party, intersect_points):
		index_in = np.argmax(in_party[1])
		index_out = np.argmax(out_party[1])

        # points of the mean of the distributions 
		point_in={'x': in_party[0][index_in], 'y': in_party[1][index_in]}
		point_out={'x': out_party[0][index_out], 'y': out_party[1][index_out]}

        # medianpoint (mean of the means) of the two distributions
		midpoint = _midpoint(point_in, point_out)
        
        #find index of intersection closer to midpoint
		index_closer = np.argmin([np.sqrt( (p[0] - midpoint['x'])**2 + (p[1] - midpoint['y'])**2 ) for p in intersect_points])

        # return x value of closer intersection
		return intersect_points[index_closer][0]

	x0 = df.loc[(df['party']=='in')&(df['weight'] == weight)]['perc']
	x1 = df.loc[(df['party']=='out')&(df['weight'] == weight)]['perc']
    
	bw = len(x0)**(-1./(2+4))
	kde0 = gaussian_kde(x0, bw_method=bw)
	bw = len(x1)**(-1./(2+4))
	kde1 = gaussian_kde(x1, bw_method=bw)

	xmin = min(x0.min(), x1.min())
	xmax = max(x0.max(), x1.max())
	dx = 0.2 * (xmax - xmin) # add a 20% margin, as the kde is wider than the data
	xmin -= dx
	xmax += dx

	x = np.linspace(xmin, xmax, 500)
	kde0_x = kde0(x)
	kde1_x = kde1(x)
	inters_x = np.minimum(kde0_x, kde1_x)

	idx = np.argwhere(np.diff(np.sign(kde0_x - kde1_x))).flatten()
    
	threshold = line_intersection([x, kde0_x], [x, kde0_x], [[x,y] for x,y in zip (x[idx], kde1_x[idx])])
	area_inters_x = np.trapz(inters_x, x)

	return threshold, area_inters_x

In [56]:
edgelist_prova = create_state_edgelist_from_congress(congress_prova, frac)

  0%|          | 0/939 [00:00<?, ?it/s]

100%|██████████| 939/939 [00:45<00:00, 20.50it/s]


In [37]:
edgelist_prova.head(15)

Unnamed: 0,source,target,weight,count,party,votes_togheter,perc
0,AK,AL,-1,1740,out,5643,0.308347
1,AK,AL,1,3903,out,5643,0.691653
2,AK,AR,-1,1091,out,3206,0.340299
3,AK,AR,1,2115,out,3206,0.659701
4,AK,AZ,-1,1262,in,4135,0.3052
5,AK,AZ,1,2873,in,4135,0.6948
6,AK,CA,-1,13230,out,35601,0.371619
7,AK,CA,1,22371,out,35601,0.628381
8,AK,CO,-1,1744,out,4972,0.350764
9,AK,CO,1,3228,out,4972,0.649236


In [38]:
len_df = len(pd.read_csv('../dataset/download_votes_merged/congress_'+n_congress+'.csv')['bill_number'].unique())
threshold_pos, area_pos = compute_threshold_intersx(edgelist_prova, 1)
threshold_neg, area_neg = compute_threshold_intersx(edgelist_prova, -1)

df_output = pd.DataFrame({
	'pos_threshold': [threshold_pos],
	'pos_area': [area_pos],
	'neg_threshold': [threshold_neg],
	'neg_area': [area_neg],
	'n_votes': [len_df]
})

df_output.to_csv('./ensemble_edgelist_prova/prova_threshold.csv', index=False)

Filter the edgelist using the thresholds

In [40]:
df_pos = edgelist_prova.loc[(edgelist_prova['weight']==1)&(edgelist_prova['perc']>threshold_pos)].copy()
df_neg = edgelist_prova.loc[(edgelist_prova['weight']==-1)&(edgelist_prova['perc']>threshold_neg)].copy()
edgelist_prova_filtered = pd.concat([df_pos, df_neg])

In [41]:
edgelist_prova.head(10)

Unnamed: 0,source,target,weight,count,party,votes_togheter,perc
0,AK,AL,-1,1740,out,5643,0.308347
1,AK,AL,1,3903,out,5643,0.691653
2,AK,AR,-1,1091,out,3206,0.340299
3,AK,AR,1,2115,out,3206,0.659701
4,AK,AZ,-1,1262,in,4135,0.3052
5,AK,AZ,1,2873,in,4135,0.6948
6,AK,CA,-1,13230,out,35601,0.371619
7,AK,CA,1,22371,out,35601,0.628381
8,AK,CO,-1,1744,out,4972,0.350764
9,AK,CO,1,3228,out,4972,0.649236


In [42]:
edgelist_prova_filtered.head(10)

Unnamed: 0,source,target,weight,count,party,votes_togheter,perc
1,AK,AL,1,3903,out,5643,0.691653
3,AK,AR,1,2115,out,3206,0.659701
5,AK,AZ,1,2873,in,4135,0.6948
9,AK,CO,1,3228,out,4972,0.649236
15,AK,FL,1,9676,out,14645,0.660703
19,AK,HI,1,1063,in,1635,0.650153
21,AK,IA,1,3237,in,4914,0.65873
23,AK,ID,1,1086,in,1609,0.674953
29,AK,KS,1,2779,in,4126,0.673534
31,AK,KY,1,3956,out,5871,0.67382


_____________________________

Compute the least eigenvalues for a given edgelist: all it needs is a column 'source', a column 'target' and a column 'weight'

In [44]:
import sys
sys.path.append('../local/bin/')
from sygno.nodes_position import _get_xcoord

In [47]:
def compute_eigen(df):
    def compute_d_max(G):
        max_d = 0
        for edge in G.edges(data=True):
            temp = G.degree[edge[0]] + G.degree[edge[1]]
            if temp > max_d:
                max_d = temp
        return max_d / 2

    G = nx.from_pandas_edgelist(df, 'source', 'target', ['weight'])
    df, least_eigenvalue = _get_xcoord(G, compact=False)
    print(least_eigenvalue)
    max_d = compute_d_max(G)
    print(max_d)
    eig = 1-(least_eigenvalue/(max_d-1))
    return eig

In [48]:
eig = compute_eigen(edgelist_prova_filtered)
eig

8.354
50.0


0.8295102040816327

______________________________________________________________