## Use states as voters instead of congressmen

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import re

import glob, os
import itertools
from tqdm import tqdm

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from shapely.geometry import LineString

import shapely

In [2]:
# Load the member table (HSall_members.csv) and the party table (HSall_parties.csv).
# Both frames are read as module-level globals by create_members_df() below.
mmb = pd.read_csv('../dataset/HSall_members.csv')
pc = pd.read_csv('../dataset/HSall_parties.csv')

In [23]:
def create_members_df():
    """Build one row per member (``icpsr``) from the global ``mmb`` and ``pc`` frames.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        * ``icpsr`` - member identifier (the grouping key).
        * ``congress`` - list of every ``congress`` value found on the member's rows.
        * ``party_code`` - sorted list of the distinct party codes on those rows.
        * ``bioname``, ``state_abbrev``, ``district_code`` - values taken from the
          member's last row in ``mmb``.
        * ``party_name`` - ``party_code`` translated through ``pc``.

    Raises
    ------
    KeyError
        If a member carries a party code that does not appear in ``pc``.
    """
    per_member = mmb.groupby('icpsr', as_index=False)
    congresses = per_member[['congress']].agg(lambda values: list(values))
    # Sort the distinct codes: the derived party_name lists are compared with ==
    # elsewhere in this notebook, and the iteration order of a set can depend on
    # the order in which the rows were inserted.
    parties = per_member[['party_code']].agg(lambda codes: sorted(set(codes)))
    members_df = congresses.merge(parties)

    # A member can have several rows in mmb; the per-member attributes come from the last one.
    last_row = mmb.drop_duplicates(subset='icpsr', keep='last').set_index('icpsr')
    for column in ('bioname', 'state_abbrev', 'district_code'):
        members_df[column] = members_df['icpsr'].map(last_row[column])

    # party_code -> party_name; when a code is listed several times the last name wins.
    party_names = dict(zip(pc['party_code'], pc['party_name']))
    members_df['party_name'] = members_df['party_code'].apply(
        lambda codes: [party_names[code] for code in codes]
    )
    return members_df

# One row per icpsr; the call takes no arguments and reads the global `mmb` and `pc` frames.
members = create_members_df()

_________________________________________________________________________________________________

In [24]:
# Lookup tables keyed by icpsr, taken column by column from the per-member frame.
members2state_dic = dict(zip(members['icpsr'], members['state_abbrev']))
members2district_dic = dict(zip(members['icpsr'], members['district_code']))
members_party_dict = dict(zip(members['icpsr'], members['party_name']))

In [25]:
# Load the vote records of congress 110, used as the worked example in this notebook.
# Later cells read its `id`, `vote` and `icpsr` columns.
congress_110 = pd.read_csv('../dataset/download_votes_merged/congress_110.csv')

In [26]:
# Translate each voter's icpsr into the member's state and district.
congress_110['state_abbrev'] = congress_110['icpsr'].map(members2state_dic)
# The district column comes from the district lookup (it was previously filled
# from the state lookup, which made it a copy of `state_abbrev`).
congress_110['code_district'] = congress_110['icpsr'].map(members2district_dic)

Create the member-based edgelist (if you didn't do it already)

In [12]:
def create_edgelist_from_congress(congress, members_party_dict):
    """Build the signed member-to-member co-voting edgelist of one congress.

    For every vote (rows sharing the same ``id``) each pair of members who both
    voted "Yea" or both voted "Nay" contributes one agreement (``weight`` 1), and
    each "Yea"/"Nay" pair contributes one disagreement (``weight`` -1). Any other
    value in the ``vote`` column is ignored.

    Parameters
    ----------
    congress : pandas.DataFrame
        Vote records with at least the columns ``id``, ``vote`` and ``icpsr``.
    members_party_dict : dict
        Maps an icpsr to the member's party value; two members are "in"-party
        when their values compare equal.

    Returns
    -------
    pandas.DataFrame
        One row per (source, target, weight) with ``source < target``, sorted by
        those three columns, and the columns ``source``, ``target``, ``weight``,
        ``count`` (votes with that sign), ``party`` ("in"/"out"),
        ``votes_togheter`` (agreements plus disagreements of the pair) and
        ``perc`` (``count / votes_togheter``). An empty frame with these columns
        is returned when no pair was found.

    Raises
    ------
    KeyError
        If a member of some pair is missing from ``members_party_dict``.
    """
    columns = ['source', 'target', 'weight', 'count', 'party', 'votes_togheter', 'perc']
    tally = {}  # (source, target, weight) -> number of votes with that sign

    def _record(pairs, sign):
        """Add the member pairs of one vote to the tally, keyed with source < target."""
        for first, second in pairs:
            if first < second:
                key = (first, second, sign)
            elif second < first:
                key = (second, first, sign)
            else:
                continue  # same member twice, or ids that cannot be ordered: not an edge
            tally[key] = tally.get(key, 0) + 1

    # Accumulating into a dictionary keeps memory bounded by the number of distinct
    # pairs and avoids regrouping the whole accumulated edgelist after every vote.
    for _, ballots in tqdm(congress.groupby('id', sort=False)):
        yeas = ballots.loc[ballots['vote'] == 'Yea', 'icpsr'].tolist()
        nays = ballots.loc[ballots['vote'] == 'Nay', 'icpsr'].tolist()
        _record(itertools.combinations(yeas, 2), 1)   # both voted "Yea"
        _record(itertools.combinations(nays, 2), 1)   # both voted "Nay"
        _record(itertools.product(yeas, nays), -1)    # opposite sides

    if not tally:
        return pd.DataFrame(columns=columns)

    edgelist = pd.DataFrame(
        [(source, target, weight, count) for (source, target, weight), count in tally.items()],
        columns=['source', 'target', 'weight', 'count'],
    )
    edgelist = edgelist.sort_values(['source', 'target', 'weight']).reset_index(drop=True)

    # "in" when both members map to an equal party value, "out" otherwise.
    edgelist['party'] = [
        'in' if members_party_dict[source] == members_party_dict[target] else 'out'
        for source, target in zip(edgelist['source'], edgelist['target'])
    ]

    # Total number of votes the pair took part in together, whatever the sign.
    edgelist['votes_togheter'] = edgelist.groupby(['source', 'target'])['count'].transform('sum')
    edgelist['perc'] = edgelist['count'] / edgelist['votes_togheter']

    return edgelist

In [13]:
# Build the member-level edgelist for congress 110.
# This is slow: the progress bar recorded below shows 1865 votes processed in 05:43.
edgelist110 = create_edgelist_from_congress(congress_110, members_party_dict)

100%|██████████| 1865/1865 [05:43<00:00,  5.42it/s]


In [27]:
# Annotate both endpoints of every edge with the member's state and district,
# using the icpsr-keyed lookup dictionaries built earlier in the notebook.
edgelist110['source_state'] = edgelist110['source'].map(members2state_dic)
edgelist110['target_state'] = edgelist110['target'].map(members2state_dic)
edgelist110['source_district'] = edgelist110['source'].map(members2district_dic)
edgelist110['target_district'] = edgelist110['target'].map(members2district_dic)

In [28]:
# Preview the first ten annotated edges.
# NOTE(review): the recorded output shows an `Unnamed: 0` column that no visible cell creates;
# presumably edgelist110 was reloaded from a saved CSV in a cell that is not shown - confirm.
edgelist110.head(10)

Unnamed: 0,source,target,weight,count,party,votes_togheter,perc,source_state,target_state,source_district,target_district
0,2605,10713,-1,75,in,1647,0.045537,MI,MI,12.0,13.0
1,2605,10713,1,1572,in,1647,0.954463,MI,MI,12.0,13.0
2,2605,12036,-1,55,in,1747,0.031483,MI,WI,12.0,7.0
3,2605,12036,1,1692,in,1747,0.968517,MI,WI,12.0,7.0
4,2605,13035,-1,45,in,1609,0.027968,MI,NY,12.0,13.0
5,2605,13035,1,1564,in,1609,0.972032,MI,NY,12.0,13.0
6,2605,13047,-1,896,out,1694,0.528926,MI,FL,12.0,13.0
7,2605,13047,1,798,out,1694,0.471074,MI,FL,12.0,13.0
8,2605,14045,-1,857,out,1759,0.487209,MI,OH,12.0,16.0
9,2605,14045,1,902,out,1759,0.512791,MI,OH,12.0,16.0


Generate the state networks ensemble

In [56]:
# Collect every state that appears on either end of an edge. Missing states (NaN)
# are dropped: they can never match an equality filter and cannot be sorted with strings.
all_states = set(edgelist110['source_state'].dropna()).union(edgelist110['target_state'].dropna())
# Sort before pairing so that every unordered pair gets the same (state1, state2)
# orientation on every run; the iteration order of a set of strings is not stable
# across interpreter sessions.
all_states_pairs = list(itertools.combinations(sorted(all_states), 2))

In [67]:
def state_edge_from_random_sample(edgelist, state1, state2, f):
    """Sign of the edge between two states, estimated from a random sample of their votes.

    The agreement and disagreement counts of every member edge joining the two
    states are pooled, a fraction ``f`` of the pooled votes is drawn without
    replacement, and the majority sign of the draw is returned.

    Parameters
    ----------
    edgelist : pandas.DataFrame
        Member edgelist with the columns ``source_state``, ``target_state``,
        ``weight`` (1 or -1) and ``count``.
    state1, state2 : str
        The two states; their order does not matter.
    f : float
        Fraction of the pooled votes to sample (the sample size is truncated
        to an integer).

    Returns
    -------
    int or None
        1 if agreements dominate the sample, -1 if disagreements dominate,
        ``None`` on a tie or when the sample is empty.

    Raises
    ------
    ValueError
        If ``f`` asks for more votes than are available.
    """
    # Member edges are stored once per pair of members, so a pair of states can
    # appear in either orientation; both have to be selected.
    forward = (edgelist['source_state'] == state1) & (edgelist['target_state'] == state2)
    backward = (edgelist['source_state'] == state2) & (edgelist['target_state'] == state1)
    between = edgelist[forward | backward]

    n_pos = int(between.loc[between['weight'] == 1, 'count'].sum())    # votes cast together
    n_neg = int(between.loc[between['weight'] == -1, 'count'].sum())   # votes cast against each other

    n_draws = int((n_pos + n_neg) * f)
    if n_draws < 1:
        return None

    # Drawing n_draws votes without replacement from n_pos agreements and n_neg
    # disagreements: the number of agreements drawn is hypergeometric, so there
    # is no need to materialise one list element per vote.
    pos_drawn = int(np.random.hypergeometric(n_pos, n_neg, n_draws))
    balance = 2 * pos_drawn - n_draws    # agreements minus disagreements in the sample

    if balance > 0:
        return 1
    if balance < 0:
        return -1
    return None

In [68]:
# For every pair of states, sample one signed edge from the member-level edgelist
# and collect the results in a state-level edgelist.
def create_state_edgelist(edgelist, all_states_pairs, frac):
    """Build a state-level signed edgelist with one sampled edge per pair of states.

    Parameters
    ----------
    edgelist : pandas.DataFrame
        Member edgelist, passed unchanged to ``state_edge_from_random_sample``.
    all_states_pairs : iterable of (state1, state2)
        The pairs of states to evaluate, in output order.
    frac : float
        Sampling fraction forwarded to ``state_edge_from_random_sample``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_state``, ``target_state`` and ``weight``, one row per
        pair, indexed 0..n-1.
    """
    columns = ['source_state', 'target_state', 'weight']
    # Build every one-row frame first and concatenate once, instead of copying
    # the growing result on each iteration.
    rows = [
        pd.DataFrame({
            'source_state': [state1],
            'target_state': [state2],
            'weight': [state_edge_from_random_sample(edgelist, state1, state2, frac)],
        })
        for state1, state2 in all_states_pairs
    ]
    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.concat(rows, ignore_index=True)

In [70]:
# Create an ensemble of `dim_ensemble` sampled state edgelists from congress 110
dim_ensemble = 20

# Seed NumPy's global generator, which the sampling function draws from, so that
# the same ensemble can be regenerated after a kernel restart.
np.random.seed(110)

# exist_ok replaces the separate existence check and cannot fail if the folder
# appears between the check and the creation.
os.makedirs('./ensemble_edgelist110', exist_ok=True)

for i in tqdm(range(dim_ensemble)):
    # Each ensemble member samples half (0.5) of the votes behind every pair of states.
    temp = create_state_edgelist(edgelist110, all_states_pairs, 0.5)
    temp.to_csv('./ensemble_edgelist110/edgelist110_'+str(i)+'.csv', index=False)


100%|██████████| 20/20 [17:01<00:00, 51.07s/it]
