# Topic disaggregation

### Repeat the same process on data previously disaggregated according to the categories of the vote

In [2]:
import pandas as pd
import networkx as nx
import numpy as np
import re

import glob, os
import itertools
from tqdm import tqdm

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn as sns
from shapely.geometry import LineString
from matplotlib.lines import Line2D

In [3]:
import sys
import importlib

sys.path.insert(0, '../local/bin')

import edgelist_creation
from edgelist_creation import *

In [4]:
importlib.reload(edgelist_creation)

<module 'edgelist_creation' from '/home/leonardo/projects/USA_Congress_SigNet/notebooks/../local/bin/edgelist_creation.py'>

In [10]:
members = pd.read_csv('../dataset/HSall_members.csv')
party_codes = pd.read_csv('../dataset/HSall_parties.csv')

members_info = create_members_df(members, party_codes)

member_party_dict = members_info.set_index('icpsr')['party_name'].to_dict()		# member_id -> party_name

In [11]:
def create_df_from_congress(congress, members_party_dict):
	edgelist = pd.DataFrame()

	for voteid in tqdm(set(congress['id'])):                 # iterate over all votes id (ids are unique for each vote)

		temp = congress[congress['id'] == voteid]            # select the rows where the vote id is equal to the current vote id            

		yy = temp[temp['vote']=='Yea']['icpsr']              # select the icpsr of the members that voted "Yea"
		nn = temp[temp['vote']=='Nay']['icpsr']                         

		y = itertools.combinations(yy, 2)                    # all possible combinations of 2 members that voted "Yea"
		n = itertools.combinations(nn, 2)                
		o = itertools.product(yy, nn)                        # cartesian product of the 2 series

		y = pd.DataFrame(y, columns = ['source', 'target'])  # create a dataframe from the combinations of "Yea" voters
		y['weight'] = 1                                      # add a column with the weight of the edge
		y['count'] = 1                                         

		n = pd.DataFrame(n, columns = ['source', 'target'])     
		n['weight'] = 1                                         
		n['count'] = 1                                          

		o = pd.DataFrame(o, columns = ['source', 'target'])     
		o['weight'] = -1                                     # same but the link is negative                    
		o['count'] = 1                                          

		edgelist = pd.concat([edgelist, y, n, o])                    

	edgelist = pd.concat([edgelist, pd.DataFrame({
		'source': edgelist['target'],                        # new columns based on old columns: 
		'target': edgelist['source'],                        #   'newcolumn': dataframe['oldcolumn']
		'weight': edgelist['weight'],
		'count': edgelist['count']})])

	edgelist = edgelist.loc[edgelist['source'] < edgelist['target']]                    # remove duplicates
	edgelist = edgelist.groupby(['source', 'target', 'weight']).sum().reset_index()     # group by source, target and weight and sum the count
	edgelist['party'] = edgelist.apply(lambda row: 'in' if members_party_dict[row['source']] == members_party_dict[row['target']] else 'out', axis=1)   # create a column with the party of the edge

	map_votes = edgelist.groupby(['source', 'target'])['count'].sum().to_dict()                                                                         # create a dictionary with the number of votes togheter for each pair of nodes                               

	edgelist['votes_togheter'] = edgelist[['source', 'target']].apply(lambda x: map_votes[(x['source'], x['target'])], axis=1)
	edgelist['perc'] = edgelist['count']/edgelist['votes_togheter']

	return edgelist

In [12]:
congress_path_senate = '../dataset/download_votes_merged/*'
congress_path_senate = '../dataset/download_votes_merged_senate/*'	# Path to all congresses files

In [None]:
# Initialize a weighted bipartite graph
G = nx.Graph()

# Iterate through each CSV file
for csv_file in tqdm(glob.glob(congress_path)):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)
    
    # Count the occurrences of each pair of Clausen and Peltzman categories
    counts = df.groupby(["Clausen", "Peltzman"]).size().reset_index(name="count")
    
    # Add edges to the graph with weights based on the counts
    for _, row in counts.iterrows():
        clausen_category = row["Clausen"]
        peltzman_category = row["Peltzman"]
        count = row["count"]
        
        G.add_node(clausen_category, bipartite=0)
        G.add_node(peltzman_category, bipartite=1)
        G.add_edge(clausen_category, peltzman_category, weight=count)

# Now you have a weighted bipartite graph (G) with edges representing the co-occurrences
# between Clausen and Peltzman categories and weights representing the number of occurrences.

In [None]:
print('Clausen categories: ', [n for n in G.nodes() if G.nodes[n]['bipartite'] == 0])
print('Number of Clausen categories: ', len([n for n in G.nodes() if G.nodes[n]['bipartite'] == 0]))

print('Peltzman categories: ', [n for n in G.nodes() if G.nodes[n]['bipartite'] == 1])
print('Number of Peltzman categories: ', len([n for n in G.nodes() if G.nodes[n]['bipartite'] == 1]))

In [None]:
def compute_kde_threshold(df, weight):
    
	def _midpoint(p1, p2):
		return {'x': (p1['x']+p2['x'])/2, 'y': (p1['y']+p2['y'])/2}

	def line_intersection(in_state, out_state, intersect_points):
		index_in = np.argmax(in_state[1])
		index_out = np.argmax(out_state[1])

        # points of the mean of the distributions 
		point_in={'x': in_state[0][index_in], 'y': in_state[1][index_in]}
		point_out={'x': out_state[0][index_out], 'y': out_state[1][index_out]}

        # medianpoint (mean of the means) of the two distributions
		midpoint = _midpoint(point_in, point_out)
        
        #find index of intersection closer to midpoint
		index_closer = np.argmin([np.sqrt( (p[0] - midpoint['x'])**2 + (p[1] - midpoint['y'])**2 ) for p in intersect_points])

        # return x value of closer intersection
		return intersect_points[index_closer][0]
    

	#label = "agree" if weight == 1 else "disagree"
	x0 = df.loc[(df['state']=='in')&(df['weight'] == weight)]['perc']
	x1 = df.loc[(df['state']=='out')&(df['weight'] == weight)]['perc']

	bw = len(x0)**(-1./(2+4))
	kde0 = gaussian_kde(x0, bw_method=bw)
	bw = len(x1)**(-1./(2+4))
	kde1 = gaussian_kde(x1, bw_method=bw)

	xmin = min(x0.min(), x1.min())
	xmax = max(x0.max(), x1.max())
	dx = 0.2 * (xmax - xmin) # add a 20% margin, as the kde is wider than the data
	xmin -= dx
	xmax += dx

	x = np.linspace(xmin, xmax, 500)
	kde0_x = kde0(x)
	kde1_x = kde1(x)

	idx = np.argwhere(np.diff(np.sign(kde0_x - kde1_x))).flatten()

	threshold = line_intersection([x, kde0_x], [x, kde0_x], [[x,y] for x,y in zip (x[idx], kde1_x[idx])])

	return threshold

find the thresholds for each congress

In [13]:
for csv_file in tqdm(glob.glob(congress_path)):
	congress = pd.read_csv(csv_file)
	edgelist = create_df_from_congress(congress, members_party_dict)
	threshoold_minus_1 = compute_kde_threshold(edgelist, -1)
	threshoold_plus_1 = compute_kde_threshold(edgelist, 1)


NameError: name 'congress_path' is not defined

In [None]:
# Function to scale the edge weights for edge thickness
def scale_edge_weights(weight_values, max_thickness=8):
    max_weight = max(weight_values)
    if max_weight == 0:
        return [0] * len(weight_values)
    return [max_thickness * w / max_weight for w in weight_values]

In [None]:
edge_color = 'black'  # Change this to any color you prefer

node_colors = [0 if G.nodes[n]['bipartite'] == 0 else 1 for n in G.nodes()]
node_colors = ['skyblue' if color == 0 else 'lightcoral' for color in node_colors]  # Choose node colors

edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
edge_thickness = scale_edge_weights(edge_weights)

plt.figure(figsize=(15, 8))
pos = nx.bipartite_layout(G, [n for n in G.nodes() if G.nodes[n]['bipartite'] == 1])
nx.draw_networkx_nodes(G, pos=pos, node_color=node_colors, node_size=100)
nx.draw_networkx_edges(G, pos=pos, width=edge_thickness, edge_color=edge_color, alpha=0.3)
nx.draw_networkx_labels(G, pos=pos, font_size=10, font_color='black', font_family='sans-serif')

legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='skyblue', markersize=10, label='Clausen cat.'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='lightcoral', markersize=10, label='Peltzman cat.')
]
plt.legend(handles=legend_elements, loc='upper center')

plt.axis('off')
plt.show()