# Topic disaggregation

### Repeat the same process on the data previously disaggregated by vote category

In [4]:
import pandas as pd
import networkx as nx
import numpy as np
import re

import glob, os
import itertools
from tqdm import tqdm

from scipy.stats import gaussian_kde


import shutil

In [5]:
import sys
# Put '../local/bin' (resolved against the current working directory) at the
# front of the module search path so modules stored there take precedence on import.
# NOTE(review): presumably this exposes project-local helper modules — confirm.
sys.path.insert(0, '../local/bin')

Merge categories into "clustered topics"

In [6]:
def merge_congress_files(folder_names, merged_folder_name):
    """Merge per-congress CSV files from several category folders into one folder.

    Every ``congress_<N>.csv`` found (recursively) under each folder of
    ``folder_names`` is written to ``<merged_folder_name>/congress_<N>.csv``,
    where ``<N>`` is the congress number parsed as an integer. When the
    destination file already exists the new rows are appended to it, so files
    for the same congress coming from different categories end up concatenated.

    Note that pre-existing files in the merged folder are appended to as well:
    calling the function twice with the same arguments duplicates the rows.

    Parameters
    ----------
    folder_names : iterable of str
        Category folders to read; relative paths are resolved against the
        current working directory.
    merged_folder_name : str
        Destination folder; created if it does not exist.

    Raises
    ------
    ValueError
        If a ``congress_*.csv`` file name does not carry an integer number.
    """
    # The destination does not depend on the category: resolve it once.
    merged_folder = os.path.join(os.getcwd(), merged_folder_name)
    os.makedirs(merged_folder, exist_ok=True)

    for category in folder_names:
        category_folder = os.path.join(os.getcwd(), category)

        for root, _, files in os.walk(category_folder):
            for file in files:
                if not (file.startswith("congress_") and file.endswith(".csv")):
                    continue

                congress_number = int(file.split("_")[1].split(".")[0])
                congress_file = os.path.join(root, file)
                merged_congress_file = os.path.join(merged_folder, f"congress_{congress_number}.csv")

                if os.path.exists(merged_congress_file):
                    # Same congress already merged from another category: append.
                    existing_data = pd.read_csv(merged_congress_file)
                    new_data = pd.read_csv(congress_file)
                    merged_data = pd.concat([existing_data, new_data], ignore_index=True)
                    merged_data.to_csv(merged_congress_file, index=False)
                else:
                    # Copy to the normalised name (not just into the folder), so
                    # that the existence check above finds it on the next
                    # category even when the source name is zero-padded
                    # (e.g. "congress_01.csv").
                    shutil.copy(congress_file, merged_congress_file)

In [None]:
# Example usage:
# Trial run: merge three Clausen categories of the House votes into one folder.
category_folders = [
    "../dataset/votes_house_clausen/Civil_Liberties/",
    "../dataset/votes_house_clausen/Miscellaneous_Policy/",
    "../dataset/votes_house_clausen/Government_Management/",
]
merged_folder = "../dataset/try_merged/"
merge_congress_files(folder_names=category_folders, merged_folder_name=merged_folder)

Define the clustered sets by listing the categories that belong to each one:

In [10]:
# Chamber and category scheme that select the dataset folder to work on.
chamber = 'house'
category_set = 'peltzman'
# Root folder holding one sub-folder per vote category.
p = f'../dataset/votes_{chamber}_{category_set}/'

In [12]:
# Category folders that make up the "internal" clustered topic.
internal = [
    p + 'Budget_General_Interest/',
    p + 'Budget_Special_Interest/',
    p + 'Regulation_General_Interest/',
    p + 'Regulation_Special_Interest/',
    p + 'Domestic_Social_Policy/',
    p + 'Government_Organization/',
    p + 'Internal_Organization/',
    p + 'D._C./',
]
# Category folders that make up the "foreign" clustered topic.
foreign = [
    p + 'Defense_Policy_Budget/',
    p + 'Defense_Policy_Resolution/',
    p + 'Foreign_Policy_Budget/',
    p + 'Foreign_Policy_Resolution/',
    p + 'Indian_Affairs/',
]

# Destination folders of the merged files. The capitalised names match the
# 'Internal/' and 'Foreign/' folders that the edgelist cells read from; the
# lowercase spelling only resolved to them on case-insensitive filesystems.
internal_folder = p + 'Internal/'
foreign_folder = p + 'Foreign/'


In [13]:
# Make sure both destination folders exist before merging into them.
for destination in (internal_folder, foreign_folder):
    if not os.path.exists(destination):
        os.makedirs(destination)

# Merge every category of each set into its clustered-topic folder.
# merge_congress_files appends to files that already exist, so running this
# cell again adds the same rows a second time.
merge_congress_files(internal, internal_folder)
merge_congress_files(foreign, foreign_folder)

  new_data = pd.read_csv(congress_file)


In [7]:
# Raw member and party tables.
members = pd.read_csv('../dataset/HSall_members.csv')
party_codes = pd.read_csv('../dataset/HSall_parties.csv')

# One row per member (icpsr) with aggregated congresses and parties.
members_info = create_members_df(members, party_codes)

# member_id -> party_name (the value is the list of party names built by create_members_df)
member_party_dict = dict(zip(members_info['icpsr'], members_info['party_name']))

________________________________

## Create edgelists

In [1]:
def create_members_df(members, party_codes):
    """Collapse the members table to one row per member (``icpsr``).

    Parameters
    ----------
    members : pandas.DataFrame
        Must contain the columns ``icpsr``, ``congress``, ``party_code``,
        ``bioname`` and ``state_abbrev``.
    party_codes : pandas.DataFrame
        Must contain the columns ``party_code`` and ``party_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``icpsr``, ``congress`` (list of congress numbers),
        ``party_code`` (sorted list of distinct party codes), ``bioname``,
        ``state_abbrev`` and ``party_name`` (party names in the same order as
        ``party_code``).

    Raises
    ------
    KeyError
        If a member's party code is missing from ``party_codes``.
    """
    # Congress numbers of each member, in row order.
    temp_congress = members.groupby('icpsr', as_index=False)[['congress']].agg(lambda x: list(x))
    # Distinct party codes of each member. They are sorted because plain set
    # iteration order depends on hashing/insertion order, and the resulting
    # lists are compared with == downstream: two members with the same parties
    # must get the same list.
    temp_party = members.groupby('icpsr', as_index=False)[['party_code']].agg(lambda x: sorted(set(x)))
    temp_congress = temp_congress.merge(temp_party, on='icpsr')

    # When a member has several rows, the value of the last one is used.
    bioname_by_icpsr = members.set_index('icpsr')['bioname'].to_dict()
    state_by_icpsr = members.set_index('icpsr')['state_abbrev'].to_dict()
    temp_congress['bioname'] = temp_congress['icpsr'].map(bioname_by_icpsr)
    temp_congress['state_abbrev'] = temp_congress['icpsr'].map(state_by_icpsr)

    # party_code -> party_name lookup, then translate each list of codes.
    party_codes_dic = party_codes.set_index('party_code')['party_name'].to_dict()
    temp_congress['party_name'] = temp_congress['party_code'].apply(
        lambda codes: [party_codes_dic[code] for code in codes]
    )
    return temp_congress

In [2]:
def create_edgelist_from_congress(congress, members_party_dict):
    """Build the signed co-voting edgelist of one congress.

    For every vote (rows sharing the same ``id``) each pair of members that
    both voted "Yea" or both voted "Nay" contributes an edge of weight ``1``;
    each pair made of one "Yea" and one "Nay" voter contributes an edge of
    weight ``-1``. Any other value of ``vote`` is ignored. Edges are
    undirected and stored with ``source < target``.

    Parameters
    ----------
    congress : pandas.DataFrame
        Must contain the columns ``id``, ``vote`` and ``icpsr``.
    members_party_dict : mapping
        ``icpsr`` -> party value; two members are "in party" when their
        values compare equal.

    Returns
    -------
    pandas.DataFrame
        One row per ``(source, target, weight)`` with the columns ``source``,
        ``target``, ``weight``, ``count`` (number of votes with that weight),
        ``party`` (``'in'``/``'out'``), ``votes_togheter`` (sum of ``count``
        over both weights of the pair) and ``perc``
        (``count / votes_togheter``). An empty frame with these columns is
        returned when no pair of voters exists.

    Raises
    ------
    KeyError
        If a voter's ``icpsr`` is missing from ``members_party_dict``.
    """
    keys = ['source', 'target', 'weight']
    columns = keys + ['count', 'party', 'votes_togheter', 'perc']
    edgelist = None

    # groupby hands over the rows of each vote directly instead of filtering
    # the whole frame once per vote id.
    for _, ballot in tqdm(congress.groupby('id', sort=False)):
        yea = ballot.loc[ballot['vote'] == 'Yea', 'icpsr'].to_numpy()
        nay = ballot.loc[ballot['vote'] == 'Nay', 'icpsr'].to_numpy()

        # Index pairs (i < j) enumerate the 2-combinations inside each side.
        yea_a, yea_b = np.triu_indices(len(yea), k=1)
        nay_a, nay_b = np.triu_indices(len(nay), k=1)

        # Agreeing pairs first, then the Yea x Nay cartesian product.
        first = np.concatenate([yea[yea_a], nay[nay_a], np.repeat(yea, len(nay))])
        second = np.concatenate([yea[yea_b], nay[nay_b], np.tile(nay, len(yea))])
        weight = np.concatenate([
            np.ones(len(yea_a) + len(nay_a), dtype=np.int64),
            np.full(len(yea) * len(nay), -1, dtype=np.int64),
        ])

        # Store every pair as (smaller id, larger id) right away so that the
        # two orientations of an undirected edge fall into the same group.
        pairs = pd.DataFrame({
            'source': np.minimum(first, second),
            'target': np.maximum(first, second),
            'weight': weight,
            'count': 1,
        })
        # A member paired with itself (duplicated row) is not an edge.
        pairs = pairs[pairs['source'] != pairs['target']]
        if pairs.empty:
            continue

        # Fold the vote into the running totals to keep memory bounded.
        if edgelist is not None:
            pairs = pd.concat([edgelist, pairs])
        edgelist = pairs.groupby(keys, as_index=False)['count'].sum()

    if edgelist is None:
        return pd.DataFrame(columns=columns)

    # 'in' when both endpoints map to equal party values, 'out' otherwise.
    edgelist['party'] = [
        'in' if members_party_dict[source] == members_party_dict[target] else 'out'
        for source, target in zip(edgelist['source'], edgelist['target'])
    ]

    # Total number of votes the pair cast together (agreeing + disagreeing).
    edgelist['votes_togheter'] = edgelist.groupby(['source', 'target'])['count'].transform('sum')
    edgelist['perc'] = edgelist['count'] / edgelist['votes_togheter']

    return edgelist

#### Internal:

In [20]:
# Build and store one edgelist per congress file of the Internal topic.
for csv in tqdm(glob.glob('../dataset/votes_house_peltzman/Internal/*')):
    # The congress number is what remains of the file name once the
    # 'congress_' prefix and the '.csv' extension are removed.
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))

    df_congress = pd.read_csv(csv)
    edgelist = create_edgelist_from_congress(df_congress, member_party_dict)

    edgelist.to_csv(
        f'../dataset/edgelists/votes_house_peltzman_edges/Internal/congress_{n_congress}.csv',
        index=False,
    )

100%|██████████| 143/143 [00:09<00:00, 15.70it/s]
100%|██████████| 136/136 [00:08<00:00, 16.08it/s]
100%|██████████| 406/406 [00:27<00:00, 14.77it/s]
100%|██████████| 722/722 [00:56<00:00, 12.76it/s]
100%|██████████| 677/677 [00:54<00:00, 12.45it/s]
100%|██████████| 659/659 [00:51<00:00, 12.89it/s]
100%|██████████| 118/118 [00:08<00:00, 14.29it/s]
100%|██████████| 533/533 [00:38<00:00, 13.73it/s]
100%|██████████| 802/802 [01:11<00:00, 11.17it/s]
100%|██████████| 943/943 [01:16<00:00, 12.28it/s]
100%|██████████| 108/108 [00:06<00:00, 17.53it/s]
100%|██████████| 187/187 [00:11<00:00, 15.72it/s]
100%|██████████| 1430/1430 [02:20<00:00, 10.19it/s]
100%|██████████| 680/680 [00:55<00:00, 12.22it/s]
100%|██████████| 1227/1227 [01:46<00:00, 11.48it/s]
100%|██████████| 116/116 [00:07<00:00, 14.94it/s]
100%|██████████| 155/155 [00:10<00:00, 14.90it/s]
100%|██████████| 722/722 [00:57<00:00, 12.65it/s]
100%|██████████| 916/916 [01:19<00:00, 11.46it/s]
100%|██████████| 988/988 [01:33<00:00, 10.52it

#### Foreign:

In [21]:
# Build and store one edgelist per congress file of the Foreign topic.
for csv in tqdm(glob.glob('../dataset/votes_house_peltzman/Foreign/*')):
    # The congress number is what remains of the file name once the
    # 'congress_' prefix and the '.csv' extension are removed.
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))

    df_congress = pd.read_csv(csv)
    edgelist = create_edgelist_from_congress(df_congress, member_party_dict)

    edgelist.to_csv(
        f'../dataset/edgelists/votes_house_peltzman_edges/Foreign/congress_{n_congress}.csv',
        index=False,
    )

100%|██████████| 26/26 [00:01<00:00, 15.56it/s]
100%|██████████| 27/27 [00:01<00:00, 17.01it/s]
100%|██████████| 55/55 [00:03<00:00, 15.59it/s]
100%|██████████| 121/121 [00:08<00:00, 13.82it/s]
100%|██████████| 122/122 [00:08<00:00, 14.26it/s]
100%|██████████| 157/157 [00:11<00:00, 13.73it/s]
100%|██████████| 22/22 [00:01<00:00, 13.82it/s]
100%|██████████| 95/95 [00:06<00:00, 15.66it/s]
100%|██████████| 94/94 [00:07<00:00, 13.34it/s]
100%|██████████| 129/129 [00:09<00:00, 13.64it/s]
100%|██████████| 36/36 [00:02<00:00, 14.67it/s]
100%|██████████| 33/33 [00:02<00:00, 14.11it/s]
100%|██████████| 271/271 [00:22<00:00, 12.26it/s]
100%|██████████| 234/234 [00:17<00:00, 13.51it/s]
100%|██████████| 239/239 [00:16<00:00, 14.10it/s]
100%|██████████| 21/21 [00:01<00:00, 16.83it/s]
100%|██████████| 26/26 [00:01<00:00, 15.07it/s]
100%|██████████| 102/102 [00:06<00:00, 14.92it/s]
100%|██████████| 148/148 [00:11<00:00, 13.27it/s]
100%|██████████| 120/120 [00:08<00:00, 13.52it/s]
100%|██████████| 57/

_________________

## Compute and create thresholds

In [22]:
def compute_threshold_intersx(df, weight):	#df is the edgelist, weight is the weight of the edge
    """Locate the threshold separating in-party from out-party edges.

    The ``perc`` values of the edges with the requested ``weight`` are split
    by ``party`` (``'in'`` / ``'out'``) and each group is smoothed with a
    Gaussian KDE using the bandwidth factor ``n ** (-1/6)``. Both densities
    are evaluated on a 500-point grid spanning the data range widened by 20%
    on each side. The threshold is the grid abscissa of the crossing of the
    two curves that lies closest, in the (x, density) plane, to the midpoint
    between the two density peaks.

    Parameters
    ----------
    df : pandas.DataFrame
        Edgelist with the columns ``party``, ``weight`` and ``perc``.
    weight : int
        Edge weight to select (the callers pass ``1`` or ``-1``).

    Returns
    -------
    threshold : float
        Grid abscissa of the selected crossing.
    area_inters_x : float
        Area under the pointwise minimum of the two densities.

    Raises
    ------
    ValueError
        If the two densities never cross on the grid.
    """
    def _density(sample):
        # Bandwidth factor n**(-1/(d+4)) with the constant 2 kept from the
        # original formula.
        return gaussian_kde(sample, bw_method=len(sample) ** (-1. / (2 + 4)))

    has_weight = df['weight'] == weight
    x0 = df.loc[(df['party'] == 'in') & has_weight, 'perc']
    x1 = df.loc[(df['party'] == 'out') & has_weight, 'perc']

    kde0 = _density(x0)
    kde1 = _density(x1)

    xmin = min(x0.min(), x1.min())
    xmax = max(x0.max(), x1.max())
    dx = 0.2 * (xmax - xmin)  # add a 20% margin, as the kde is wider than the data
    xmin -= dx
    xmax += dx

    grid = np.linspace(xmin, xmax, 500)
    kde0_x = kde0(grid)
    kde1_x = kde1(grid)

    # Grid positions right before the sign of (in - out) changes, i.e. where
    # the two curves cross.
    idx = np.argwhere(np.diff(np.sign(kde0_x - kde1_x))).flatten()
    if idx.size == 0:
        raise ValueError(
            "the in-party and out-party densities do not intersect: no threshold can be computed"
        )

    # Peaks (modes) of the two densities. The out-party peak is taken from the
    # out-party curve: using the in-party curve for both would collapse the
    # midpoint onto the in-party peak.
    peak_in = np.argmax(kde0_x)
    peak_out = np.argmax(kde1_x)
    mid_x = (grid[peak_in] + grid[peak_out]) / 2
    mid_y = (kde0_x[peak_in] + kde1_x[peak_out]) / 2

    # Crossing closest to the midpoint between the two peaks.
    cross_x = grid[idx]
    cross_y = kde1_x[idx]
    closest = np.argmin(np.sqrt((cross_x - mid_x) ** 2 + (cross_y - mid_y) ** 2))
    threshold = cross_x[closest]

    # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use
    # whichever this NumPy provides.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    area_inters_x = trapezoid(np.minimum(kde0_x, kde1_x), grid)

    return threshold, area_inters_x

#### Internal:

In [23]:
# Compute and store the thresholds of every Internal edgelist.
for csv in tqdm(glob.glob('../dataset/edgelists/votes_house_peltzman_edges/Internal/*')):
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))
    df_congress = pd.read_csv(csv)

    # Thresholds for the positive (weight 1) and negative (weight -1) edges.
    threshold_pos, area_pos = compute_threshold_intersx(df_congress, 1)
    threshold_neg, area_neg = compute_threshold_intersx(df_congress, -1)
    # Number of rows of the edgelist file; it is saved under 'n_votes'.
    len_df = len(df_congress)

    # Single-row table holding the results of this congress.
    df_output = pd.DataFrame([{
        'pos_threshold': threshold_pos,
        'pos_area': area_pos,
        'neg_threshold': threshold_neg,
        'neg_area': area_neg,
        'n_votes': len_df,
    }])

    # One CSV per congress, written with the header row and without the index.
    output_filename = f'{n_congress}_dic_thresholds_norm.csv'
    output_path = '../dataset/thresholds/house_peltzman_thresholds/Internal/' + output_filename
    df_output.to_csv(output_path, header=True, index=False)

100%|██████████| 37/37 [00:43<00:00,  1.17s/it]


#### Foreign:

In [24]:
# Compute and store the thresholds of every Foreign edgelist.
for csv in glob.glob('../dataset/edgelists/votes_house_peltzman_edges/Foreign/*'):
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))
    df_congress = pd.read_csv(csv)

    # Thresholds for the positive (weight 1) and negative (weight -1) edges.
    threshold_pos, area_pos = compute_threshold_intersx(df_congress, 1)
    threshold_neg, area_neg = compute_threshold_intersx(df_congress, -1)
    # Number of rows of the edgelist file; it is saved under 'n_votes'.
    len_df = len(df_congress)

    # Single-row table holding the results of this congress.
    df_output = pd.DataFrame([{
        'pos_threshold': threshold_pos,
        'pos_area': area_pos,
        'neg_threshold': threshold_neg,
        'neg_area': area_neg,
        'n_votes': len_df,
    }])

    # One CSV per congress, written with the header row and without the index.
    output_filename = f'{n_congress}_dic_thresholds_norm.csv'
    output_path = '../dataset/thresholds/house_peltzman_thresholds/Foreign/' + output_filename
    df_output.to_csv(output_path, header=True, index=False)