# Topic disaggregation

### Repeat the same process on data previously disaggregated according to the categories of the vote

In [2]:
import pandas as pd
import numpy as np

import glob, os
import itertools
from tqdm import tqdm

from scipy.stats import gaussian_kde

import shutil

import sys
sys.path.insert(0, '../local/bin')

Merge categories into "clustered topics"

In [11]:
def merge_congress_files(folder_names, merged_folder_name):
    """Merge the per-category ``congress_<n>.csv`` files into a single folder.

    Every folder in ``folder_names`` is walked recursively. Each file whose
    name starts with ``congress_`` and ends with ``.csv`` is merged into
    ``<merged_folder_name>/congress_<n>.csv``, where ``<n>`` is the integer
    parsed from the file name.

    Parameters
    ----------
    folder_names : iterable of str
        Category folders to read; each is joined onto the current working
        directory.
    merged_folder_name : str
        Destination folder, joined onto the current working directory and
        created if missing.

    Notes
    -----
    Rows are appended to a merged file that already exists, so calling this
    function twice on the same folders duplicates the data.
    """
    # The destination does not depend on the category: resolve and create it once.
    merged_folder = os.path.join(os.getcwd(), merged_folder_name)
    os.makedirs(merged_folder, exist_ok=True)

    for category in tqdm(folder_names):
        category_folder = os.path.join(os.getcwd(), category)

        # Iterate through the files in the category folder
        for root, _, files in os.walk(category_folder):
            for file in files:
                if not (file.startswith("congress_") and file.endswith(".csv")):
                    continue

                congress_number = int(file.split("_")[1].split(".")[0])
                congress_file = os.path.join(root, file)
                merged_congress_file = os.path.join(merged_folder, f"congress_{congress_number}.csv")

                if os.path.exists(merged_congress_file):
                    # A previous category already contributed to this congress: append the rows
                    existing_data = pd.read_csv(merged_congress_file)
                    new_data = pd.read_csv(congress_file)
                    merged_data = pd.concat([existing_data, new_data], ignore_index=True)
                    merged_data.to_csv(merged_congress_file, index=False)
                else:
                    # Copy under the normalised name that the existence check above looks for;
                    # copying into the folder would keep the source name (e.g. "congress_097.csv")
                    # and later categories would never be appended to it.
                    shutil.copy(congress_file, merged_congress_file)

In [4]:
def create_members_df(members, party_codes):
    """Build one row per member (``icpsr``) from the per-congress members table.

    Parameters
    ----------
    members : pandas.DataFrame
        Must contain the columns ``icpsr``, ``congress``, ``party_code``,
        ``bioname`` and ``state_abbrev``.
    party_codes : pandas.DataFrame
        Must contain the columns ``party_code`` and ``party_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``icpsr``, ``congress`` (list of the member's congress
        numbers), ``party_code`` (sorted list of the member's distinct party
        codes), ``bioname``, ``state_abbrev`` and ``party_name`` (party names
        in the same order as ``party_code``).

    Raises
    ------
    KeyError
        If a member's party code is missing from ``party_codes``.
    """
    by_member = members.groupby('icpsr', as_index=False)

    # group by icpsr and aggregate the congress numbers into a list
    congresses = by_member[['congress']].agg(lambda col: list(col))

    # group by icpsr and aggregate the distinct party codes into a *sorted* list:
    # plain list(set(...)) can order the same codes differently for two members,
    # which breaks any later equality comparison between their party lists
    parties = by_member[['party_code']].agg(lambda col: sorted(set(col)))

    members_df = congresses.merge(parties)

    # icpsr -> attribute lookups; when an icpsr has several rows the last one wins
    bioname_by_icpsr = members.set_index('icpsr')['bioname'].to_dict()
    state_by_icpsr = members.set_index('icpsr')['state_abbrev'].to_dict()
    members_df['bioname'] = members_df['icpsr'].map(bioname_by_icpsr)
    members_df['state_abbrev'] = members_df['icpsr'].map(state_by_icpsr)

    # party_code -> party_name
    party_name_by_code = party_codes.set_index('party_code')['party_name'].to_dict()
    members_df['party_name'] = members_df['party_code'].apply(
        lambda codes: [party_name_by_code[code] for code in codes]
    )
    return members_df

Generate meaningful sets listing the categories belonging to each set:

In [5]:
# Chamber and category scheme that select the dataset folder used below
chamber = 'senate'
category_set = 'peltzman'

# Root folder holding one sub-folder per vote category
p = f'../dataset/votes_{chamber}_{category_set}/'

In [6]:
# Category folders that make up the "internal" clustered topic
internal = [p + folder for folder in (
    'Budget_General_Interest/',
    'Budget_Special_Interest/',
    'Regulation_General_Interest/',
    'Regulation_Special_Interest/',
    'Domestic_Social_Policy/',
    'Government_Organization/',
    'Internal_Organization/',
    'D._C./',
)]

# Category folders that make up the "foreign" clustered topic
foreign = [p + folder for folder in (
    'Defense_Policy_Budget/',
    'Defense_Policy_Resolution/',
    'Foreign_Policy_Budget/',
    'Foreign_Policy_Resolution/',
    'Indian_Affairs/',
)]

# Destination folders for the merged files of each clustered topic
internal_folder = p + 'internal/'
foreign_folder = p + 'foreign/'


In [9]:
# Human-readable names of the categories in each clustered topic
internal_peltz = [
    'Budget General Interest',
    'Budget Special Interest',
    'Regulation General Interest',
    'Regulation Special Interest',
    'Domestic Social Policy',
    'Government Organization',
    'Internal Organization',
    'D. C.',
]
foreign_peltz = [
    'Defense Policy Budget',
    'Defense Policy Resolution',
    'Foreign Policy Budget',
    'Foreign Policy Resolution',
    'Indian Affairs',
]

# Cell output: how many categories fall in each clustered topic
len(internal_peltz), len(foreign_peltz)

(8, 5)

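If the categories have already been merged, skip the next cell: `merge_congress_files` appends to merged files that already exist, so re-running it duplicates the rows.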

In [None]:
# Make sure both destination folders exist
for merged_dir in (internal_folder, foreign_folder):
    if not os.path.exists(merged_dir):
        os.makedirs(merged_dir)

# Merge the category folders of each clustered topic into its destination folder
merge_congress_files(internal, internal_folder)
merge_congress_files(foreign, foreign_folder)

In [13]:
# Folders that will receive the edgelists of each clustered topic
output_folder_internal = f'../dataset/edgelists/votes_{chamber}_peltzman_edges/internal/'
output_folder_foreign = f'../dataset/edgelists/votes_{chamber}_peltzman_edges/foreign/'

# Create whichever of the two is still missing
for edges_dir in (output_folder_internal, output_folder_foreign):
    if not os.path.exists(edges_dir):
        os.makedirs(edges_dir)

In [14]:
# Load the members table and the party-code table
members, party_codes = (
    pd.read_csv(path)
    for path in ('../dataset/HSall_members.csv', '../dataset/HSall_parties.csv')
)

# One row per member, with the party names attached
members_info = create_members_df(members, party_codes)

# member_id -> party_name
party_by_member = members_info.set_index('icpsr')['party_name']
member_party_dict = party_by_member.to_dict()

________________________________

## Create edgelists

In [15]:
def create_edgelist_from_congress(congress, members_party_dict):
    """Build the signed co-voting edgelist of one congress.

    For every vote (rows sharing the same ``id``), each pair of members who
    both voted "Yea" or both voted "Nay" contributes one agreement
    (``weight == 1``), and each pair made of one "Yea" and one "Nay" voter
    contributes one disagreement (``weight == -1``). Rows with any other
    ``vote`` value are ignored.

    Parameters
    ----------
    congress : pandas.DataFrame
        Must contain the columns ``id``, ``vote`` and ``icpsr``.
    members_party_dict : dict
        Maps every ``icpsr`` found in ``congress`` to a party value; two
        members are "in-party" when their values compare equal.

    Returns
    -------
    pandas.DataFrame
        One row per (source, target, weight) with ``source < target``, sorted
        by those three columns, with the columns:

        * ``source``, ``target`` -- the two ``icpsr`` ids;
        * ``weight`` -- 1 for agreement, -1 for disagreement;
        * ``count`` -- number of votes with that outcome for the pair;
        * ``party`` -- ``'in'`` or ``'out'``;
        * ``votes_togheter`` -- total ``count`` of the pair over both weights;
        * ``perc`` -- ``count / votes_togheter``.

        An empty frame with these columns is returned when no pair exists.

    Raises
    ------
    KeyError
        If a member of an edge is missing from ``members_party_dict``.
    """
    from collections import Counter  # local import: not provided by the notebook's import cell

    columns = ['source', 'target', 'weight', 'count', 'party', 'votes_togheter', 'perc']

    # (source, target, weight) -> number of votes. Counting in a single pass replaces
    # re-concatenating and re-grouping the whole accumulated edgelist after every vote.
    pair_counts = Counter()

    def _tally(pairs, weight):
        # Store every pair as (smaller id, larger id); a pair of equal ids is skipped
        for a, b in pairs:
            if a < b:
                pair_counts[(a, b, weight)] += 1
            elif b < a:
                pair_counts[(b, a, weight)] += 1

    for _, ballots in tqdm(congress.groupby('id')):                      # one group per vote id
        yea = ballots.loc[ballots['vote'] == 'Yea', 'icpsr'].tolist()    # members that voted "Yea"
        nay = ballots.loc[ballots['vote'] == 'Nay', 'icpsr'].tolist()    # members that voted "Nay"

        _tally(itertools.combinations(yea, 2), 1)                        # agreed on "Yea"
        _tally(itertools.combinations(nay, 2), 1)                        # agreed on "Nay"
        _tally(itertools.product(yea, nay), -1)                          # disagreed

    if not pair_counts:
        return pd.DataFrame(columns=columns)

    edgelist = pd.DataFrame(
        [(source, target, weight, count) for (source, target, weight), count in sorted(pair_counts.items())],
        columns=['source', 'target', 'weight', 'count'],
    )

    # 'in' when both members map to the same party value, 'out' otherwise
    edgelist['party'] = [
        'in' if members_party_dict[source] == members_party_dict[target] else 'out'
        for source, target in zip(edgelist['source'], edgelist['target'])
    ]

    # number of votes each pair cast together (agreements + disagreements)
    edgelist['votes_togheter'] = edgelist.groupby(['source', 'target'])['count'].transform('sum')
    edgelist['perc'] = edgelist['count'] / edgelist['votes_togheter']

    return edgelist

#### Internal:

In [None]:
# Build and save one edgelist per merged "internal" votes file
for csv in tqdm(glob.glob(f'../dataset/votes_{chamber}_peltzman/internal/*')):
    # the congress number is what remains of the file name without 'congress_' and '.csv'
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))

    df_congress = pd.read_csv(csv)
    edgelist = create_edgelist_from_congress(df_congress, member_party_dict)
    edgelist.to_csv(
        f'../dataset/edgelists/votes_{chamber}_peltzman_edges/internal/congress_{n_congress}.csv',
        index=False,
    )

#### Foreign:

In [None]:
# Build and save one edgelist per merged "foreign" votes file
for csv in tqdm(glob.glob(f'../dataset/votes_{chamber}_peltzman/foreign/*')):
    # the congress number is what remains of the file name without 'congress_' and '.csv'
    file_name = os.path.basename(csv)
    n_congress = int(file_name.replace('congress_', '').replace('.csv', ''))

    df_congress = pd.read_csv(csv)
    edgelist = create_edgelist_from_congress(df_congress, member_party_dict)
    edgelist.to_csv(
        f'../dataset/edgelists/votes_{chamber}_peltzman_edges/foreign/congress_{n_congress}.csv',
        index=False,
    )

_________________

## Compute and create thresholds

In [17]:
def compute_threshold_intersx(df, weight):
    """Find where the in-party and out-party ``perc`` densities cross.

    The rows of ``df`` with the given ``weight`` are split by ``party``
    (``'in'`` / ``'out'``). A Gaussian KDE of ``perc`` is fitted to each group
    and both are evaluated on a common grid of 500 points. The threshold is
    the grid abscissa of the crossing closest to the midpoint between the two
    density peaks.

    Parameters
    ----------
    df : pandas.DataFrame
        Edgelist with the columns ``party``, ``weight`` and ``perc``.
    weight : int
        Edge weight to select.

    Returns
    -------
    threshold : float
        x value of the selected crossing, or ``nan`` when the two densities
        never cross on the grid (e.g. identical distributions).
    area_inters_x : float
        Area under the pointwise minimum of the two densities over the grid.

    Raises
    ------
    ValueError
        Propagated from ``gaussian_kde`` when a group has too little data to
        fit a KDE.
    """

    def _midpoint(p1, p2):
        return {'x': (p1['x'] + p2['x']) / 2, 'y': (p1['y'] + p2['y']) / 2}

    def line_intersection(in_party, out_party, intersect_points):
        # peak (highest density value) of each curve
        index_in = np.argmax(in_party[1])
        index_out = np.argmax(out_party[1])
        point_in = {'x': in_party[0][index_in], 'y': in_party[1][index_in]}
        point_out = {'x': out_party[0][index_out], 'y': out_party[1][index_out]}

        # point halfway between the two peaks
        midpoint = _midpoint(point_in, point_out)

        # index of the crossing closest (Euclidean distance) to that midpoint
        index_closer = np.argmin([
            np.sqrt((p[0] - midpoint['x']) ** 2 + (p[1] - midpoint['y']) ** 2)
            for p in intersect_points
        ])

        # x value of that crossing
        return intersect_points[index_closer][0]

    same_weight = df['weight'] == weight
    x0 = df.loc[(df['party'] == 'in') & same_weight, 'perc']
    x1 = df.loc[(df['party'] == 'out') & same_weight, 'perc']

    # bandwidth factor n**(-1/6), computed separately for each group
    kde0 = gaussian_kde(x0, bw_method=len(x0) ** (-1. / (2 + 4)))
    kde1 = gaussian_kde(x1, bw_method=len(x1) ** (-1. / (2 + 4)))

    xmin = min(x0.min(), x1.min())
    xmax = max(x0.max(), x1.max())
    dx = 0.2 * (xmax - xmin)  # add a 20% margin, as the kde is wider than the data
    xmin -= dx
    xmax += dx

    x = np.linspace(xmin, xmax, 500)
    kde0_x = kde0(x)
    kde1_x = kde1(x)
    inters_x = np.minimum(kde0_x, kde1_x)

    # grid indices after which the sign of (kde0 - kde1) changes, i.e. the crossings
    idx = np.argwhere(np.diff(np.sign(kde0_x - kde1_x))).flatten()

    if idx.size == 0:
        # no crossing on the grid: there is no threshold to report
        threshold = np.nan
    else:
        # the out-party curve must be kde1; passing kde0 twice would put the
        # "midpoint" on the in-party peak itself
        crossings = list(zip(x[idx], kde1_x[idx]))
        threshold = line_intersection([x, kde0_x], [x, kde1_x], crossings)

    # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use whichever exists
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz
    area_inters_x = integrate(inters_x, x)

    return threshold, area_inters_x

In [18]:
# Folders that will receive the threshold files of each clustered topic
output_thresholds_folder_internal = f'../dataset/thresholds/{chamber}_peltzman_thresholds/internal/'
output_thresholds_folder_foreign = f'../dataset/thresholds/{chamber}_peltzman_thresholds/foreign/'

# Create whichever of the two is still missing
for thresholds_dir in (output_thresholds_folder_internal, output_thresholds_folder_foreign):
    if not os.path.exists(thresholds_dir):
        os.makedirs(thresholds_dir)

Internal Peltzman categories:

In [66]:
# Compute and save the thresholds of every "internal" edgelist
for csv in tqdm(glob.glob(f'../dataset/edgelists/votes_{chamber}_peltzman_edges/internal/*')):
    # the congress number is what remains of the file name without 'congress_' and '.csv'
    n_congress = int(os.path.basename(csv).replace('congress_', '').replace('.csv', ''))
    df_edges = pd.read_csv(csv)

    # "n_votes" is the number of distinct values in the "bill_number" column
    # of the corresponding votes file (a missing value counts as one value)
    votes_file = f'../dataset/votes_{chamber}_peltzman/internal/congress_{n_congress}.csv'
    len_df = pd.read_csv(votes_file)['bill_number'].nunique(dropna=False)

    # thresholds for the agreement (+1) and disagreement (-1) edges
    threshold_pos, area_pos = compute_threshold_intersx(df_edges, 1)
    threshold_neg, area_neg = compute_threshold_intersx(df_edges, -1)

    # single-row table with the results
    df_output = pd.DataFrame({
        'pos_threshold': [threshold_pos],
        'pos_area': [area_pos],
        'neg_threshold': [threshold_neg],
        'neg_area': [area_neg],
        'n_votes': [len_df],
    })

    # write it, with its header, to <n_congress>_dic_thresholds_norm.csv
    output_filename = f'{n_congress}_dic_thresholds_norm.csv'
    df_output.to_csv(
        f'../dataset/thresholds/{chamber}_peltzman_thresholds/internal/{output_filename}',
        header=True,
        index=False,
    )

100%|██████████| 37/37 [00:10<00:00,  3.66it/s]


Foreign Peltzman categories:

In [67]:
# Compute and save the thresholds of every "foreign" edgelist
for csv in tqdm(glob.glob(f'../dataset/edgelists/votes_{chamber}_peltzman_edges/foreign/*')):
    # the congress number is what remains of the file name without 'congress_' and '.csv'
    n_congress = int(os.path.basename(csv).replace('congress_', '').replace('.csv', ''))
    df_edges = pd.read_csv(csv)

    # "n_votes" is the number of distinct values in the "bill_number" column
    # of the corresponding votes file (a missing value counts as one value)
    votes_file = f'../dataset/votes_{chamber}_peltzman/foreign/congress_{n_congress}.csv'
    len_df = pd.read_csv(votes_file)['bill_number'].nunique(dropna=False)

    # thresholds for the agreement (+1) and disagreement (-1) edges
    threshold_pos, area_pos = compute_threshold_intersx(df_edges, 1)
    threshold_neg, area_neg = compute_threshold_intersx(df_edges, -1)

    # single-row table with the results
    df_output = pd.DataFrame({
        'pos_threshold': [threshold_pos],
        'pos_area': [area_pos],
        'neg_threshold': [threshold_neg],
        'neg_area': [area_neg],
        'n_votes': [len_df],
    })

    # write it, with its header, to <n_congress>_dic_thresholds_norm.csv
    output_filename = f'{n_congress}_dic_thresholds_norm.csv'
    df_output.to_csv(
        f'../dataset/thresholds/{chamber}_peltzman_thresholds/foreign/{output_filename}',
        header=True,
        index=False,
    )

100%|██████████| 37/37 [00:05<00:00,  7.34it/s]
