In [None]:
import pandas as pd
import networkx as nx
from modules import ps
import sys
sys.path.append("C:/Users/kubic/Desktop/machine learning/Polarization/")

import functions

import utils
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
import umap
import numpy as np
from itertools import combinations
from scipy.stats import gaussian_kde

In [None]:
def make_all_edges(df):
    """Build a weighted co-voting edge list from a table of individual votes.

    Two legislators co-vote when they share the same ``cast_code`` on the same
    ``rollnumber``. Every unordered pair is reported exactly once, with the
    smaller ``icpsr`` as ``src``, so the count does not depend on the row order
    of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cast vote; must contain the columns ``rollnumber``,
        ``cast_code`` and ``icpsr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``src``, ``trg`` and ``nij`` (how many times the pair co-voted),
        sorted by ``src`` then ``trg``. Empty when no pair ever co-voted.
    """
    pairs = []
    for _, members in df.groupby(["rollnumber", "cast_code"])["icpsr"]:
        # Sorting gives each pair a canonical (low, high) orientation; without it
        # (a, b) and (b, a) would be tallied as two different edges. Dropping
        # repeated ids keeps a duplicated vote row from producing self-pairs.
        voters = sorted(members.dropna().unique())
        pairs.extend(combinations(voters, 2))

    if not pairs:
        # No group had two or more voters: return the expected schema, empty.
        return pd.DataFrame(columns=["src", "trg", "nij"])

    pair_df = pd.DataFrame(pairs, columns=["src", "trg"])
    # size() counts in how many (rollnumber, cast_code) groups each pair appeared.
    return pair_df.groupby(["src", "trg"]).size().reset_index(name="nij")

def make_pdfs(edges, nodes):
    """Label edges as same-/cross-party and fit a KDE to each group's weights.

    The caller's ``edges`` frame is left untouched; an annotated copy is returned.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge list with columns ``src``, ``trg`` and ``nij``.
    nodes : pandas.DataFrame
        Must contain ``icpsr`` and ``party_code``. If an ``icpsr`` occurs more
        than once, the party of its last row is the one used.

    Returns
    -------
    tuple
        ``(edges, sp_pdf, cp_pdf)``: the annotated copy of ``edges`` (extra
        columns ``party_src``, ``party_trg``, ``same_party``; ``nij`` divided by
        its maximum), followed by the ``gaussian_kde`` of the normalised weights
        of same-party edges and of cross-party edges.

    Notes
    -----
    An endpoint missing from ``nodes`` gets a NaN party, and NaN never compares
    equal, so such edges are counted as cross-party. ``gaussian_kde`` itself
    raises when a group has too few (or only identical) values.
    """
    party_lookup = nodes.set_index("icpsr")["party_code"].to_dict()

    # Copy first so the new columns and the rescaling do not leak into the
    # frame the caller passed in.
    edges = edges.copy()
    edges["party_src"] = edges["src"].map(party_lookup)
    edges["party_trg"] = edges["trg"].map(party_lookup)
    edges["same_party"] = edges["party_src"] == edges["party_trg"]
    edges["nij"] = edges["nij"] / edges["nij"].max()  # Normalize co-vote counts

    same_party = edges["same_party"]
    sp_pdf = gaussian_kde(edges.loc[same_party, "nij"])
    cp_pdf = gaussian_kde(edges.loc[~same_party, "nij"])
    return edges, sp_pdf, cp_pdf

def find_intersection(kde1, kde2, init_interval=0.01, scope=(0.4, 1), convergence=0.0001):
    """Locate the first crossing of two density curves inside ``scope``.

    The window is scanned left to right in steps of ``init_interval`` for a sign
    change of ``kde1(x)[0] - kde2(x)[0]``. The bracketing step is then rescanned
    with a step ten times smaller, and so on, until the step is no larger than
    ``convergence``.

    Parameters
    ----------
    kde1, kde2 : callable
        Called with a scalar ``x``; element ``[0]`` of the result is used.
    init_interval : float
        Step of the first scan.
    scope : sequence of two floats
        ``(lower, upper)`` bounds of the search window.
    convergence : float
        Step size at which refinement stops. It applies to every refinement
        level, not only to the first scan.

    Returns
    -------
    float
        The right edge of the final bracketing step. If no sign change is found
        at all, ``scope[0]`` is returned; if a refinement scan finds none (e.g.
        the difference is exactly zero on a grid point), the left edge of the
        bracket found so far is returned.
    """
    def gap(x):
        return kde1(x)[0] - kde2(x)[0]

    lo, hi = scope[0], scope[1]
    step = init_interval

    while True:
        bracket = None
        x_left, f_left = lo, gap(lo)
        while x_left < hi:
            # Clamp so the last (possibly shorter) step up to `hi` is still
            # examined instead of being dropped by floating-point drift.
            x_right = min(x_left + step, hi)
            if x_right <= x_left:
                # Step is zero, negative, or below float resolution: cannot advance.
                break
            f_right = gap(x_right)
            if f_left * f_right < 0:
                bracket = (x_left, x_right)
                break
            # The right edge becomes the next left edge; reuse its value.
            x_left, f_left = x_right, f_right

        if bracket is None:
            return lo
        if step <= convergence:
            return bracket[1]

        lo, hi = bracket
        step /= 10

def save_network(edges, congress, threshold, output_folder):
    """Write the edges whose weight exceeds ``threshold`` to a CSV edge list.

    Only rows with ``nij`` strictly greater than ``threshold`` are kept. The
    ``src`` and ``trg`` columns are cast to ``int`` and written, without header
    or index, to ``<output_folder>/congress<congress>_edges.csv``. The path of
    the written file is printed.
    """
    above_threshold = edges["nij"] > threshold
    kept = edges.loc[above_threshold, ["src", "trg"]].astype(int)

    destination = os.path.join(output_folder, f"congress{congress}_edges.csv")
    kept.to_csv(destination, sep=",", index=False, header=False)

    print(f"Network saved: {destination}")

def process_congresses(congress_list, input_folder, output_folder):
    """Build and save the thresholded co-voting network of each listed congress.

    ``output_folder`` is created if needed. For every entry of ``congress_list``
    the file ``H<congress>_filtered_USA_votes.csv`` is looked up in
    ``input_folder``; a missing file only prints a warning. Otherwise the votes
    are turned into a weighted edge list, the same-/cross-party weight densities
    are fitted, their intersection is used as the weight threshold, and the
    surviving edges are written to ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for congress in congress_list:
        print(f"Processing Congress {congress}...")
        votes_path = os.path.join(input_folder, f"H{congress}_filtered_USA_votes.csv")

        if os.path.exists(votes_path):
            votes = pd.read_csv(votes_path)

            # The vote table doubles as the node table (icpsr -> party_code).
            weighted_edges, same_party_pdf, cross_party_pdf = make_pdfs(make_all_edges(votes), votes)

            # The crossing point of the two densities is the edge-weight cut-off.
            cutoff = find_intersection(same_party_pdf, cross_party_pdf)

            save_network(weighted_edges, congress, cutoff, output_folder)
        else:
            print(f"Warning: Data file for Congress {congress} not found, skipping.")

# Congress numbers to process, zero-padded as they appear in the vote file names
congresses = ["097"]

# Where the filtered vote files are read from and where the edge lists are written
input_folder, output_folder = "Data/USA/Filtered/", "Data/USA/Micheles/"

process_congresses(congresses, input_folder, output_folder)

In [None]:
def calc_pol(congress, data_path="data/USA/Raw/", edge_list_folder="Data/USA/Micheles/"):
    edge_list_file = os.path.join(edge_list_folder, f"congress{congress}_edges.csv")
    
    if not os.path.exists(edge_list_file):
        print(f"Warning: Edge list for Congress {congress} not found, skipping.")
        return None  # Skip if no edge list exists

    edge_df = pd.read_csv(edge_list_file, header=None, names=['Source', 'Target'])

    # Create the graph
    G = nx.from_pandas_edgelist(edge_df, 'Source', 'Target')

    # Convert nodes to integers
    G = nx.relabel_nodes(G, lambda x: int(x))

    # Load members' data
    members_file = os.path.join(data_path, f"H{congress}_members.csv")
    if not os.path.exists(members_file):
        print(f"Warning: Members file for Congress {congress} not found, skipping.")
        return None

    members_df = pd.read_csv(members_file).dropna(subset=["nominate_dim1"])
    members_df["icpsr"] = members_df["icpsr"].astype(int)

    # Create dictionary of opinions
    opinions_x = dict(zip(members_df["icpsr"], members_df["nominate_dim1"]))

    # Filter only existing nodes in the graph
    opinions = {node: opinions_x[node] for node in G.nodes if node in opinions_x}

    if not opinions:  # If no valid opinions exist, skip
        print(f"Warning: No valid opinions found for Congress {congress}, skipping.")
        return None

    # Normalize opinions between -1 and 1
    min_opinion, max_opinion = min(opinions.values()), max(opinions.values())
    opinions = {k: 2 * (v - min_opinion) / (max_opinion - min_opinion) - 1 for k, v in opinions.items()}

    # Compute polarization score
    pol_score = ps.ge(opinions, {}, G)
    
    return pol_score
# Zero-padded congress numbers '095' through '118'
congresses = [f"{number:03d}" for number in range(95, 119)]


# Score every congress; calc_pol returns None for the ones it has to skip
pol_scores = {}
for congress in congresses:
    pol_score = calc_pol(congress)
    pol_scores[int(congress)] = pol_score
    print(f"Congress {int(congress)}: Polarization Score = {pol_score}")


# Line chart of the score against the congress number
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pol_scores.keys(), pol_scores.values(), marker='o', linestyle='-', color='b', label="Polarization Score")
ax.set_xlabel("Congress")
ax.set_ylabel("Polarization Score")
ax.set_title("Polarization Score by Congress")
ax.set_xticks(list(pol_scores.keys()))  # One tick per congress number
ax.legend()
ax.grid(True)
plt.show()