In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import utils
import os
from modules import ps


In [2]:
def clean_and_prepare_congress_data(congress, data_path="data/USA/Raw/", output_path="data/USA/Filtered/"):
    """Merge one Congress's House vote and member files into a filtered CSV.

    Reads ``H{congress}_members.csv`` and ``H{congress}_votes.csv`` from
    ``data_path``, inner-joins them on ``icpsr``, keeps only the columns
    needed downstream, collapses ``cast_code`` to two values (1-3 become 1,
    4-6 become 2), drops rows whose ``cast_code`` is 7 or 9, and writes the
    result to ``H{congress}_filtered_USA_votes.csv`` inside ``output_path``.

    Parameters
    ----------
    congress : str
        Session identifier exactly as it appears in the file names
        (e.g. ``"095"``).
    data_path : str
        Directory containing the raw CSV files. A trailing slash is optional.
    output_path : str
        Directory that receives the filtered CSV. A trailing slash is
        optional; the directory is created if it does not exist yet.

    Returns
    -------
    str
        Human-readable confirmation message naming the written file.

    Raises
    ------
    FileNotFoundError
        If either input CSV is missing.
    KeyError
        If an expected column is absent from the merged data.
    """
    # Join path components instead of concatenating strings so directories
    # passed without a trailing separator still resolve to the right files.
    members_file = os.path.join(data_path, f"H{congress}_members.csv")
    votes_file = os.path.join(data_path, f"H{congress}_votes.csv")
    output_file = os.path.join(output_path, f"H{congress}_filtered_USA_votes.csv")

    members = pd.read_csv(members_file)
    votes = pd.read_csv(votes_file)

    # Use the nullable integer dtype on both sides so the join keys compare
    # equal even if one file had missing ids and was parsed as float.
    members["icpsr"] = members["icpsr"].astype("Int64")
    votes["icpsr"] = votes["icpsr"].astype("Int64")

    merged = votes.merge(members, on="icpsr")

    # Take an explicit copy of the column subset: assigning into the bare
    # selection raises SettingWithCopyWarning on pandas versions before 3.0.
    keep_columns = ["icpsr", "state_abbrev", "party_code", "cast_code",
                    "rollnumber", "nominate_dim1", "nominate_dim2"]
    merged = merged[keep_columns].copy()

    # Vectorised recode: 1-3 -> 1, 4-6 -> 2, every other value untouched.
    # Both masks are computed from the original codes, so the first
    # replacement cannot influence the second.
    cast = merged["cast_code"]
    merged["cast_code"] = cast.mask(cast.between(1, 3), 1).mask(cast.between(4, 6), 2)

    # Drop rows coded 7 or 9.
    # NOTE(review): codes 0 and 8 are left in the output. If these are
    # Voteview files, confirm against its cast-code table that keeping them
    # is intended.
    merged = merged[~merged["cast_code"].isin([7, 9])]

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    merged.to_csv(output_file, index=False)

    return f"✅ Successfully processed Congress {congress} and saved to {output_file}"

In [3]:

# House sessions to process: the 95th through the 118th Congress, written as
# zero-padded three-digit strings to match the raw file names.
congresses = [f"{number:03d}" for number in range(95, 119)]

for congress in congresses:
    # The helper returns a status message; print it so every processed
    # session is visible in the cell output instead of being discarded.
    print(clean_and_prepare_congress_data(congress))