Before running this code, I manually sorted the OpenSecrets campaign finance data into separate folders. Each folder is named for the type of data it contains (e.g., Individual Contributions, PAC Contributions).

List of all Congressional Candidates

In [1]:
import pandas as pd
import glob
import concurrent.futures

# Glob pattern matching every candidate .txt file in the cands folder
cands = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\*.txt'

# Collect the matching files. glob gives no ordering guarantee (it depends on
# the file system), so sort the list to keep the row order of the
# concatenated output reproducible from run to run.
all_files = sorted(glob.glob(cands))

# Column names applied to every file (the files are read with names=headers)
headers = [
    "Field", "Cycle", "FECCandID", "CID", "FirstLastP", "Party",
    "DistIDRunFor", "DistIDCurr", "CurrCand", "CycleCand", "CRPICO",
    "RecipCode", "NoPacs",
]

# Function to read a single CSV file with specified headers
def read_csv(file):
    """Parse one delimited text file into a DataFrame.

    Column names come from the module-level ``headers`` list, so the first
    line of the file is treated as data rather than as a header row.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` if anything went wrong while reading
        (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(
            file,
            names=headers,
            delimiter=',',
            quotechar='|',          # fields are wrapped in pipes, not double quotes
            encoding='ISO-8859-1',
            low_memory=False,
        )
        print(f"Successfully read {file}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error reading {file}: {err}")
        frame = None
    return frame

# Read the files concurrently; executor.map returns results in the same
# order as all_files.
with concurrent.futures.ThreadPoolExecutor() as executor:
    dfs = list(executor.map(read_csv, all_files))

# read_csv returns None for any file it failed to read. Drop those entries so
# the emptiness check below is accurate: a list holding only None values is
# truthy, and pd.concat raises ValueError when every object passed is None.
dfs = [df for df in dfs if df is not None]

# Concatenate all the dataframes in the list into a single dataframe
if dfs:
    big_df = pd.concat(dfs, ignore_index=True)
    print(f"Shape of the concatenated dataframe: {big_df.shape}")

    # Save the concatenated dataframe to a single CSV file (index column omitted)
    output_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands.csv'
    big_df.to_csv(output_path, index=False)
    print(f"Concatenated file saved to {output_path}")
else:
    print("No dataframes to concatenate.")


Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands14.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands20.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands12.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands10.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands22.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands18.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cands\cands16.txt
Shape of the concatenated dataframe: (49077, 13)
Concatenated file saved to C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\

List of Committees

In [2]:
import pandas as pd
import glob
import concurrent.futures

# Glob pattern matching every committee .txt file in the cmtes folder
cmtes = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\*.txt'

# Collect the matching files. glob gives no ordering guarantee (it depends on
# the file system), so sort the list to keep the row order of the
# concatenated output reproducible from run to run.
all_files = sorted(glob.glob(cmtes))

# Column names applied to every file (the files are read with names=headers)
headers = [
    "Cycle", "CmteID", "PACShort", "Affiliate", "Ultorg", "RecipID",
    "RecipCode", "FECCandID", "Party", "PrimCode", "Source", "Sensitive",
    "Foreign", "Active",
]

# Function to read a single CSV file with specified headers
def read_csv(file):
    """Parse one delimited text file into a DataFrame.

    Column names come from the module-level ``headers`` list, so the first
    line of the file is treated as data rather than as a header row.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` if anything went wrong while reading
        (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(
            file,
            names=headers,
            delimiter=',',
            quotechar='|',          # fields are wrapped in pipes, not double quotes
            encoding='ISO-8859-1',
            low_memory=False,
        )
        print(f"Successfully read {file}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error reading {file}: {err}")
        frame = None
    return frame

# Fan the file reads out over a thread pool; results keep the order of all_files
with concurrent.futures.ThreadPoolExecutor() as pool:
    dfs = list(pool.map(read_csv, all_files))

# Keep only the successful reads (a failed read comes back as None)
dfs = list(filter(lambda frame: frame is not None, dfs))

if not dfs:
    print("No dataframes to concatenate.")
else:
    # Stack the per-file frames into one table with a fresh running index
    big_df = pd.concat(dfs, ignore_index=True)
    print(f"Shape of the concatenated dataframe: {big_df.shape}")

    # Write the combined table to a single CSV, leaving out the index column
    output_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes.csv'
    big_df.to_csv(output_path, index=False)
    print(f"Concatenated file saved to {output_path}")


Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes10.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes12.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes16.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes14.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes22.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes20.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\cmtes\cmtes18.txt
Shape of the concatenated dataframe: (115651, 14)
Concatenated file saved to C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance

In [3]:
import pandas as pd
import glob
import concurrent.futures

# Glob pattern matching every .txt file in the pacs folder
pacs2cand = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\*.txt'

# Collect the matching files. glob gives no ordering guarantee (it depends on
# the file system), so sort the list to keep the row order of the
# concatenated output reproducible from run to run.
all_files = sorted(glob.glob(pacs2cand))

# Column names applied to every file (the files are read with names=headers)
headers = [
    "Cycle", "FECRecNo", "PACID", "CID", "Amount",
    "Date", "RealCode", "Type", "DI", "FECCandID",
]

# Function to read a single CSV file with specified headers
def read_csv(file):
    """Parse one delimited text file into a DataFrame.

    Column names come from the module-level ``headers`` list, so the first
    line of the file is treated as data rather than as a header row.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` if anything went wrong while reading
        (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(
            file,
            names=headers,
            delimiter=',',
            quotechar='|',          # fields are wrapped in pipes, not double quotes
            encoding='ISO-8859-1',
            low_memory=False,
        )
        print(f"Successfully read {file}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error reading {file}: {err}")
        frame = None
    return frame

# Fan the file reads out over a thread pool; results keep the order of all_files
with concurrent.futures.ThreadPoolExecutor() as pool:
    dfs = list(pool.map(read_csv, all_files))

# Keep only the successful reads (a failed read comes back as None)
dfs = list(filter(lambda frame: frame is not None, dfs))

if not dfs:
    print("No dataframes to concatenate.")
else:
    # Stack the per-file frames into one table with a fresh running index
    big_df = pd.concat(dfs, ignore_index=True)
    print(f"Shape of the concatenated dataframe: {big_df.shape}")

    # Write the combined table to a single CSV, leaving out the index column
    output_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs.csv'
    big_df.to_csv(output_path, index=False)
    print(f"Concatenated file saved to {output_path}")


Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs12.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs10.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs14.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs18.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs16.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs22.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs\pacs20.txt
Shape of the concatenated dataframe: (3592977, 10)
Concatenated file saved to C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pacs.csv


List of PAC contributions to candidates

In [4]:
import pandas as pd
import glob
import concurrent.futures

# Glob pattern matching every .txt file in the pac_other folder
pac2pac = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\*.txt'

# Collect the matching files. glob gives no ordering guarantee (it depends on
# the file system), so sort the list to keep the row order of the
# concatenated output reproducible from run to run.
all_files = sorted(glob.glob(pac2pac))

# Column names applied to every file (the files are read with names=headers)
headers = [
    "Cycle", "FECRecNo", "Filerid", "DonorCmte", "ContribLendTrans", "City", "State",
    "Zip", "FECOccEmp", "Primcode", "Date", "Amount", "RecipID", "Party", "Otherid",
    "RecipCode", "RecipPrimcode", "Amend", "Report", "PG", "Microfilm", "Type", "RealCode", "Source",
]

# Function to read a single CSV file with specified headers
def read_csv(file):
    """Parse one delimited text file into a DataFrame.

    Column names come from the module-level ``headers`` list, so the first
    line of the file is treated as data rather than as a header row.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` if anything went wrong while reading
        (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(
            file,
            names=headers,
            delimiter=',',
            quotechar='|',          # fields are wrapped in pipes, not double quotes
            encoding='ISO-8859-1',
            low_memory=False,
        )
        print(f"Successfully read {file}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error reading {file}: {err}")
        frame = None
    return frame

# Fan the file reads out over a thread pool; results keep the order of all_files
with concurrent.futures.ThreadPoolExecutor() as pool:
    dfs = list(pool.map(read_csv, all_files))

# Keep only the successful reads (a failed read comes back as None)
dfs = list(filter(lambda frame: frame is not None, dfs))

if not dfs:
    print("No dataframes to concatenate.")
else:
    # Stack the per-file frames into one table with a fresh running index
    big_df = pd.concat(dfs, ignore_index=True)
    print(f"Shape of the concatenated dataframe: {big_df.shape}")

    # Write the combined table to a single CSV, leaving out the index column
    output_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac2pac.csv'
    big_df.to_csv(output_path, index=False)
    print(f"Concatenated file saved to {output_path}")


Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other10.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other14.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other12.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other16.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other18.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other20.txt
Successfully read C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\pac_other\pac_other22.txt
Shape of the concatenated dataframe: (2631064, 24)
Concatenated file saved to C:\Users\Kameron\Documents

List of citizens donating to political candidates

In [1]:
import pandas as pd
import glob
import concurrent.futures

# Glob pattern matching every .txt file in the indivs folder
indiv = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\Campaign Finance\indivs\*.txt'

# Collect the matching files. glob gives no ordering guarantee (it depends on
# the file system), so sort the list to keep the row order of the
# concatenated output reproducible from run to run.
all_files = sorted(glob.glob(indiv))

# Column name -> dtype for every file. The key order doubles as the column
# order, because the reader passes list(data_types.keys()) as the column names.
data_types = {
    "Cycle": str,
    "FECTransID": str,
    "ContribID": str,
    "Contrib": str,
    "RecipID": str,
    "Orgname": str,
    "UltOrg": str,
    "RealCode": str,
    "Date": str,  # Consider converting this to datetime after reading
    # NOTE(review): plain int64 cannot represent missing values; if any row
    # has a blank Amount the read raises and the whole file is skipped.
    # Confirm the data has no blanks, or consider the nullable 'Int64' dtype.
    "Amount": 'int64',
    "Street": str,
    "City": str,
    "State": str,
    "Zip": str,
    "RecipCode": str,
    "Type": str,
    "CmteID": str,
    "OtherID": str,
    "Gender": str,
    "Microfilm": str,
    "Occupation": str,
    "Employer": str,
    "Source": str
}

# Function to read a single CSV file with specified headers and data types
def read_csv(file):
    """Parse one delimited text file into a typed DataFrame.

    Both the column names and the per-column dtypes come from the
    module-level ``data_types`` mapping. Because names are supplied, the
    first line of the file is treated as data rather than as a header row.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` if anything went wrong while reading
        (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(
            file,
            names=list(data_types.keys()),
            dtype=data_types,
            delimiter=',',
            quotechar='|',          # fields are wrapped in pipes, not double quotes
            encoding='ISO-8859-1',
            low_memory=False,
        )
        print(f"Successfully read {file}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error reading {file}: {err}")
        frame = None
    return frame

# Fan the file reads out over a thread pool; results keep the order of all_files
with concurrent.futures.ThreadPoolExecutor() as pool:
    dfs = list(pool.map(read_csv, all_files))

# Keep only the successful reads (a failed read comes back as None)
dfs = list(filter(lambda frame: frame is not None, dfs))

if not dfs:
    print("No dataframes to concatenate.")
else:
    # Stack the per-file frames into one table with a fresh running index
    big_df = pd.concat(dfs, ignore_index=True)
    print(f"Shape of the concatenated dataframe: {big_df.shape}")

    # Write the combined table to a single CSV, leaving out the index column
    output_path = r'C:\Users\Kameron\Documents\ESG Thesis\Data\Political_Contributions\indivss.csv'
    big_df.to_csv(output_path, index=False)
    print(f"Concatenated file saved to {output_path}")


ValueError: Table schema does not match schema used to create file: 
table:
Cycle: string
FECTransID: string
ContribID: string
Contrib: string
RecipID: string
Orgname: string
UltOrg: string
RealCode: string
Date: string
Amount: int64
Street: null
City: string
State: string
Zip: string
RecipCode: string
Type: string
CmteID: string
OtherID: null
Gender: string
Microfilm: string
Occupation: string
Employer: string
Source: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 6300' + 2866 vs. 
file:
Cycle: string
FECTransID: string
ContribID: string
Contrib: string
RecipID: string
Orgname: string
UltOrg: string
RealCode: string
Date: string
Amount: int64
Street: null
City: string
State: string
Zip: string
RecipCode: string
Type: string
CmteID: string
OtherID: string
Gender: string
Microfilm: string
Occupation: string
Employer: string
Source: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2862