In [1]:
#(MASTER)
import os
import pandas as pd
import csv

input_file = "itpas2023_2024.txt"
output_file = "itpas2023_2024.csv"

# FEC column names (21 fields)
# FEC PAS2 File Columns

columns = [
    'CMTE_ID',
    'AMNDT_IND',
    'RPT_TP',
    'TRANSACTION_PGI',
    'IMAGE_NUM',
    'TRANSACTION_TP',
    'ENTITY_TP',
    'NAME',
    'CITY',
    'STATE',
    'ZIP_CODE',
    'EMPLOYER',
    'OCCUPATION',
    'TRANSACTION_DT',
    'TRANSACTION_AMT',
    'OTHER_ID',
    'CAND_ID',
    'TRAN_ID',
    'FILE_NUM',
    'MEMO_CD',
    'MEMO_TEXT',
    'SUB_ID'
]


# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    total = 0
    with open(path, "rb") as fh:
        while True:
            block = fh.read(bufsize)
            if not block:
                break
            total += block.count(b"\n")
    return total

print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # safe for 8 GB RAM
first_chunk = True
rows_processed = 0

reader = pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,
    chunksize=chunksize,
    engine="python",        # tolerant parser
    on_bad_lines="skip",    # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE, # treat quotes literally
    escapechar="\\"
)

for chunk in reader:
    # Write incrementally
    chunk.to_csv(output_file, mode="a", index=False, header=first_chunk)
    first_chunk = False

    # Update progress
    rows_processed += len(chunk)
    pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
    print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 703,985
‚úÖ Processed 500,000 rows (71.02%)
‚úÖ Processed 703,985 rows (100.00%)

üéâ Conversion complete!
üíæ Saved as: itpas2023_2024.csv
üìä Total rows written: 703,985


In [2]:
import pandas as pd

d1 = pd.read_csv("itpas2023_2024.csv")

  d1 = pd.read_csv("itpas2023_2024.csv")


In [3]:
d1.columns

Index(['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
       'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
       'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
       'OTHER_ID', 'CAND_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT',
       'SUB_ID'],
      dtype='object')

In [4]:
keep_cols = ['CMTE_ID', 'NAME', 'TRANSACTION_DT', 'TRANSACTION_AMT', 'CAND_ID','ENTITY_TP']

In [5]:
# Relevant columns for your project
relevant_columns = [
    'NAME',
    'CAND_ID',
    'CMTE_ID',
    'TRANSACTION_DT',
    'TRANSACTION_AMT',
    'TRANSACTION_PGI',
    'ENTITY_TP',
    'STATE',
    'TRANSACTION_TP'
]

# Filter your dataframe
d1_filtered = d1[relevant_columns]

In [6]:
d1_filtered.head(20)

Unnamed: 0,NAME,CAND_ID,CMTE_ID,TRANSACTION_DT,TRANSACTION_AMT,TRANSACTION_PGI,ENTITY_TP,STATE,TRANSACTION_TP
0,CAPITO FOR WEST VIRGINIA,S4WV00159,C00777706,12122022.0,5000,P2026,CCM,WV,24K
1,JOHN CARTER FOR CONGRESS,H2TX31044,C00777706,12122022.0,2500,P2024,CCM,TX,24K
2,PETE SESSIONS FOR CONGRESS,H2TX03126,C00777706,12122022.0,2948,P2024,CCM,TX,24K
3,CAPITO FOR WEST VIRGINIA,S4WV00159,C00777706,1042023.0,-5000,P2026,CCM,WV,24K
4,CAPITO FOR WEST VIRGINIA,S4WV00159,C00777706,1042023.0,5000,P2026,CCM,WV,24K
5,KEVIN MCCARTHY FOR CONGRESS,H6CA22125,C00550392,1262023.0,2500,P2024,CCM,CA,24K
6,FRIENDS OF DAVID SCHWEIKERT,H4AZ06045,C00333104,1052023.0,5000,P2024,CCM,VA,24K
7,FRIENDS OF JOHN BARRASSO,S6WY00068,C00333104,1102023.0,1500,P2024,CCM,WY,24K
8,ADRIAN SMITH FOR CONGRESS,H6NE03115,C00325324,1262023.0,1000,P2024,CCM,NE,24K
9,LOUDERMILK FOR CONGRESS,H4GA11061,C00490235,1272023.0,1000,P2024,CCM,GA,24K


In [7]:
d1_filtered.shape

(703985, 9)

In [8]:
d1_filtered.isnull().sum()

NAME                802
CAND_ID            2104
CMTE_ID               0
TRANSACTION_DT     8142
TRANSACTION_AMT       0
TRANSACTION_PGI    1119
ENTITY_TP          1573
STATE              1031
TRANSACTION_TP        0
dtype: int64

In [9]:
d1_filtered['TRANSACTION_PGI'].unique()

array(['P2026', 'P2024', 'G2024', 'P', 'P2022', 'S2023', 'P2028', 'G2022',
       'G2023', 'S2022', 'R2022', 'G2026', 'O2022', 'P2023', 'G2020',
       'C2024', 'E2022', 'G2006', 'P2020', 'G2021', 'P2021', 'G2028',
       'P2019', 'O2023', 'S2024', nan, 'O2024', 'G', 'P2025', 'C2028',
       'G2025', 'G2019', 'R2024', 'P2013', 'G024', 'O', 'P2012', 'G2500',
       'P2029', 'P2002', 'P1500', 'P2004', 'C2023', 'C', 'P2010', 'P2000',
       'P2924', 'R2026', 'G2018', 'P2018', 'G2016', 'C5000', 'P1000',
       'G2014', 'G1000', 'P2016', 'G2924', 'O2028', 'G2008', 'P2027',
       'G2027', 'P400', 'G3000', 'P2008', 'P2014', 'G2010', 'R2023',
       'R2020', 'C2022', 'E2024', 'P2500', 'G2012', 'S2013', 'P5000',
       'P2202', 'P3300', 'G1700', 'G5000', 'P24', 'O2026', 'P0', 'S2000',
       'G2004', 'O2020', 'P2040', 'G3034', 'G2002', 'P2030', 'G2204',
       'G1014', 'P1790', 'G2017', 'C2026', 'S2025', 'G2030', 'S', 'S2026',
       '00', 'G224', 'G24', 'O2025'], dtype=object)

In [10]:
# Filter for presidential elections only
presidential_cycles = ['P2020', 'G2020', 'P2024', 'G2024']
df_presidential = d1_filtered[d1_filtered['TRANSACTION_PGI'].isin(presidential_cycles)]

In [11]:
df_presidential.shape

(460943, 9)

In [12]:
df_presidential.to_csv("itpas2023_2024_filtered.csv", index=False)

In [13]:
df_presidential['CAND_ID'].unique()

array(['H2TX31044', 'H2TX03126', 'H6CA22125', ..., 'H4RI02115',
       'H0LA08025', 'H4NY03093'], dtype=object)

In [14]:
df_presidential = df_presidential[df_presidential['TRANSACTION_PGI'].isin(['P2020', 'G2020', 'P2024', 'G2024'])]
df_presidential = df_presidential[df_presidential['CAND_ID'].str.startswith('P', na=False)]

In [15]:
df_presidential.shape

(45049, 9)

In [16]:
df_presidential.isnull().sum()

NAME                 52
CAND_ID               0
CMTE_ID               0
TRANSACTION_DT     3029
TRANSACTION_AMT       0
TRANSACTION_PGI       0
ENTITY_TP            54
STATE                82
TRANSACTION_TP        0
dtype: int64

In [17]:
df_presidential.head(20)

Unnamed: 0,NAME,CAND_ID,CMTE_ID,TRANSACTION_DT,TRANSACTION_AMT,TRANSACTION_PGI,ENTITY_TP,STATE,TRANSACTION_TP
1180,BLUE WAVE COMMUNICATIONS LLC,P80001571,C00748582,1092023.0,7000,P2024,ORG,IL,24A
1181,BLUE WAVE COMMUNICATIONS LLC,P80001571,C00748582,1232023.0,7000,P2024,ORG,IL,24A
4808,MOULTON FOR CONGRESS,P00011866,C00215285,3152023.0,2500,P2024,CCM,MA,24K
13726,VIVEK 2024,P40011082,C00716498,3082023.0,2000,P2024,CCM,AL,24K
14078,"PLANNED PARENTHOOD ACTION FUND, INC.",P40010977,C00489799,2142023.0,200,P2024,PAC,NY,24A
14079,"PLANNED PARENTHOOD ACTION FUND, INC.",P80001571,C00489799,2142023.0,200,P2024,PAC,NY,24A
17349,TIM SCOTT FOR AMERICA,P40012155,C00514026,6292023.0,2000,P2024,CCM,SC,24K
18170,BIDEN FOR PRESIDENT,P80000722,C00003251,5172023.0,5000,P2024,CCM,PA,24K
21555,"LEAGUE OF CONSERVATION VOTERS, INC.",P40013039,C00486845,5242023.0,39,P2024,ORG,DC,24A
21593,BUMPERACTIVE,P80000722,C90010620,5092023.0,2089,P2024,ORG,TX,24E


In [18]:
df_presidential['STATE'].unique()

array(['IL', 'MA', 'AL', 'NY', 'SC', 'PA', 'DC', 'TX', 'CA', 'IA', 'MD',
       'MO', 'NC', 'VA', 'ME', 'GA', 'IN', 'FL', 'CO', 'OH', 'AZ', 'NJ',
       'NM', 'AR', 'NV', 'LA', 'KY', 'VT', 'NH', 'WY', 'MI', 'ZZ', 'MN',
       'VI', 'DE', nan, 'UT', 'CT', 'MT', 'WA', 'TN', 'WI', 'OR', 'OK',
       'RI', 'AK', 'KS', 'WV', 'NE', 'HI', 'MS', 'ID', 'ND'], dtype=object)

In [19]:
# Filter to US states only
us_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
             'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
             'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
             'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
             'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC']

df_presidential = df_presidential[df_presidential['STATE'].isin(us_states)]

In [20]:
df_presidential.shape

(44904, 9)

In [21]:
df_presidential.isnull().sum()

NAME                  0
CAND_ID               0
CMTE_ID               0
TRANSACTION_DT     3019
TRANSACTION_AMT       0
TRANSACTION_PGI       0
ENTITY_TP             2
STATE                 0
TRANSACTION_TP        0
dtype: int64

In [22]:
df_presidential = df_presidential.dropna(subset=['TRANSACTION_DT'])

In [23]:
df_presidential.shape

(41885, 9)

In [24]:
df_presidential.isnull().sum()

NAME               0
CAND_ID            0
CMTE_ID            0
TRANSACTION_DT     0
TRANSACTION_AMT    0
TRANSACTION_PGI    0
ENTITY_TP          2
STATE              0
TRANSACTION_TP     0
dtype: int64

In [25]:
df_presidential['ENTITY_TP'].unique()

array(['ORG', 'CCM', 'PAC', 'CAN', 'IND', 'COM', 'PTY', nan], dtype=object)

In [26]:
df_presidential['ENTITY_TP'] = df_presidential['ENTITY_TP'].fillna('IND')

In [27]:
# Map entity type codes to full names
entity_mapping = {
    'IND': 'Individual',
    'COM': 'Committee',
    'PAC': 'Political Action Committee',
    'ORG': 'Organization',
    'CCM': 'Candidate Committee',
    'CAN': 'Candidate',
    'PTY': 'Party Organization'
}

df_presidential['ENTITY_TP'] = df_presidential['ENTITY_TP'].map(entity_mapping)

In [28]:
df_presidential.isnull().sum()

NAME               0
CAND_ID            0
CMTE_ID            0
TRANSACTION_DT     0
TRANSACTION_AMT    0
TRANSACTION_PGI    0
ENTITY_TP          0
STATE              0
TRANSACTION_TP     0
dtype: int64

In [29]:
# Map state codes to full names
state_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia'
}

df_presidential['STATE'] = df_presidential['STATE'].map(state_mapping)

In [30]:
df_presidential['TRANSACTION_DT'] = pd.to_datetime(df_presidential['TRANSACTION_DT'], format='%m%d%Y')

In [31]:
df_presidential.dtypes

NAME                       object
CAND_ID                    object
CMTE_ID                    object
TRANSACTION_DT     datetime64[ns]
TRANSACTION_AMT             int64
TRANSACTION_PGI            object
ENTITY_TP                  object
STATE                      object
TRANSACTION_TP             object
dtype: object

In [32]:
df_presidential.to_csv("Expen_Can_Com_Indep_2023_2024.csv", index=False)