In [1]:
#(MASTER)
import os
import pandas as pd
import csv

input_file = "itpas2019_2020.txt"
output_file = "itpas2019_2020.csv"

# FEC column names (21 fields)
# FEC PAS2 File Columns

columns = [
    'CMTE_ID',
    'AMNDT_IND',
    'RPT_TP',
    'TRANSACTION_PGI',
    'IMAGE_NUM',
    'TRANSACTION_TP',
    'ENTITY_TP',
    'NAME',
    'CITY',
    'STATE',
    'ZIP_CODE',
    'EMPLOYER',
    'OCCUPATION',
    'TRANSACTION_DT',
    'TRANSACTION_AMT',
    'OTHER_ID',
    'CAND_ID',
    'TRAN_ID',
    'FILE_NUM',
    'MEMO_CD',
    'MEMO_TEXT',
    'SUB_ID'
]


# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    total = 0
    with open(path, "rb") as fh:
        while True:
            block = fh.read(bufsize)
            if not block:
                break
            total += block.count(b"\n")
    return total

print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # safe for 8 GB RAM
first_chunk = True
rows_processed = 0

reader = pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,
    chunksize=chunksize,
    engine="python",        # tolerant parser
    on_bad_lines="skip",    # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE, # treat quotes literally
    escapechar="\\"
)

for chunk in reader:
    # Write incrementally
    chunk.to_csv(output_file, mode="a", index=False, header=first_chunk)
    first_chunk = False

    # Update progress
    rows_processed += len(chunk)
    pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
    print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 887,829
‚úÖ Processed 500,000 rows (56.32%)
‚úÖ Processed 887,829 rows (100.00%)

üéâ Conversion complete!
üíæ Saved as: itpas2019_2020.csv
üìä Total rows written: 887,829


In [2]:
import pandas as pd

d1 = pd.read_csv("itpas2019_2020.csv")

  d1 = pd.read_csv("itpas2019_2020.csv")


In [3]:
d1.columns

Index(['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
       'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
       'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
       'OTHER_ID', 'CAND_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT',
       'SUB_ID'],
      dtype='object')

In [4]:
keep_cols = ['CMTE_ID', 'NAME', 'TRANSACTION_DT', 'TRANSACTION_AMT', 'CAND_ID','ENTITY_TP']

In [5]:
# Relevant columns for your project
relevant_columns = [
    'NAME',
    'CAND_ID',
    'CMTE_ID',
    'TRANSACTION_DT',
    'TRANSACTION_AMT',
    'TRANSACTION_PGI',
    'ENTITY_TP',
    'STATE',
    'TRANSACTION_TP'
]

# Filter your dataframe
d1_filtered = d1[relevant_columns]

In [6]:
d1_filtered.head(20)

Unnamed: 0,NAME,CAND_ID,CMTE_ID,TRANSACTION_DT,TRANSACTION_AMT,TRANSACTION_PGI,ENTITY_TP,STATE,TRANSACTION_TP
0,TED YOHO FOR CONGRESS,H2FL06109,C00567180,1082019.0,1880,P2020,PAC,FL,24K
1,TEAM GRAHAM INC,H4SC03087,C00104885,12202018.0,3000,G2020,CCM,SC,24K
2,TIM SCOTT FOR SENATE,H0SC01279,C00104885,12202018.0,1000,P2022,CCM,SC,24K
3,FRIENDS OF JIM CLYBURN,H2SC02042,C00104885,1072019.0,470,P2020,CCM,SC,24K
4,JACKSON ADVOCATE,S8MS00287,C00688408,12122018.0,1000,G2018,ORG,MS,24E
5,FRIENDS OF MARK WARNER,S6VA00093,C00414425,1292019.0,1000,P,CCM,VA,24K
6,EMMER FOR CONGRESS,H4MN06087,C00325324,1252019.0,1500,P2020,CCM,MN,24K
7,CLAY JR. FOR CONGRESS,H0MO01066,C00366013,1102019.0,1000,P2020,CCM,MO,24K
8,MALONEY FOR CONGRESS,H2NY14037,C00366013,1022019.0,2500,P2020,CCM,NY,24K
9,MCHENRY FOR CONGRESS,H4NC10047,C00366013,1102019.0,1000,P2020,CCM,NC,24K


In [7]:
d1_filtered.shape

(887829, 9)

In [8]:
d1_filtered.isnull().sum()

NAME               1342
CAND_ID            1867
CMTE_ID               0
TRANSACTION_DT     5606
TRANSACTION_AMT       0
TRANSACTION_PGI    1860
ENTITY_TP          5159
STATE              1400
TRANSACTION_TP        0
dtype: int64

In [9]:
d1_filtered['TRANSACTION_PGI'].unique()

array(['P2020', 'G2020', 'P2022', 'G2018', 'P', 'P2024', 'P2019', 'P2018',
       'C2020', 'G2019', 'P2015', 'G2022', 'S2018', 'G2016', 'O2018',
       'G2024', 'E2018', 'P2016', 'G2014', 'C2018', 'G', 'S2020', 'O2020',
       'S2019', nan, 'P2014', 'P2017', 'P2021', 'R2018', 'R2016', 'P1000',
       'O2019', 'G2006', 'P2012', 'G2012', 'S2010', 'R2014', 'G5000', 'S',
       'R2019', 'P2013', 'S2015', 'P2040', 'G1000', 'R2020', 'C2022',
       'G2017', 'P2010', 'P3020', 'G2021', 'E2020', 'O', 'P5000', 'G2005',
       'P2026', 'P202', 'G1300', 'C2016', '2020', 'P019', '2019', 'C2019',
       'P020', 'S2017', 'C2024', 'G2025', 'G2002', 'G2015', 'P2023',
       'G2010', 'G2008', 'P2002', 'P220', 'P2000', 'G2200', 'P2008',
       'G2000', 'G1500', 'R2017', 'G2075', 'P1500', 'O2024', 'P2075',
       'P2025', 'P2929', 'P2030', 'G2500', 'G2029', 'S2024', 'P2202',
       'R2021', 'G2929', 'R', 'G2800', 'G2202', 'S2021', '02020', 'P3000',
       'G200', 'G2092', 'O2021', 'G2026', 'R2024', 'G3000

In [10]:
# Filter for presidential elections only
presidential_cycles = ['P2020', 'G2020', 'P2024', 'G2024']
df_presidential = d1_filtered[d1_filtered['TRANSACTION_PGI'].isin(presidential_cycles)]

In [11]:
df_presidential.shape

(855278, 9)

In [12]:
df_presidential.to_csv("itpas2019_2020_filtered.csv", index=False)

In [13]:
df_presidential['CAND_ID'].unique()

array(['H2FL06109', 'H4SC03087', 'H2SC02042', ..., 'H8IA04163',
       'S4FL00611', 'S2MO00593'], dtype=object)

In [14]:
df_presidential = df_presidential[df_presidential['TRANSACTION_PGI'].isin(['P2020', 'G2020', 'P2024', 'G2024'])]
df_presidential = df_presidential[df_presidential['CAND_ID'].str.startswith('P', na=False)]

In [15]:
df_presidential.shape

(308294, 9)

In [16]:
df_presidential.isnull().sum()

NAME                 44
CAND_ID               0
CMTE_ID               0
TRANSACTION_DT     1655
TRANSACTION_AMT       0
TRANSACTION_PGI       0
ENTITY_TP           506
STATE               115
TRANSACTION_TP        0
dtype: int64

In [17]:
df_presidential.head(20)

Unnamed: 0,NAME,CAND_ID,CMTE_ID,TRANSACTION_DT,TRANSACTION_AMT,TRANSACTION_PGI,ENTITY_TP,STATE,TRANSACTION_TP
1083,CAMPAIGN SOLUTIONS,P80001571,C00608489,1012019.0,20000,G2020,ORG,VA,24E
1084,CONNELL DONATELLI INC.,P80001571,C00608489,1012019.0,2000,G2020,ORG,VA,24E
1085,POLITICAL LIST BROKERS LLC,P80001571,C00608489,1012019.0,10000,G2020,ORG,VA,24E
1086,RIGHT COUNTRY LISTS,P80001571,C00608489,1012019.0,1000,G2020,ORG,VA,24E
1087,INFOCISION MANAGEMENT CORPORATION,P80001571,C00608489,1012019.0,35000,G2020,ORG,OH,24E
1088,MESSAGE MADE EASY LLC,P80001571,C00608489,1012019.0,1000,G2020,ORG,OH,24E
1089,CAMPAIGN SOLUTIONS,P80001571,C00608489,1102019.0,12633,G2020,ORG,VA,24E
1090,RRTVMEDIA LLC,P80001571,C00608489,1012019.0,97000,G2020,ORG,FL,24E
1091,CAMPAIGN SOLUTIONS,P80001571,C00608489,1312019.0,14944,G2020,ORG,VA,24E
1092,CAMPAIGN SOLUTIONS,P80001571,C00608489,1312019.0,9254,G2020,ORG,VA,24E


In [18]:
df_presidential['STATE'].unique()

array(['VA', 'OH', 'FL', 'KY', 'CA', 'IA', 'NJ', 'DC', 'MN', 'NY', 'AZ',
       'MD', 'TX', 'NC', 'MA', 'IL', 'WA', 'CO', 'MI', 'SC', 'OR', 'GA',
       'AL', 'MO', 'WI', 'PA', 'IN', nan, 'NV', 'RI', 'WY', 'PR', 'UT',
       'CT', 'KS', 'VT', 'OK', 'DE', 'LA', 'NH', 'WV', 'TN', 'NE', 'ME',
       'ZZ', 'MS', 'AK', 'MT', 'ID', 'AA', 'AR', 'HI', 'NM', 'ND'],
      dtype=object)

In [19]:
# Filter to US states only
us_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
             'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
             'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
             'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
             'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC']

df_presidential = df_presidential[df_presidential['STATE'].isin(us_states)]

In [20]:
df_presidential.shape

(308141, 9)

In [21]:
df_presidential.isnull().sum()

NAME                  0
CAND_ID               0
CMTE_ID               0
TRANSACTION_DT     1645
TRANSACTION_AMT       0
TRANSACTION_PGI       0
ENTITY_TP           462
STATE                 0
TRANSACTION_TP        0
dtype: int64

In [22]:
df_presidential = df_presidential.dropna(subset=['TRANSACTION_DT'])

In [23]:
df_presidential.shape

(306496, 9)

In [24]:
df_presidential.isnull().sum()

NAME                 0
CAND_ID              0
CMTE_ID              0
TRANSACTION_DT       0
TRANSACTION_AMT      0
TRANSACTION_PGI      0
ENTITY_TP          420
STATE                0
TRANSACTION_TP       0
dtype: int64

In [25]:
df_presidential['ENTITY_TP'].unique()

array(['ORG', 'CCM', 'IND', 'COM', 'CAN', 'PAC', nan, 'PTY'], dtype=object)

In [26]:
df_presidential['ENTITY_TP'] = df_presidential['ENTITY_TP'].fillna('IND')

In [27]:
# Map entity type codes to full names
entity_mapping = {
    'IND': 'Individual',
    'COM': 'Committee',
    'PAC': 'Political Action Committee',
    'ORG': 'Organization',
    'CCM': 'Candidate Committee',
    'CAN': 'Candidate',
    'PTY': 'Party Organization'
}

df_presidential['ENTITY_TP'] = df_presidential['ENTITY_TP'].map(entity_mapping)

In [28]:
df_presidential.isnull().sum()

NAME               0
CAND_ID            0
CMTE_ID            0
TRANSACTION_DT     0
TRANSACTION_AMT    0
TRANSACTION_PGI    0
ENTITY_TP          0
STATE              0
TRANSACTION_TP     0
dtype: int64

In [29]:
# Map state codes to full names
state_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia'
}

df_presidential['STATE'] = df_presidential['STATE'].map(state_mapping)

In [30]:
df_presidential['TRANSACTION_DT'] = pd.to_datetime(df_presidential['TRANSACTION_DT'], format='%m%d%Y')

In [31]:
df_presidential.dtypes

NAME                       object
CAND_ID                    object
CMTE_ID                    object
TRANSACTION_DT     datetime64[ns]
TRANSACTION_AMT             int64
TRANSACTION_PGI            object
ENTITY_TP                  object
STATE                      object
TRANSACTION_TP             object
dtype: object

In [32]:
df_presidential.to_csv("Expen_Can_Com_Indep_2019_2020.csv", index=False)