In [2]:
#(MASTER)
import os
import pandas as pd
import csv

# Input text file to convert and the CSV file the conversion produces.
input_file = "weball20.txt"
output_file = "weball_2019_2020.csv"

# FEC column names, in file order (30 fields). They are passed to the parser
# as `names=`, so the order here must match the field order in the input.
columns = [
    "CAND_ID", "CAND_NAME", "CAND_ICI", "PTY_CD", "CAND_PTY_AFFILIATION",
    "TTL_RECEIPTS", "TRANS_FROM_AUTH", "TTL_DISB", "TRANS_TO_AUTH",
    "COH_BOP", "COH_COP",
    "CAND_CONTRIB", "CAND_LOANS", "OTHER_LOANS",
    "CAND_LOAN_REPAY", "OTHER_LOAN_REPAY", "DEBTS_OWED_BY",
    "TTL_INDIV_CONTRIB",
    "CAND_OFFICE_ST", "CAND_OFFICE_DISTRICT",
    "SPEC_ELECTION", "PRIM_ELECTION", "RUN_ELECTION",
    "GEN_ELECTION", "GEN_ELECTION_PERCENT",
    "OTHER_POL_CMTE_CONTRIB", "POL_PTY_CONTRIB",
    "CVG_END_DT", "INDIV_REFUNDS", "CMTE_REFUNDS",
]

# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    """Return the number of lines in the file at *path*.

    The file is read in binary mode in blocks of *bufsize* bytes, so it is
    never decoded or loaded into memory as a whole. Every ``\\n``-terminated
    line is counted (blank lines included), plus a final line that has
    content but no terminating newline.

    Args:
        path: Path of the file to scan.
        bufsize: Number of bytes to read per block.

    Returns:
        The line count as an int; 0 for an empty file.
    """
    total = 0
    last_byte = b""
    with open(path, "rb") as fh:
        # iter(callable, sentinel) stops at the first empty read, i.e. EOF.
        for block in iter(lambda: fh.read(bufsize), b""):
            total += block.count(b"\n")
            last_byte = block[-1:]
    # A non-empty file that does not end in "\n" has one more line than it
    # has newline bytes; without this the last record would go uncounted.
    if last_byte and last_byte != b"\n":
        total += 1
    return total

# Pre-scan the input so progress below can be reported as a percentage.
print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # rows per chunk; safe for 8 GB RAM
first_chunk = True
rows_processed = 0

# The chunked reader is used as a context manager so the input file handle is
# closed when the loop finishes or raises.
# NOTE(review): on_bad_lines="skip" and encoding_errors="ignore" drop data
# silently, so "rows written" can end up lower than "lines detected".
# NOTE(review): escapechar="\\" makes the parser consume backslashes in the
# input — confirm that is intended for this data.
with pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,               # keep every field as text; no type inference
    chunksize=chunksize,
    engine="python",         # tolerant parser
    on_bad_lines="skip",     # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE,  # treat quotes literally
    escapechar="\\",
) as reader:
    for chunk in reader:
        # The first chunk truncates the output file and writes the header;
        # later chunks append. Appending unconditionally would stack a second
        # header and a duplicate copy of the data onto an output file left
        # over from a previous run.
        chunk.to_csv(
            output_file,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
        )
        first_chunk = False

        # Update progress
        rows_processed += len(chunk)
        pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 3,980
‚úÖ Processed 3,980 rows (100.00%)

üéâ Conversion complete!
üíæ Saved as: weball_2019_2020.csv
üìä Total rows written: 3,980
