In [1]:
#(MASTER)
import os
import pandas as pd
import csv

# Column names handed to pd.read_csv through `names=` (21 entries, in file order).
columns = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID",
]

# Pipe-delimited text file to read, and the CSV file this cell writes.
input_file = "itcont(2021_2022).txt"
output_file = "itcont_2021_2022.csv"

# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    """Count the lines in a file by scanning it in fixed-size binary blocks.

    The file is read as raw bytes, so no decoding takes place and at most
    ``bufsize`` bytes are held in memory at a time.

    Args:
        path: Path of the file to scan; passed straight to ``open``.
        bufsize: Number of bytes requested per read (64 MB by default).

    Returns:
        The number of ``\\n`` bytes in the file, plus one when the file is
        non-empty and its last byte is not ``\\n`` (an unterminated final
        line is still a line). An empty file yields 0.
    """
    total = 0
    last_byte = b""
    with open(path, "rb") as fh:
        # iter() with a sentinel stops as soon as read() returns b"" at EOF.
        for block in iter(lambda: fh.read(bufsize), b""):
            total += block.count(b"\n")
            last_byte = block[-1:]
    # Counting newlines alone misses a final line that has no terminator.
    if last_byte and last_byte != b"\n":
        total += 1
    return total

# Pre-scan the input so the conversion loop can report a percentage.
print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # safe for 8 GB RAM
first_chunk = True
rows_processed = 0

# The chunked reader is used as a context manager so the input file handle is
# closed even if writing a chunk fails part-way through.
# NOTE(review): with escapechar set, a backslash directly before '|' makes the
# parser treat that pipe as data rather than a delimiter — confirm this is wanted.
with pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,
    chunksize=chunksize,
    engine="python",        # tolerant parser
    on_bad_lines="skip",    # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE, # treat quotes literally
    escapechar="\\"
) as reader:
    for chunk in reader:
        # The first chunk truncates the output (mode "w") and writes the header;
        # later chunks append. Appending from the start would stack a second copy
        # of the data, plus a stray header row, onto the CSV left by an earlier run.
        chunk.to_csv(
            output_file,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
        )
        first_chunk = False

        # Update progress. Only rows that were parsed and written are counted, so
        # lines dropped by on_bad_lines="skip" leave the final figure below 100%.
        rows_processed += len(chunk)
        pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 63,885,978
‚úÖ Processed 500,000 rows (0.78%)
‚úÖ Processed 1,000,000 rows (1.57%)
‚úÖ Processed 1,500,000 rows (2.35%)
‚úÖ Processed 2,000,000 rows (3.13%)
‚úÖ Processed 2,500,000 rows (3.91%)
‚úÖ Processed 3,000,000 rows (4.70%)
‚úÖ Processed 3,500,000 rows (5.48%)
‚úÖ Processed 4,000,000 rows (6.26%)
‚úÖ Processed 4,500,000 rows (7.04%)
‚úÖ Processed 5,000,000 rows (7.83%)
‚úÖ Processed 5,500,000 rows (8.61%)
‚úÖ Processed 6,000,000 rows (9.39%)
‚úÖ Processed 6,500,000 rows (10.17%)
‚úÖ Processed 7,000,000 rows (10.96%)
‚úÖ Processed 7,500,000 rows (11.74%)
‚úÖ Processed 8,000,000 rows (12.52%)
‚úÖ Processed 8,500,000 rows (13.30%)
‚úÖ Processed 9,000,000 rows (14.09%)
‚úÖ Processed 9,500,000 rows (14.87%)
‚úÖ Processed 10,000,000 rows (15.65%)
‚úÖ Processed 10,500,000 rows (16.44%)
‚úÖ Processed 11,000,000 rows (17.22%)
‚úÖ Processed 11,500,000 rows (18.00%)
‚úÖ Processed 12,000,000 rows (18.78%)
‚úÖ Processed 12,

In [2]:
#(MASTER)
import os
import pandas as pd
import csv

# Column names handed to pd.read_csv through `names=` (21 entries, in file order).
columns = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID",
]

# Pipe-delimited text file to read, and the CSV file this cell writes.
input_file = "itcont(2023_2024).txt"
output_file = "itcont_2023_2024.csv"

# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    """Count the lines in a file by scanning it in fixed-size binary blocks.

    The file is read as raw bytes, so no decoding takes place and at most
    ``bufsize`` bytes are held in memory at a time.

    Args:
        path: Path of the file to scan; passed straight to ``open``.
        bufsize: Number of bytes requested per read (64 MB by default).

    Returns:
        The number of ``\\n`` bytes in the file, plus one when the file is
        non-empty and its last byte is not ``\\n`` (an unterminated final
        line is still a line). An empty file yields 0.
    """
    total = 0
    last_byte = b""
    with open(path, "rb") as fh:
        # iter() with a sentinel stops as soon as read() returns b"" at EOF.
        for block in iter(lambda: fh.read(bufsize), b""):
            total += block.count(b"\n")
            last_byte = block[-1:]
    # Counting newlines alone misses a final line that has no terminator.
    if last_byte and last_byte != b"\n":
        total += 1
    return total

# Pre-scan the input so the conversion loop can report a percentage.
print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # safe for 8 GB RAM
first_chunk = True
rows_processed = 0

# The chunked reader is used as a context manager so the input file handle is
# closed even if writing a chunk fails part-way through.
# NOTE(review): with escapechar set, a backslash directly before '|' makes the
# parser treat that pipe as data rather than a delimiter — confirm this is wanted.
with pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,
    chunksize=chunksize,
    engine="python",        # tolerant parser
    on_bad_lines="skip",    # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE, # treat quotes literally
    escapechar="\\"
) as reader:
    for chunk in reader:
        # The first chunk truncates the output (mode "w") and writes the header;
        # later chunks append. Appending from the start would stack a second copy
        # of the data, plus a stray header row, onto the CSV left by an earlier run.
        chunk.to_csv(
            output_file,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
        )
        first_chunk = False

        # Update progress. Only rows that were parsed and written are counted, so
        # lines dropped by on_bad_lines="skip" leave the final figure below 100%.
        rows_processed += len(chunk)
        pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 58,271,239
‚úÖ Processed 500,000 rows (0.86%)
‚úÖ Processed 1,000,000 rows (1.72%)
‚úÖ Processed 1,500,000 rows (2.57%)
‚úÖ Processed 2,000,000 rows (3.43%)
‚úÖ Processed 2,500,000 rows (4.29%)
‚úÖ Processed 3,000,000 rows (5.15%)
‚úÖ Processed 3,500,000 rows (6.01%)
‚úÖ Processed 4,000,000 rows (6.86%)
‚úÖ Processed 4,500,000 rows (7.72%)
‚úÖ Processed 5,000,000 rows (8.58%)
‚úÖ Processed 5,500,000 rows (9.44%)
‚úÖ Processed 6,000,000 rows (10.30%)
‚úÖ Processed 6,500,000 rows (11.15%)
‚úÖ Processed 7,000,000 rows (12.01%)
‚úÖ Processed 7,500,000 rows (12.87%)
‚úÖ Processed 8,000,000 rows (13.73%)
‚úÖ Processed 8,500,000 rows (14.59%)
‚úÖ Processed 9,000,000 rows (15.45%)
‚úÖ Processed 9,500,000 rows (16.30%)
‚úÖ Processed 10,000,000 rows (17.16%)
‚úÖ Processed 10,500,000 rows (18.02%)
‚úÖ Processed 11,000,000 rows (18.88%)
‚úÖ Processed 11,500,000 rows (19.74%)
‚úÖ Processed 12,000,000 rows (20.59%)
‚úÖ Processed 12

In [3]:
#(MASTER 2019-2020)
import os
import pandas as pd
import csv

# Column names handed to pd.read_csv through `names=` (21 entries, in file order).
columns = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID",
]

# Pipe-delimited text file to read, and the CSV file this cell writes.
input_file = "itcont(2019_2020).txt"
output_file = "itcont_2019_2020.csv"

# ---------- 1) Fast line count (for percentage progress) ----------
def count_lines(path, bufsize=64 * 1024 * 1024):  # 64 MB blocks
    """Count the lines in a file by scanning it in fixed-size binary blocks.

    The file is read as raw bytes, so no decoding takes place and at most
    ``bufsize`` bytes are held in memory at a time.

    Args:
        path: Path of the file to scan; passed straight to ``open``.
        bufsize: Number of bytes requested per read (64 MB by default).

    Returns:
        The number of ``\\n`` bytes in the file, plus one when the file is
        non-empty and its last byte is not ``\\n`` (an unterminated final
        line is still a line). An empty file yields 0.
    """
    total = 0
    last_byte = b""
    with open(path, "rb") as fh:
        # iter() with a sentinel stops as soon as read() returns b"" at EOF.
        for block in iter(lambda: fh.read(bufsize), b""):
            total += block.count(b"\n")
            last_byte = block[-1:]
    # Counting newlines alone misses a final line that has no terminator.
    if last_byte and last_byte != b"\n":
        total += 1
    return total

# Pre-scan the input so the conversion loop can report a percentage.
print("üìè Counting total lines (quick scan)...")
total_rows = count_lines(input_file)
print(f"üî¢ Total lines detected: {total_rows:,}")

# ---------- 2) Stream convert with progress ----------
chunksize = 500_000  # safe for 8 GB RAM
first_chunk = True
rows_processed = 0

# The chunked reader is used as a context manager so the input file handle is
# closed even if writing a chunk fails part-way through.
# NOTE(review): with escapechar set, a backslash directly before '|' makes the
# parser treat that pipe as data rather than a delimiter — confirm this is wanted.
with pd.read_csv(
    input_file,
    sep="|",
    names=columns,
    dtype=str,
    chunksize=chunksize,
    engine="python",        # tolerant parser
    on_bad_lines="skip",    # skip malformed rows (e.g., extra '|')
    encoding="utf-8",
    encoding_errors="ignore",
    quoting=csv.QUOTE_NONE, # treat quotes literally
    escapechar="\\"
) as reader:
    for chunk in reader:
        # The first chunk truncates the output (mode "w") and writes the header;
        # later chunks append. Appending from the start would stack a second copy
        # of the data, plus a stray header row, onto the CSV left by an earlier run.
        chunk.to_csv(
            output_file,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
        )
        first_chunk = False

        # Update progress. Only rows that were parsed and written are counted, so
        # lines dropped by on_bad_lines="skip" leave the final figure below 100%.
        rows_processed += len(chunk)
        pct = (rows_processed / total_rows) * 100 if total_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Conversion complete!\nüíæ Saved as: {output_file}\nüìä Total rows written: {rows_processed:,}")


üìè Counting total lines (quick scan)...
üî¢ Total lines detected: 69,377,425
‚úÖ Processed 500,000 rows (0.72%)
‚úÖ Processed 1,000,000 rows (1.44%)
‚úÖ Processed 1,500,000 rows (2.16%)
‚úÖ Processed 2,000,000 rows (2.88%)
‚úÖ Processed 2,500,000 rows (3.60%)
‚úÖ Processed 3,000,000 rows (4.32%)
‚úÖ Processed 3,500,000 rows (5.04%)
‚úÖ Processed 4,000,000 rows (5.77%)
‚úÖ Processed 4,500,000 rows (6.49%)
‚úÖ Processed 5,000,000 rows (7.21%)
‚úÖ Processed 5,500,000 rows (7.93%)
‚úÖ Processed 6,000,000 rows (8.65%)
‚úÖ Processed 6,500,000 rows (9.37%)
‚úÖ Processed 7,000,000 rows (10.09%)
‚úÖ Processed 7,500,000 rows (10.81%)
‚úÖ Processed 8,000,000 rows (11.53%)
‚úÖ Processed 8,500,000 rows (12.25%)
‚úÖ Processed 9,000,000 rows (12.97%)
‚úÖ Processed 9,500,000 rows (13.69%)
‚úÖ Processed 10,000,000 rows (14.41%)
‚úÖ Processed 10,500,000 rows (15.13%)
‚úÖ Processed 11,000,000 rows (15.86%)
‚úÖ Processed 11,500,000 rows (16.58%)
‚úÖ Processed 12,000,000 rows (17.30%)
‚úÖ Processed 12,5