In [1]:
#(MASTER 2023 -2024)
import pandas as pd
from pathlib import Path

# Input CSV to read, and the path the column-filtered copy is written to.
INPUT_FILE = Path("itcont_2023_2024.csv")
OUTPUT_FILE = Path("itcont_2023_2024_filtered.csv")

# Columns carried over to the output file.
KEEP_COLS = [
    # Committee link, transaction date (timeline analysis), dollar amount
    "CMTE_ID", "TRANSACTION_DT", "TRANSACTION_AMT",
    # Geography: state, city, ZIP code
    "STATE", "CITY", "ZIP_CODE",
    # Entity type (individual vs organization), transaction type (filter refunds)
    "ENTITY_TP", "TRANSACTION_TP",
    # Donor identification and profile
    "NAME", "EMPLOYER", "OCCUPATION",
    # Memo code (avoid double-counting) and unique record identifier
    "MEMO_CD", "SUB_ID",
]
# Count total rows (for progress display)
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    Every physical line is counted, so for a CSV the header line is included
    in the result. The file is decoded as UTF-8 and undecodable bytes are
    ignored rather than raising.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the file to scan.

    Returns:
        int: Number of lines; ``0`` for an empty file.
    """
    # Pre-bind the counter: an empty file never enters the loop, and the
    # loop variable would otherwise be undefined at the return statement.
    line_count = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line_count, _ in enumerate(f, 1):
            pass
    return line_count

# Count physical lines up front; the figure is used only for progress display.
print("üîç Counting total rows...")
total_rows = count_lines(INPUT_FILE)
print(f"üìä Total rows: {total_rows:,}\n")

# count_lines() includes the CSV header line, which pandas never yields as a
# data row. Exclude it from the denominator so progress can reach 100%.
data_rows = max(total_rows - 1, 0)

# Read in chunks
chunksize = 250_000
rows_processed = 0

# The loop below appends, so remove any output left over from a previous run.
if OUTPUT_FILE.exists():
    OUTPUT_FILE.unlink()

# Only the first chunk written carries the header row.
write_header = True

# Stream through the file chunk-by-chunk. `usecols` makes the parser skip the
# columns that would be discarded anyway, and the context manager closes the
# underlying file handle even if a chunk fails part-way through.
with pd.read_csv(
    INPUT_FILE,
    dtype=str,
    usecols=KEEP_COLS,
    chunksize=chunksize,
    low_memory=False,
) as reader:
    for chunk in reader:
        # `usecols` keeps the file's column order; re-select to get KEEP_COLS order.
        chunk = chunk[KEEP_COLS]

        # Append to new CSV
        chunk.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
        write_header = False

        # Progress tracking
        rows_processed += len(chunk)
        pct = (rows_processed / data_rows) * 100 if data_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Done! File saved as: {OUTPUT_FILE}")


üîç Counting total rows...
üìä Total rows: 58,271,240

‚úÖ Processed 250,000 rows (0.43%)
‚úÖ Processed 500,000 rows (0.86%)
‚úÖ Processed 750,000 rows (1.29%)
‚úÖ Processed 1,000,000 rows (1.72%)
‚úÖ Processed 1,250,000 rows (2.15%)
‚úÖ Processed 1,500,000 rows (2.57%)
‚úÖ Processed 1,750,000 rows (3.00%)
‚úÖ Processed 2,000,000 rows (3.43%)
‚úÖ Processed 2,250,000 rows (3.86%)
‚úÖ Processed 2,500,000 rows (4.29%)
‚úÖ Processed 2,750,000 rows (4.72%)
‚úÖ Processed 3,000,000 rows (5.15%)
‚úÖ Processed 3,250,000 rows (5.58%)
‚úÖ Processed 3,500,000 rows (6.01%)
‚úÖ Processed 3,750,000 rows (6.44%)
‚úÖ Processed 4,000,000 rows (6.86%)
‚úÖ Processed 4,250,000 rows (7.29%)
‚úÖ Processed 4,500,000 rows (7.72%)
‚úÖ Processed 4,750,000 rows (8.15%)
‚úÖ Processed 5,000,000 rows (8.58%)
‚úÖ Processed 5,250,000 rows (9.01%)
‚úÖ Processed 5,500,000 rows (9.44%)
‚úÖ Processed 5,750,000 rows (9.87%)
‚úÖ Processed 6,000,000 rows (10.30%)
‚úÖ Processed 6,250,000 rows (10.73%)
‚úÖ Processed 6,500,00

In [None]:
#2021 to 2022

In [2]:
#(MASTER)
import pandas as pd
from pathlib import Path

# Input CSV to read, and the path the column-filtered copy is written to.
INPUT_FILE = Path("itcont_2021_2022.csv")
OUTPUT_FILE = Path("itcont_2021_2022_filtered.csv")

# Columns carried over to the output file.
KEEP_COLS = [
    # Committee link, transaction date (timeline analysis), dollar amount
    "CMTE_ID", "TRANSACTION_DT", "TRANSACTION_AMT",
    # Geography: state, city, ZIP code
    "STATE", "CITY", "ZIP_CODE",
    # Entity type (individual vs organization), transaction type (filter refunds)
    "ENTITY_TP", "TRANSACTION_TP",
    # Donor identification and profile
    "NAME", "EMPLOYER", "OCCUPATION",
    # Memo code (avoid double-counting) and unique record identifier
    "MEMO_CD", "SUB_ID",
]

# Count total rows (for progress display)
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    Every physical line is counted, so for a CSV the header line is included
    in the result. The file is decoded as UTF-8 and undecodable bytes are
    ignored rather than raising.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the file to scan.

    Returns:
        int: Number of lines; ``0`` for an empty file.
    """
    # Pre-bind the counter: an empty file never enters the loop, and the
    # loop variable would otherwise be undefined at the return statement.
    line_count = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line_count, _ in enumerate(f, 1):
            pass
    return line_count

# Count physical lines up front; the figure is used only for progress display.
print("üîç Counting total rows...")
total_rows = count_lines(INPUT_FILE)
print(f"üìä Total rows: {total_rows:,}\n")

# count_lines() includes the CSV header line, which pandas never yields as a
# data row. Exclude it from the denominator so progress can reach 100%.
data_rows = max(total_rows - 1, 0)

# Read in chunks
chunksize = 250_000
rows_processed = 0

# The loop below appends, so remove any output left over from a previous run.
if OUTPUT_FILE.exists():
    OUTPUT_FILE.unlink()

# Only the first chunk written carries the header row.
write_header = True

# Stream through the file chunk-by-chunk. `usecols` makes the parser skip the
# columns that would be discarded anyway, and the context manager closes the
# underlying file handle even if a chunk fails part-way through.
with pd.read_csv(
    INPUT_FILE,
    dtype=str,
    usecols=KEEP_COLS,
    chunksize=chunksize,
    low_memory=False,
) as reader:
    for chunk in reader:
        # `usecols` keeps the file's column order; re-select to get KEEP_COLS order.
        chunk = chunk[KEEP_COLS]

        # Append to new CSV
        chunk.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
        write_header = False

        # Progress tracking
        rows_processed += len(chunk)
        pct = (rows_processed / data_rows) * 100 if data_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Done! File saved as: {OUTPUT_FILE}")


üîç Counting total rows...
üìä Total rows: 63,885,979

‚úÖ Processed 250,000 rows (0.39%)
‚úÖ Processed 500,000 rows (0.78%)
‚úÖ Processed 750,000 rows (1.17%)
‚úÖ Processed 1,000,000 rows (1.57%)
‚úÖ Processed 1,250,000 rows (1.96%)
‚úÖ Processed 1,500,000 rows (2.35%)
‚úÖ Processed 1,750,000 rows (2.74%)
‚úÖ Processed 2,000,000 rows (3.13%)
‚úÖ Processed 2,250,000 rows (3.52%)
‚úÖ Processed 2,500,000 rows (3.91%)
‚úÖ Processed 2,750,000 rows (4.30%)
‚úÖ Processed 3,000,000 rows (4.70%)
‚úÖ Processed 3,250,000 rows (5.09%)
‚úÖ Processed 3,500,000 rows (5.48%)
‚úÖ Processed 3,750,000 rows (5.87%)
‚úÖ Processed 4,000,000 rows (6.26%)
‚úÖ Processed 4,250,000 rows (6.65%)
‚úÖ Processed 4,500,000 rows (7.04%)
‚úÖ Processed 4,750,000 rows (7.44%)
‚úÖ Processed 5,000,000 rows (7.83%)
‚úÖ Processed 5,250,000 rows (8.22%)
‚úÖ Processed 5,500,000 rows (8.61%)
‚úÖ Processed 5,750,000 rows (9.00%)
‚úÖ Processed 6,000,000 rows (9.39%)
‚úÖ Processed 6,250,000 rows (9.78%)
‚úÖ Processed 6,500,000 

In [None]:
import pandas as pd
from pathlib import Path

# Input CSV to read, and the path the cleaned copy is written to.
INPUT_FILE = Path("itcont_2021_2022.csv")
OUTPUT_FILE = Path("itcont_2021_2022_cleaned.csv")

# Columns carried over to the output file.
KEEP_COLS = [
    # Committee link, transaction date (timeline analysis), dollar amount
    "CMTE_ID", "TRANSACTION_DT", "TRANSACTION_AMT",
    # Geography: state, city, ZIP code
    "STATE", "CITY", "ZIP_CODE",
    # Entity type (individual vs organization), transaction type (filter refunds)
    "ENTITY_TP", "TRANSACTION_TP",
    # Donor identification and profile
    "NAME", "EMPLOYER", "OCCUPATION",
    # Memo code (avoid double-counting) and unique record identifier
    "MEMO_CD", "SUB_ID",
]

# Count total rows (for progress)
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    Every physical line is counted, so for a CSV the header line is included
    in the result. The file is decoded as UTF-8 and undecodable bytes are
    ignored rather than raising.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the file to scan.

    Returns:
        int: Number of lines; ``0`` for an empty file.
    """
    # Pre-bind the counter: an empty file never enters the loop, and the
    # loop variable would otherwise be undefined at the return statement.
    line_count = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line_count, _ in enumerate(f, 1):
            pass
    return line_count

# Count physical lines up front; the figure is used only for progress display.
print("üîç Counting total rows...")
total_rows = count_lines(INPUT_FILE)
print(f"üìä Total rows: {total_rows:,}\n")

# count_lines() includes the CSV header line, which pandas never yields as a
# data row. Exclude it from the denominator so progress can reach 100%.
data_rows = max(total_rows - 1, 0)

# Read & process in chunks
chunksize = 250_000
rows_processed = 0

# The loop below appends, so remove any output left over from a previous run.
if OUTPUT_FILE.exists():
    OUTPUT_FILE.unlink()

# Only the first chunk written carries the header row.
write_header = True

# `usecols` makes the parser skip the columns that would be discarded anyway.
with pd.read_csv(INPUT_FILE, dtype=str, usecols=KEEP_COLS, chunksize=chunksize) as reader:
    for chunk in reader:
        # `usecols` keeps the file's column order; re-select to get KEEP_COLS
        # order, and copy so the assignments below act on an independent frame.
        chunk = chunk[KEEP_COLS].copy()

        # Strip surrounding whitespace from every text column
        for col in chunk.columns:
            if chunk[col].dtype == "string" or chunk[col].dtype == object:
                chunk[col] = chunk[col].astype("string").str.strip()

        # Convert numeric and date fields; unparseable values become NaN / NaT
        chunk["TRANSACTION_AMT"] = pd.to_numeric(chunk["TRANSACTION_AMT"], errors="coerce")
        # Source dates are parsed as MMDDYYYY strings
        chunk["TRANSACTION_DT"] = pd.to_datetime(chunk["TRANSACTION_DT"], format="%m%d%Y", errors="coerce")

        # Write cleaned data incrementally
        chunk.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
        write_header = False

        # Update and print progress
        rows_processed += len(chunk)
        pct = (rows_processed / data_rows) * 100 if data_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Cleaning complete! File saved as: {OUTPUT_FILE}")

üîç Counting total rows...
üìä Total rows: 64,885,981

‚úÖ Processed 250,000 rows (0.39%)
‚úÖ Processed 500,000 rows (0.77%)
‚úÖ Processed 750,000 rows (1.16%)
‚úÖ Processed 1,000,000 rows (1.54%)
‚úÖ Processed 1,250,000 rows (1.93%)
‚úÖ Processed 1,500,000 rows (2.31%)
‚úÖ Processed 1,750,000 rows (2.70%)
‚úÖ Processed 2,000,000 rows (3.08%)
‚úÖ Processed 2,250,000 rows (3.47%)
‚úÖ Processed 2,500,000 rows (3.85%)
‚úÖ Processed 2,750,000 rows (4.24%)
‚úÖ Processed 3,000,000 rows (4.62%)
‚úÖ Processed 3,250,000 rows (5.01%)
‚úÖ Processed 3,500,000 rows (5.39%)
‚úÖ Processed 3,750,000 rows (5.78%)
‚úÖ Processed 4,000,000 rows (6.16%)
‚úÖ Processed 4,250,000 rows (6.55%)
‚úÖ Processed 4,500,000 rows (6.94%)
‚úÖ Processed 4,750,000 rows (7.32%)
‚úÖ Processed 5,000,000 rows (7.71%)
‚úÖ Processed 5,250,000 rows (8.09%)
‚úÖ Processed 5,500,000 rows (8.48%)
‚úÖ Processed 5,750,000 rows (8.86%)
‚úÖ Processed 6,000,000 rows (9.25%)
‚úÖ Processed 6,250,000 rows (9.63%)
‚úÖ Processed 6,500,000 

In [4]:
import pandas as pd
from pathlib import Path

# Number of rows pulled into memory per chunk.
chunksize = 250_000

# Previously cleaned CSV to read, and the path of the new copy to write.
INPUT_FILE = Path("itcont_2021_2022_cleaned.csv")
OUTPUT_FILE = Path("itcont_2021_2022_cleaned_v2.csv")

# Count total rows for progress tracking
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    Every physical line is counted, so for a CSV the header line is included
    in the result. The file is decoded as UTF-8 and undecodable bytes are
    ignored rather than raising.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the file to scan.

    Returns:
        int: Number of lines; ``0`` for an empty file.
    """
    # Pre-bind the counter: an empty file never enters the loop, and the
    # loop variable would otherwise be undefined at the return statement.
    line_count = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line_count, _ in enumerate(f, 1):
            pass
    return line_count

# Count physical lines up front; the figure is used only for progress display.
print("üîç Counting total rows...")
total_rows = count_lines(INPUT_FILE)
print(f"üìä Total rows: {total_rows:,}\n")

# count_lines() includes the CSV header line, which pandas never yields as a
# data row. Exclude it from the denominator so progress can reach 100%.
data_rows = max(total_rows - 1, 0)

# The loop below appends, so remove any output left over from a previous run.
if OUTPUT_FILE.exists():
    OUTPUT_FILE.unlink()

rows_processed = 0

# Only the first chunk written carries the header row.
write_header = True

# Process file in chunks
with pd.read_csv(INPUT_FILE, dtype=str, chunksize=chunksize) as reader:
    for chunk in reader:
        # Drop CITY column if it exists
        if "CITY" in chunk.columns:
            chunk = chunk.drop(columns=["CITY"])

        # Strip surrounding whitespace from every text column
        for col in chunk.columns:
            if chunk[col].dtype == "string" or chunk[col].dtype == object:
                chunk[col] = chunk[col].astype("string").str.strip()

        # Convert numeric and date fields; unparseable values become NaN / NaT
        if "TRANSACTION_AMT" in chunk.columns:
            chunk["TRANSACTION_AMT"] = pd.to_numeric(chunk["TRANSACTION_AMT"], errors="coerce")
        if "TRANSACTION_DT" in chunk.columns:
            # INPUT_FILE is an already-cleaned extract. If its dates were
            # written from a datetime column they are ISO ``YYYY-MM-DD``
            # strings, and parsing those as ``%m%d%Y`` with errors="coerce"
            # would silently turn every date into NaT. Parse the ISO form
            # first and fall back to the raw MMDDYYYY form for anything left.
            raw_dates = chunk["TRANSACTION_DT"]
            iso_dates = pd.to_datetime(raw_dates, format="%Y-%m-%d", errors="coerce")
            legacy_dates = pd.to_datetime(raw_dates, format="%m%d%Y", errors="coerce")
            chunk["TRANSACTION_DT"] = iso_dates.fillna(legacy_dates)

        # Append to new file
        chunk.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
        write_header = False

        # Update and print progress
        rows_processed += len(chunk)
        pct = (rows_processed / data_rows) * 100 if data_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Done! 'CITY' column removed and file saved as: {OUTPUT_FILE}")


üîç Counting total rows...
üìä Total rows: 64,885,981

‚úÖ Processed 250,000 rows (0.39%)
‚úÖ Processed 500,000 rows (0.77%)
‚úÖ Processed 750,000 rows (1.16%)
‚úÖ Processed 1,000,000 rows (1.54%)
‚úÖ Processed 1,250,000 rows (1.93%)
‚úÖ Processed 1,500,000 rows (2.31%)
‚úÖ Processed 1,750,000 rows (2.70%)
‚úÖ Processed 2,000,000 rows (3.08%)
‚úÖ Processed 2,250,000 rows (3.47%)
‚úÖ Processed 2,500,000 rows (3.85%)
‚úÖ Processed 2,750,000 rows (4.24%)
‚úÖ Processed 3,000,000 rows (4.62%)
‚úÖ Processed 3,250,000 rows (5.01%)
‚úÖ Processed 3,500,000 rows (5.39%)
‚úÖ Processed 3,750,000 rows (5.78%)
‚úÖ Processed 4,000,000 rows (6.16%)
‚úÖ Processed 4,250,000 rows (6.55%)
‚úÖ Processed 4,500,000 rows (6.94%)
‚úÖ Processed 4,750,000 rows (7.32%)
‚úÖ Processed 5,000,000 rows (7.71%)
‚úÖ Processed 5,250,000 rows (8.09%)
‚úÖ Processed 5,500,000 rows (8.48%)
‚úÖ Processed 5,750,000 rows (8.86%)
‚úÖ Processed 6,000,000 rows (9.25%)
‚úÖ Processed 6,250,000 rows (9.63%)
‚úÖ Processed 6,500,000 

In [1]:
#(MASTER 2019-2020)
import pandas as pd
from pathlib import Path

# Input CSV to read, and the path the column-filtered copy is written to.
INPUT_FILE = Path("itcont_2019_2020.csv")
OUTPUT_FILE = Path("itcont_2019_2020_filtered.csv")

# Columns carried over to the output file.
KEEP_COLS = [
    # Committee link, transaction date (timeline analysis), dollar amount
    "CMTE_ID", "TRANSACTION_DT", "TRANSACTION_AMT",
    # Geography: state, city, ZIP code
    "STATE", "CITY", "ZIP_CODE",
    # Entity type (individual vs organization), transaction type (filter refunds)
    "ENTITY_TP", "TRANSACTION_TP",
    # Donor identification and profile
    "NAME", "EMPLOYER", "OCCUPATION",
    # Memo code (avoid double-counting) and unique record identifier
    "MEMO_CD", "SUB_ID",
]

# Count total rows (for progress display)
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    Every physical line is counted, so for a CSV the header line is included
    in the result. The file is decoded as UTF-8 and undecodable bytes are
    ignored rather than raising.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the file to scan.

    Returns:
        int: Number of lines; ``0`` for an empty file.
    """
    # Pre-bind the counter: an empty file never enters the loop, and the
    # loop variable would otherwise be undefined at the return statement.
    line_count = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line_count, _ in enumerate(f, 1):
            pass
    return line_count

# Count physical lines up front; the figure is used only for progress display.
print("üîç Counting total rows...")
total_rows = count_lines(INPUT_FILE)
print(f"üìä Total rows: {total_rows:,}\n")

# count_lines() includes the CSV header line, which pandas never yields as a
# data row. Exclude it from the denominator so progress can reach 100%.
data_rows = max(total_rows - 1, 0)

# Read in chunks
chunksize = 250_000
rows_processed = 0

# The loop below appends, so remove any output left over from a previous run.
if OUTPUT_FILE.exists():
    OUTPUT_FILE.unlink()

# Only the first chunk written carries the header row.
write_header = True

# Stream through the file chunk-by-chunk. `usecols` makes the parser skip the
# columns that would be discarded anyway, and the context manager closes the
# underlying file handle even if a chunk fails part-way through.
with pd.read_csv(
    INPUT_FILE,
    dtype=str,
    usecols=KEEP_COLS,
    chunksize=chunksize,
    low_memory=False,
) as reader:
    for chunk in reader:
        # `usecols` keeps the file's column order; re-select to get KEEP_COLS order.
        chunk = chunk[KEEP_COLS]

        # Append to new CSV
        chunk.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
        write_header = False

        # Progress tracking
        rows_processed += len(chunk)
        pct = (rows_processed / data_rows) * 100 if data_rows else 0.0
        print(f"‚úÖ Processed {rows_processed:,} rows ({pct:.2f}%)")

print(f"\nüéâ Done! File saved as: {OUTPUT_FILE}")


üîç Counting total rows...
üìä Total rows: 69,377,426

‚úÖ Processed 250,000 rows (0.36%)
‚úÖ Processed 500,000 rows (0.72%)
‚úÖ Processed 750,000 rows (1.08%)
‚úÖ Processed 1,000,000 rows (1.44%)
‚úÖ Processed 1,250,000 rows (1.80%)
‚úÖ Processed 1,500,000 rows (2.16%)
‚úÖ Processed 1,750,000 rows (2.52%)
‚úÖ Processed 2,000,000 rows (2.88%)
‚úÖ Processed 2,250,000 rows (3.24%)
‚úÖ Processed 2,500,000 rows (3.60%)
‚úÖ Processed 2,750,000 rows (3.96%)
‚úÖ Processed 3,000,000 rows (4.32%)
‚úÖ Processed 3,250,000 rows (4.68%)
‚úÖ Processed 3,500,000 rows (5.04%)
‚úÖ Processed 3,750,000 rows (5.41%)
‚úÖ Processed 4,000,000 rows (5.77%)
‚úÖ Processed 4,250,000 rows (6.13%)
‚úÖ Processed 4,500,000 rows (6.49%)
‚úÖ Processed 4,750,000 rows (6.85%)
‚úÖ Processed 5,000,000 rows (7.21%)
‚úÖ Processed 5,250,000 rows (7.57%)
‚úÖ Processed 5,500,000 rows (7.93%)
‚úÖ Processed 5,750,000 rows (8.29%)
‚úÖ Processed 6,000,000 rows (8.65%)
‚úÖ Processed 6,250,000 rows (9.01%)
‚úÖ Processed 6,500,000 