## PDS Group 7

In [19]:
import pandas as pd
import numpy as np
import zipfile
import warnings

pd.set_option("mode.copy_on_write", True)

In [None]:
# --- Run configuration -------------------------------------------------------

# Archive holding the ARCOS extract, and the name of the TSV member inside it
zip_path = "/Users/far/Downloads/arcos_all.zip"
inner_name = "arcos_all.tsv"

# Rows handed back per chunk by the reader. The file is streamed in pieces so
# it never has to fit in memory at once; 200K (rather than 100K) keeps the
# total number of chunks lower.
chunk_size = 200_000

# The only source columns the aggregation below relies on
needed_cols = [
    "BUYER_STATE",
    "BUYER_COUNTY",
    "TRANSACTION_DATE",
    "CALC_BASE_WT_IN_GM",
]

# Silence pandas dtype warnings so the progress lines stay readable
warnings.filterwarnings(action="ignore", category=pd.errors.DtypeWarning)

# Count chunks first so we can show accurate progress percentages during
# processing. Only the number of rows matters in this pass, so the parser is
# restricted to a single column (usecols); the chunk count is unchanged, but
# the other columns are never converted or held in memory.
with zipfile.ZipFile(zip_path) as z, z.open(inner_name) as f:
    reader = pd.read_csv(
        f, sep="\t", usecols=[needed_cols[0]], chunksize=chunk_size
    )
    total_chunks = sum(1 for _ in reader)

print(f"Total chunks in file: {total_chunks}")


# Per-chunk (state, county, year) subtotals; merged into one table after the loop
partial_results = []

# Raw ARCOS column names -> short names used by the aggregation.
# Built once here because it is the same for every chunk.
column_renames = {
    "BUYER_STATE": "state",
    "BUYER_COUNTY": "county_name",
    "TRANSACTION_DATE": "transaction_date",
    "CALC_BASE_WT_IN_GM": "opioid_grams",
}

with zipfile.ZipFile(zip_path) as z, z.open(inner_name) as f:
    # usecols makes the parser itself discard every column we do not need, so
    # each chunk only ever contains the four columns in needed_cols instead of
    # the full row being parsed and then subset afterwards.
    reader = pd.read_csv(
        f, sep="\t", usecols=needed_cols, chunksize=chunk_size
    )

    for i, chunk in enumerate(reader, start=1):

        # Show periodic progress updates to monitor long-running process
        if i % 20 == 0 or i == 1:
            progress = i / total_chunks * 100
            print(
                f"Processing chunk {i}/{total_chunks}  |  {progress:.2f}% completed"
            )

        # Do all transformations (rename, extract year, aggregate) within each
        # chunk to avoid creating another large dataset (~8GB) in memory.
        # rename() returns a new frame, so the chunk itself is not modified.
        df = chunk.rename(columns=column_renames)

        # NOTE(review): this relies on pandas inferring the date format of
        # TRANSACTION_DATE. If the column is stored as integers (e.g.
        # MMDDYYYY) rather than date strings, to_datetime would interpret them
        # as epoch offsets - confirm against the file and pass format= if so.
        df["year"] = pd.to_datetime(df["transaction_date"]).dt.year

        # NOTE(review): groupby drops rows whose key is NaN by default, so
        # transactions with a missing state/county are excluded from the
        # totals - confirm that is intended.
        grouped = df.groupby(["state", "county_name", "year"], as_index=False)[
            "opioid_grams"
        ].sum()

        partial_results.append(grouped)

print("Combining chunk summaries...")

# Stack the per-chunk subtotals into a single frame
all_data = pd.concat(partial_results, ignore_index=True)

# A given state/county/year can show up in several chunks, so the subtotals
# have to be summed once more to get one row per combination
arcos_by_county_year = (
    all_data
    .groupby(["state", "county_name", "year"], as_index=False)["opioid_grams"]
    .sum()
)

# Persist the result so later steps can load it without re-reading the archive
output_path = "arcos_by_county_year.csv"
arcos_by_county_year.to_csv(output_path, index=False)

print("Done! Saved:", output_path)
print(arcos_by_county_year.head())

Total chunks in file: 3800
Processing chunk 1/3800  |  0.03% completed
Processing chunk 20/3800  |  0.53% completed
Processing chunk 40/3800  |  1.05% completed
Processing chunk 60/3800  |  1.58% completed
Processing chunk 80/3800  |  2.11% completed
Processing chunk 100/3800  |  2.63% completed
Processing chunk 120/3800  |  3.16% completed
Processing chunk 140/3800  |  3.68% completed
Processing chunk 160/3800  |  4.21% completed
Processing chunk 180/3800  |  4.74% completed
Processing chunk 200/3800  |  5.26% completed
Processing chunk 220/3800  |  5.79% completed
Processing chunk 240/3800  |  6.32% completed
Processing chunk 260/3800  |  6.84% completed
Processing chunk 280/3800  |  7.37% completed
Processing chunk 300/3800  |  7.89% completed
Processing chunk 320/3800  |  8.42% completed
Processing chunk 340/3800  |  8.95% completed
Processing chunk 360/3800  |  9.47% completed
Processing chunk 380/3800  |  10.00% completed
Processing chunk 400/3800  |  10.53% completed
Processing 