# Exploración de datos

## Librerías

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from PyPDF2 import PdfReader
from pathlib import Path


import sys
root = Path().resolve().parent
sys.path.append(str(root))

from src.budget_buddy.utils.io import ensureDirs

In [None]:
# Read the PDF manifest CSV into `df` and preview its first five rows.
manifest_path = Path("../data/processed/manifest_pdfs.csv")
df = pd.read_csv(filepath_or_buffer=manifest_path)
df.head(n=5)

## Total de PDFs

In [None]:
# Manifest row count next to the number of distinct PDF file names; a gap
# between the two values means some file names appear more than once.
(df.shape[0], df["pdf_filename"].nunique())

## Conteo por año/bloque

In [None]:
# Distinct PDF file names per manifest year, as a two-column frame
# (year, n_pdfs).
(
    df.groupby(by="year")["pdf_filename"]
    .nunique()
    .reset_index(name="n_pdfs")
)


In [None]:
# Distinct PDF file names for every (year, block) pair, ordered by year and
# then by block.
(
    df.groupby(by=["year", "block"])["pdf_filename"]
    .nunique()
    .reset_index(name="n_pdfs")
    .sort_values(by=["year", "block"])
)


## Tamaños de archivo

In [None]:
# File size converted from bytes to MB (1 MB taken as 1024 * 1024 bytes),
# stored as a new column and then summarised.
df["size_mb"] = df["size_bytes"] / 1024 ** 2
df["size_mb"].describe()

In [None]:
# Histogram of PDF sizes in 50 bins. The axes object returned by `hist` is
# labelled directly rather than through pyplot's current-axes helpers.
df["size_mb"].hist(bins=50).set(xlabel="tamaño (MB)", ylabel="n_pdfs")
plt.show()

## Duplicados

In [None]:
# Read the duplicates manifest CSV into `dups` and preview its first five rows.
dups_path = Path("../data/processed/manifest_duplicates.csv")
dups = pd.read_csv(filepath_or_buffer=dups_path)
dups.head(n=5)

In [None]:
# The 20 file names that occur most often in the duplicates manifest.
(
    dups["pdf_filename"]
    .value_counts()
    .head(n=20)
)

In [None]:
# The 20 most repeated content hashes, shown only when the duplicates manifest
# has a "sha256" column. This is written as a conditional *expression* because
# a notebook auto-displays only a top-level expression that ends the cell; the
# same call nested inside an `if` statement is evaluated and then discarded,
# so the cell would show nothing. A `None` result produces no output.
dups["sha256"].value_counts().head(20) if "sha256" in dups.columns else None

## Conteo por categoría

In [None]:
# Read the categories CSV into `cats` and preview its first five rows.
cats_path = Path("../data/processed/categories.csv")
cats = pd.read_csv(filepath_or_buffer=cats_path)
cats.head(n=5)

In [None]:
# Distinct PDFs per category, using only the rows whose "missing" column
# equals 0, listed from the most to the least populated category.
(
    cats.loc[cats["missing"] == 0]
    .groupby(by="category")["pdf_filename"]
    .nunique()
    .reset_index(name="n_pdfs")
    .sort_values(by="n_pdfs", ascending=False)
)

## Cruces de datos

In [None]:
# Keep the category rows whose "missing" column equals 0 (as an independent
# copy) and attach each PDF's manifest year by joining on its path.
valid = cats.loc[cats["missing"] == 0].copy()
cats_year = pd.merge(
    valid,
    df[["pdf_path", "year"]],
    how="left",
    on="pdf_path",
)

# Distinct PDFs for every (year, category) pair, ordered by both keys.
# NOTE(review): with the left join, paths absent from the manifest get a
# missing year; groupby's default handling presumably drops them from this
# table — confirm that is intended.
(
    cats_year.groupby(by=["year", "category"])["pdf_filename"]
    .nunique()
    .reset_index(name="n_pdfs")
    .sort_values(by=["year", "category"])
)

In [None]:
# Destination folder for exported tables.
# NOTE(review): this path is relative to the working directory and lacks the
# "../" prefix used by the data paths in this notebook — confirm the intended
# output location.
tables_dir = Path("outputs/tables")
# Project helper; presumably creates the listed folders if missing — confirm.
ensureDirs([tables_dir])

# Distinct PDFs per (year, category), ordered by both keys.
cats_year_counts = (
    cats_year.groupby(by=["year", "category"])["pdf_filename"]
    .nunique()
    .reset_index(name="n_pdfs")
    .sort_values(by=["year", "category"])
)

# Saved without the index column.
cats_year_counts.to_csv(tables_dir / "cats_year_counts.csv", index=False)

## Exploración por PDF

In [None]:
def countPages(pdf_path):
    """Return the number of pages in the PDF at *pdf_path*.

    The path is converted to ``str`` before being handed to ``PdfReader``.

    Returns:
        The page count, or ``None`` when opening or reading the file raises
        any ``Exception``.
    """
    try:
        page_total = len(PdfReader(str(pdf_path)).pages)
    except Exception:
        # Best-effort on purpose: a file that cannot be read yields None
        # instead of interrupting the caller.
        return None
    else:
        return page_total

# Page count for every manifest entry (None where the PDF could not be read),
# stored as a new column and then summarised.
df["n_pages"] = df["pdf_path"].apply(
    lambda raw_path: countPages(Path(raw_path))
)
df["n_pages"].describe()