In [12]:
# Parameters cell (papermill picks this up and can override the values below)
PROJECT_ID = "nice-proposal-467718-q6"
REGION = "us-west1"
BRONZE_PATH = "gs://meu-bucket-premier/bronze/"
RUN_TS = None  # auto-set (to a UTC timestamp string) by the bootstrap cell if None

# Kaggle dataset in the form "owner/dataset"
KAGGLE_DATASET = "hugomathien/soccer"   # <- adjust to your own dataset
# Optionally filter which files to upload (None = all); patterns are fnmatch-style globs matched against file names
KAGGLE_FILES_FILTER = None              # e.g., ["*.csv"] or ["database.sqlite"]


In [13]:
# bootstrap: dependencies + sanity checks
import sys, os, shutil, fnmatch, subprocess
from pathlib import Path
import datetime as dt

# Install packages that may be missing from the kernel's environment
def ensure(pkg, pip_name=None):
    """Import ``pkg``; if it is not installed, pip-install it into this interpreter's environment.

    Args:
        pkg: Dotted module name to try importing (e.g. ``"google.cloud.storage"``).
        pip_name: Distribution name handed to pip when it differs from ``pkg``;
            defaults to ``pkg``.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
    """
    import importlib  # local import: the notebook's import cell does not bring it in

    try:
        importlib.import_module(pkg)
    except ModuleNotFoundError:
        pip = pip_name or pkg
        print(f"Instalando {pip} ...")
        # Use the running interpreter so the package lands in the kernel's own environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", pip])
        # The import system caches directory listings; without invalidating them a
        # package installed while the kernel is running may not be found by later imports.
        importlib.invalidate_caches()

# Logical run timestamp (UTC); generated only when the parameters cell left it unset.
if RUN_TS is None:
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to exactly the same string.
    RUN_TS = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
print("RUN_TS =", RUN_TS)

# Kaggle API token.
# Checked BEFORE importing the kaggle package: the kaggle client attempts to
# authenticate at import time, so with a missing token the import itself would
# fail before this clearer error could be raised.
# NOTE(review): confirm the import-time authentication for the installed kaggle version.
home = Path.home()
kaggle_json = home / ".kaggle" / "kaggle.json"
if not kaggle_json.exists():
    raise RuntimeError("kaggle.json não encontrado em ~/.kaggle/kaggle.json — suba o arquivo e rode: chmod 600 ~/.kaggle/kaggle.json")
# Restrict the token file to owner read/write.
os.chmod(kaggle_json, 0o600)
print("kaggle.json OK")

# Make sure the third-party packages used by the following cells are importable.
ensure("kaggle")
ensure("google.cloud.storage", "google-cloud-storage")


RUN_TS = 20250815-153852
kaggle.json OK


In [14]:
# Download the Kaggle dataset and extract it under /tmp/kaggle_dl
workdir = Path("/tmp/kaggle_dl")
# Start from an empty staging directory so files from a previous run are not picked up.
if workdir.exists():
    shutil.rmtree(workdir)
workdir.mkdir(parents=True, exist_ok=True)

print("Baixando dataset:", KAGGLE_DATASET)
download_cmd = [
    "kaggle", "datasets", "download",
    "-d", KAGGLE_DATASET,
    "-p", str(workdir),
    "-q",
]
subprocess.check_call(download_cmd)

# Extract every downloaded archive in place, then delete the archive itself.
for archive in workdir.glob("*.zip"):
    shutil.unpack_archive(str(archive), extract_dir=str(workdir))
    archive.unlink()

# Collect every regular file found anywhere under the staging directory.
files = list(filter(Path.is_file, workdir.rglob("*")))
print(f"{len(files)} arquivos baixados do Kaggle.")


Baixando dataset: hugomathien/soccer
Dataset URL: https://www.kaggle.com/datasets/hugomathien/soccer
License(s): ODbL-1.0
1 arquivos baixados do Kaggle.


In [15]:
# Optional file filter: keep only files whose name matches one of the glob patterns
if KAGGLE_FILES_FILTER:
    # A bare string (easy to pass through papermill) is treated as ONE pattern;
    # iterating it would match character by character and make the filter meaningless.
    if isinstance(KAGGLE_FILES_FILTER, str):
        patterns = [KAGGLE_FILES_FILTER]
    else:
        patterns = list(KAGGLE_FILES_FILTER)
    # dict.fromkeys drops duplicates (a file matching several patterns) while keeping
    # first-match order: pattern order first, then the existing order of `files`.
    files = list(dict.fromkeys(
        p for pattern in patterns for p in files if fnmatch.fnmatch(p.name, pattern)
    ))
    print(f"Após filtro {KAGGLE_FILES_FILTER}: {len(files)} arquivos.")


In [16]:
# Upload to GCS (Bronze layer)
# Imported in this cell rather than the import cell: the bootstrap cell's
# ensure(...) call may only just have installed google-cloud-storage.
from google.cloud import storage

client = storage.Client(project=PROJECT_ID)

# Split "gs://<bucket>/<prefix>/" into the bucket name and the object prefix.
# Only a leading scheme is removed, and slashes are stripped from both ends of the
# prefix so a doubled slash cannot produce object names that start with "/".
bronze_location = BRONZE_PATH[len("gs://"):] if BRONZE_PATH.startswith("gs://") else BRONZE_PATH
bucket_name, raw_prefix = bronze_location.partition("/")[::2]
prefix = raw_prefix.strip("/")
if not bucket_name:
    raise ValueError(f"BRONZE_PATH has no bucket name: {BRONZE_PATH!r}")

# Destination layout: <prefix>/<RUN_TS>/<owner>_<dataset>/<path relative to staging dir>
dataset_tag = KAGGLE_DATASET.replace("/", "_")
gcs_prefix = f"{prefix}/{RUN_TS}/{dataset_tag}" if prefix else f"{RUN_TS}/{dataset_tag}"
bucket = client.bucket(bucket_name)

uploaded = 0
for p in files:
    # Relative to the staging directory defined by the download cell, instead of
    # repeating its literal path here, so the two cells cannot drift apart.
    rel = p.relative_to(workdir).as_posix()
    blob_path = f"{gcs_prefix}/{rel}"
    bucket.blob(blob_path).upload_from_filename(str(p))
    uploaded += 1
    # Progress line every 10 files.
    if uploaded % 10 == 0:
        print(f"↑ {uploaded} arquivos...")

print(f"Upload concluído: {uploaded} arquivos para gs://{bucket_name}/{gcs_prefix}/")


Upload concluído: 1 arquivos para gs://meu-bucket-premier/bronze/20250815-153852/hugomathien_soccer/
