In [12]:
%pip install stream_unzip

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import requests
from stream_unzip import stream_unzip
import io
import csv
import zipfile
import struct
import zlib
import os

url = "https://hosted-datasets.gbif.org/eBird/2024-eBird-dwca-1.0.zip"

In [14]:
with requests.get(url, stream=True) as r:
    try:
        for filename, filesize, file_stream in stream_unzip(r.raw):
            print(filename, filesize)
    except Exception as e:
        print("ERROR:", type(e).__name__, e)

b'meta.xml' 2493
ERROR: UnfinishedIterationError 


In [15]:
r = requests.head(url, allow_redirects=True) # Para saber si acepta la descarga en stream
print("Status:", r.status_code)
print("Content-Type:", r.headers.get("Content-Type"))
print("Accept-Ranges:", r.headers.get("Accept-Ranges"))
print("Content-Encoding:", r.headers.get("Content-Encoding"))

Status: 200
Content-Type: application/zip
Accept-Ranges: bytes
Content-Encoding: None


In [16]:
r = requests.get(url, headers={"Range": "bytes=-5242880"})
open("tail.zip", "wb").write(r.content)
print(len(r.content), "bytes fetched")

5242880 bytes fetched


In [17]:
with open("tail.zip", "rb") as f:
    data = f.read()

# Look for central directory signatures
pos = data.find(b"PK\x01\x02")
entries = []
while pos != -1:
    fn_len = int.from_bytes(data[pos+28:pos+30], "little")
    extra_len = int.from_bytes(data[pos+30:pos+32], "little")
    comment_len = int.from_bytes(data[pos+32:pos+34], "little")
    filename = data[pos+46:pos+46+fn_len].decode(errors="ignore")
    header_offset = int.from_bytes(data[pos+42:pos+46], "little")
    entries.append((filename, header_offset))
    pos = data.find(b"PK\x01\x02", pos+1)

print("Found entries:", [e[0] for e in entries])

Found entries: ['meta.xml', 'eml.xml', 'eod_2024.csv']


In [18]:
with open("tail.zip", "rb") as f:
    data = f.read()

entries = []
pos = data.find(b"PK\x01\x02")
while pos != -1:
    # Read key fields
    comp_size = int.from_bytes(data[pos+20:pos+24], "little")
    uncomp_size = int.from_bytes(data[pos+24:pos+28], "little")
    fn_len = int.from_bytes(data[pos+28:pos+30], "little")
    extra_len = int.from_bytes(data[pos+30:pos+32], "little")
    comment_len = int.from_bytes(data[pos+32:pos+34], "little")
    header_offset = int.from_bytes(data[pos+42:pos+46], "little")
    filename = data[pos+46:pos+46+fn_len].decode(errors="ignore")
    entries.append({
        "filename": filename,
        "offset": header_offset,
        "comp_size": comp_size,
        "uncomp_size": uncomp_size
    })
    pos = data.find(b"PK\x01\x02", pos + 1)

# Find your CSV entry
csv_entry = next(e for e in entries if e["filename"].endswith("eod_2024.csv"))
file_path = "tail.zip"

if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} has been deleted.")
else:
    print(f"{file_path} does not exist.")
print(csv_entry)

tail.zip has been deleted.
{'filename': 'eod_2024.csv', 'offset': 3377, 'comp_size': 4294967295, 'uncomp_size': 4294967295}


In [19]:
r = requests.get(url, headers={"Range": "bytes=3377-6000"})
data = r.content

# Local file header starts with PK\x03\x04
lfh_sig = data.find(b"PK\x03\x04")
comp_method = int.from_bytes(data[lfh_sig+8:lfh_sig+10], "little")
fn_len = int.from_bytes(data[lfh_sig+26:lfh_sig+28], "little")
extra_len = int.from_bytes(data[lfh_sig+28:lfh_sig+30], "little")

extra = data[lfh_sig+30+fn_len : lfh_sig+30+fn_len+extra_len]
# Parse ZIP64 extra field (0x0001)
if b"\x01\x00" in extra:
    i = extra.index(b"\x01\x00")
    real_uncomp, real_comp = struct.unpack_from("<QQ", extra, i+4)
    print("Real sizes:", real_comp, real_uncomp)


Real sizes: 71515376935 556156633708


In [None]:
output_csv = "..\data\raw\ebirdSonora.csv"

local_header_offset = csv_entry['offset']
compressed_size = real_comp  
chunk_size = 100_000_000       # 100 MB chunks

# -----------------------------
# Step 2: Helper to parse local header
# -----------------------------
def parse_local_file_header(data):
    assert data[:4] == b'PK\x03\x04'
    comp_method = int.from_bytes(data[8:10], "little")
    fn_len = int.from_bytes(data[26:28], "little")
    extra_len = int.from_bytes(data[28:30], "little")
    header_len = 30 + fn_len + extra_len
    return header_len, comp_method

# -----------------------------
# Step 3: Download local header first
# -----------------------------
r = requests.get(url, headers={"Range": f"bytes={local_header_offset}-{local_header_offset + 1024}"})
data = r.content
header_len, comp_method = parse_local_file_header(data)

if comp_method != 8:
    raise NotImplementedError("Only deflate-compressed files are supported")

compressed_start = local_header_offset + header_len
decompressor = zlib.decompressobj(-zlib.MAX_WBITS)  # raw DEFLATE
buffer = b""

# -----------------------------
# Step 4: Open output CSV
# -----------------------------
with open(output_csv, "w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout)

    # -----------------------------
    # Step 5: Stream compressed data in chunks
    # -----------------------------
    for start in range(compressed_start, compressed_start + compressed_size, chunk_size):
        end = min(start + chunk_size - 1, compressed_start + compressed_size - 1)
        headers = {"Range": f"bytes={start}-{end}"}
        r = requests.get(url, headers=headers, stream=True)

        for chunk in r.iter_content(8192):
            if not chunk:
                continue
            decompressed = decompressor.decompress(chunk)
            buffer += decompressed

            # Split complete lines
            while b"\n" in buffer:
                line, buffer = buffer.split(b"\n", 1)
                row = line.decode("utf-8").split(",")

                # Filter by row[11] == 'Sonora'
                if len(row) > 11 and row[11] == "Sonora":
                    writer.writerow(row)

    # Process any remaining data
    if buffer:
        row = buffer.decode("utf-8").split(",")
        if len(row) > 11 and row[11] == "Sonora":
            writer.writerow(row)

print(f"Filtered rows written to {output_csv}")


  output_csv = "..\data\interim\ebirdSonora.csv"
  output_csv = "..\data\interim\ebirdSonora.csv"


KeyboardInterrupt: 

In [None]:
zip_file = "..\data\raw\ebirdSonora.zip"

# Create a zip file containing the CSV
with zipfile.ZipFile(zip_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write(output_csv, arcname=output_csv)  # arcname keeps the filename inside the zip

print(f"{output_csv} has been zipped into {zip_file}")

..\data\interim\ebirdSonora.csv has been zipped into ..\data\interim\ebirdSonora.zip


  zip_file = "..\data\interim\ebirdSonora.zip"


In [24]:
if os.path.exists(output_csv):
    os.remove(output_csv)
    print(f"{output_csv} has been deleted after zipping.")

..\data\interim\ebirdSonora.csv has been deleted after zipping.
