In [1]:
import io
from google.cloud import storage

def load_file_from_gcs(bucket_name, file_path):
    """Fetch a single object from a GCS bucket into an in-memory buffer.

    Args:
        bucket_name (str): Name of the GCS bucket.
        file_path (str): Path of the object inside the bucket.

    Returns:
        tuple: ``(buffer, name)`` where ``buffer`` is an ``io.BytesIO``
        holding the downloaded bytes and ``name`` is the blob's ``name``.

    Raises:
        FileNotFoundError: If ``blob.exists()`` reports the object is absent.
    """
    target = storage.Client().bucket(bucket_name).blob(file_path)

    # Fail with a readable gs:// URI instead of attempting the download.
    if not target.exists():
        raise FileNotFoundError(f"File not found: gs://{bucket_name}/{file_path}")

    print(f"Loading file: {target.name}")

    # The entire object is held in memory; nothing is written to disk.
    return io.BytesIO(target.download_as_bytes()), target.name

In [2]:
from google.cloud import storage

def list_blobs(bucket_name, suffix=None):
    """List the contents of a bucket, optionally filtered by name suffix.

    Args:
        bucket_name (str): Name of the GCS bucket.
        suffix (str or list): File suffix(es) to keep (e.g. '.jsonl', '.npy').
            A falsy value (``None``, ``''``, ``[]``) disables filtering.

    Returns:
        list: When ``suffix`` is given, the *names* (``str``) of the blobs
        whose name ends with any of the suffixes. When no suffix is given,
        the blob objects themselves, as yielded by ``bucket.list_blobs()``.
    """
    bucket = storage.Client().bucket(bucket_name)
    every_blob = bucket.list_blobs()

    # Unfiltered listing: hand back the blob objects as-is.
    if not suffix:
        return list(every_blob)

    # Normalise a lone suffix so both call styles share one code path.
    wanted = [suffix] if isinstance(suffix, str) else suffix
    return [
        entry.name
        for entry in every_blob
        if any(entry.name.endswith(ending) for ending in wanted)
    ]

In [3]:
# Names of every object in the "proposition-vectors" bucket that ends in ".npy".
bucket_name = "proposition-vectors"
props = list_blobs(bucket_name, suffix=".npy")

In [4]:
# Sample the blob names at positions 160, 320, ..., 1120 of `props` and
# rebuild an id from each one.
# NOTE(review): assumes every name has the form "<prefix>_<p1>-<p2>-...": the
# id is taken from the second "_"-separated field — confirm against the bucket.
start_ids = []
for i in range(1, 8):
    # `id_parts` (not `id`) so the builtin id() is not shadowed in the kernel.
    id_parts = props[160 * i].split("_")[1].split("-")
    # Keep only the first five "-"-separated components of the field.
    start_ids.append("-".join(id_parts[:5]))

start_ids

['enwiki-00859626-0002-0001-0006',
 'enwiki-01587902-0000-0000-0002',
 'enwiki-02315769-0000-0000-0001',
 'enwiki-03013027-0007-0000-0006',
 'enwiki-03823541-0002-0000-0002',
 'enwiki-04586442-0027-0000-0002',
 'enwiki-05324078-0001-0000-0000']

In [5]:
import json
from tqdm import tqdm as tqdm
def load_jsonl(bucket_name: str, file_no: int) -> list[dict]:
    """Download one ``docs-NNNN_of_1000.jsonl`` shard and parse it.

    Args:
        bucket_name: GCS bucket that holds the dataset snapshot.
        file_no: Shard number; zero-padded to four digits in the object path.

    Returns:
        The parsed JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        Any exception raised by ``load_file_from_gcs`` is propagated.
    """
    file_name = f'datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-{file_no:04d}_of_1000.jsonl'

    file_obj, _ = load_file_from_gcs(bucket_name, file_path=file_name)

    text = file_obj.getvalue().decode("utf-8")

    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029,
    # U+0085, \x0b, \x0c, ... which may legally appear unescaped inside a JSON
    # string and would cut a record in two. Blank lines (e.g. the one after a
    # trailing newline) are skipped rather than fed to json.loads.
    return [json.loads(line) for line in text.split("\n") if line.strip()]

# Map each id in `start_ids` to where it lives: id -> (shard number, line index).
id_map = {}
# Set membership is O(1) per record; ids are removed as they are located.
remaining_ids = set(start_ids)
for file_no in tqdm(range(1, 1001)):
    # Stop downloading shards once every requested id has been found.
    # NOTE(review): this assumes each id occurs at most once across all shards.
    # If ids can repeat, the previous full scan kept the last occurrence —
    # confirm before relying on the early exit.
    if not remaining_ids:
        break

    data_list = load_jsonl(bucket_name, file_no)

    for line_no, data in enumerate(data_list):
        if data["id"] in remaining_ids:
            id_map[data["id"]] = (file_no, line_no)
            remaining_ids.discard(data["id"])

Loading file: datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-0001_of_1000.jsonl
Loading file: datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-0002_of_1000.jsonl
Loading file: datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-0003_of_1000.jsonl


KeyboardInterrupt: 