In [1]:
import io
from google.cloud import storage

def load_file_from_gcs(bucket_name, file_path):
    # Initialize GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Get the blob
    blob = bucket.blob(file_path)

    if not blob.exists():
        raise FileNotFoundError(f"File not found: gs://{bucket_name}/{file_path}")

    # print(f"Loading file: {blob.name}")

    # Download the file as bytes into memory
    data_bytes = blob.download_as_bytes()
    file_obj = io.BytesIO(data_bytes)

    return file_obj, blob.name

In [2]:
from google.cloud import storage

def list_blobs(bucket_name, suffix=None):
    """Lists all the blobs in the bucket with optional suffix filtering.
    
    Args:
        bucket_name (str): Name of the GCS bucket
        suffix (str or list): File suffix(es) to filter by (e.g. '.jsonl', '.npy')
        
    Returns:
        list: List of blob objects matching the suffix criteria
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    blobs = bucket.list_blobs()
    
    if suffix:
        if isinstance(suffix, str):
            suffix = [suffix]
        filtered_blobs = [blob.name for blob in blobs if any(blob.name.endswith(s) for s in suffix)]
        return filtered_blobs
    
    return list(blobs)

In [3]:
bucket_name = "proposition-vectors"
props = list_blobs(bucket_name,".npy")

In [4]:
start_ids = []
for i in range(1,8):
    id = props[160*i].split("_")[1].split("-")
    start_ids.append("".join([elem + "-" for elem in id[:5]])[:-1])

start_ids

['enwiki-00859626-0002-0001-0006',
 'enwiki-01587902-0000-0000-0002',
 'enwiki-02315769-0000-0000-0001',
 'enwiki-03013027-0007-0000-0006',
 'enwiki-03823541-0002-0000-0002',
 'enwiki-04586442-0027-0000-0002',
 'enwiki-05324078-0001-0000-0000']

In [5]:
import json
from tqdm import tqdm as tqdm
def load_jsonl(bucket_name:str,file_no:int)->list[dict]:
    file_name = f'datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-{file_no:04d}_of_1000.jsonl'

    file_obj, file_name = load_file_from_gcs(bucket_name,file_path = file_name)

    text = file_obj.getvalue().decode("utf-8")

    data_list = [json.loads(line) for line in text.splitlines()]

    return data_list 

id_map = {}
for file_no in tqdm(range(1,1001)):

    data_list = load_jsonl(bucket_name,file_no)

    for line_no,data in enumerate(data_list):
        if data["id"] in start_ids:
            id_map[data["id"]] = (file_no,line_no)
            print(f"Found {data['id']} in {file_no} at line {line_no}")

 14%|█▍        | 142/1000 [19:47<2:23:04, 10.01s/it]

Found enwiki-00859626-0002-0001-0006 in 142 at line 7270


 26%|██▌       | 262/1000 [37:53<1:58:28,  9.63s/it]

Found enwiki-01587902-0000-0000-0002 in 262 at line 84115


 38%|███▊      | 382/1000 [56:19<1:21:02,  7.87s/it]

Found enwiki-02315769-0000-0000-0001 in 382 at line 93226


 50%|████▉     | 497/1000 [1:14:47<1:10:37,  8.42s/it]

Found enwiki-03013027-0007-0000-0006 in 497 at line 203112


 63%|██████▎   | 631/1000 [1:34:06<53:05,  8.63s/it]  

Found enwiki-03823541-0002-0000-0002 in 631 at line 168423


 76%|███████▌  | 757/1000 [1:52:44<39:11,  9.68s/it]

Found enwiki-04586442-0027-0000-0002 in 757 at line 247550


 88%|████████▊ | 879/1000 [2:10:49<17:39,  8.76s/it]

Found enwiki-05324078-0001-0000-0000 in 879 at line 193589


100%|█████████▉| 999/1000 [2:29:33<00:08,  8.98s/it]


FileNotFoundError: File not found: gs://proposition-vectors/datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-1000_of_1000.jsonl

In [6]:
id_map

{'enwiki-00859626-0002-0001-0006': (142, 7270),
 'enwiki-01587902-0000-0000-0002': (262, 84115),
 'enwiki-02315769-0000-0000-0001': (382, 93226),
 'enwiki-03013027-0007-0000-0006': (497, 203112),
 'enwiki-03823541-0002-0000-0002': (631, 168423),
 'enwiki-04586442-0027-0000-0002': (757, 247550),
 'enwiki-05324078-0001-0000-0000': (879, 193589)}

In [11]:
ids = [
    [0,0],
    [142, 7270],
    [262, 84115],
    [382, 93226],
    [497, 203112],
    [631, 168423],
    [757, 247550],
    [879, 193589],
    [999,-1]
]

In [7]:
starts = [('enwiki-00000000-0000-0000-0000', 'enwiki-00005131-0001-0000-0000'),
 ('enwiki-00859626-0002-0001-0006', 'enwiki-00864894-0016-0000-0002'),
 ('enwiki-01587902-0000-0000-0002', 'enwiki-01592440-0010-0000-0000'),
 ('enwiki-02315769-0000-0000-0001', 'enwiki-02321829-0003-0000-0002'),
 ('enwiki-03013027-0007-0000-0006', 'enwiki-03017772-0006-0000-0004'),
 ('enwiki-03823541-0002-0000-0002', 'enwiki-03827874-0003-0000-0010'),
 ('enwiki-04586442-0027-0000-0002', 'enwiki-04590551-0015-0000-0004'),
 ('enwiki-05324078-0001-0000-0000', 'enwiki-05329181-0044-0002-0003')]

In [8]:
ends = [ ('enwiki-00855357-0012-0000-0004', 'enwiki-00859626-0002-0001-0005'),
 ('enwiki-01583453-0039-0000-0004', 'enwiki-01587902-0000-0000-0001'),
 ('enwiki-02308005-0007-0001-0004', 'enwiki-02315769-0000-0000-0000'),
 ('enwiki-03008104-0029-0001-0002', 'enwiki-03013027-0007-0000-0005'),
 ('enwiki-03818516-0004-0000-0001', 'enwiki-03823541-0002-0000-0001'),
 ('enwiki-04582327-0016-0000-0002', 'enwiki-04586442-0027-0000-0001'),
 ('enwiki-05319613-0001-0000-0007', 'enwiki-05324078-0000-0000-0003'),
 ('enwiki-06048186-0015-0000-0006', 'enwiki-06052180-0000-0000-0002')]

In [12]:
info = {}
for i in range(8):
    ids_2 = [ids[i+1][0],ids[i+1][1]-1]
    raw_dict = {
        "embedding_files":[starts[i],ends[i]],
        "text_files":[ids[i],ids_2]
    }
    info[str(i+1)] = raw_dict

In [13]:
info

{'1': {'embedding_files': [('enwiki-00000000-0000-0000-0000',
    'enwiki-00005131-0001-0000-0000'),
   ('enwiki-00855357-0012-0000-0004', 'enwiki-00859626-0002-0001-0005')],
  'text_files': [[0, 0], [142, 7269]]},
 '2': {'embedding_files': [('enwiki-00859626-0002-0001-0006',
    'enwiki-00864894-0016-0000-0002'),
   ('enwiki-01583453-0039-0000-0004', 'enwiki-01587902-0000-0000-0001')],
  'text_files': [[142, 7270], [262, 84114]]},
 '3': {'embedding_files': [('enwiki-01587902-0000-0000-0002',
    'enwiki-01592440-0010-0000-0000'),
   ('enwiki-02308005-0007-0001-0004', 'enwiki-02315769-0000-0000-0000')],
  'text_files': [[262, 84115], [382, 93225]]},
 '4': {'embedding_files': [('enwiki-02315769-0000-0000-0001',
    'enwiki-02321829-0003-0000-0002'),
   ('enwiki-03008104-0029-0001-0002', 'enwiki-03013027-0007-0000-0005')],
  'text_files': [[382, 93226], [497, 203111]]},
 '5': {'embedding_files': [('enwiki-03013027-0007-0000-0006',
    'enwiki-03017772-0006-0000-0004'),
   ('enwiki-038185

In [14]:
k = (0,0)

k[1]

0

In [16]:
from upload_to_chroma import load_file_from_gcs

In [None]:
file_no = 999

file =f'datasets/datasets--chentong00--factoid-wiki/snapshots/60bce4923950eab87192e276c9c5e5136234a760/data/docs-{file_no:04d}_of_1000.jsonl'
k = load_file_from_gcs("proposition-vectors",file)

NameError: name 'load_file_from_gcs' is not defined