In [1]:
from google.cloud import storage

In [22]:
# Google Cloud project, bucket, and object-name prefix of the input files.
project_name = "jakub-le-wagon-bootcamp"
bucket_name = "sci_papers_bootcamp"
prefix = "cord19/"


In [3]:
# Create the Cloud Storage client for the configured project and take a
# handle to the bucket that later cells list and download from.
storage_client = storage.Client(project=project_name)
bucket = storage_client.bucket(bucket_name)

In [23]:
# Collect the prefix-relative names of every object already stored under `prefix`.
print("Listing existing files in the bucket...")

blobs = bucket.list_blobs(prefix=prefix)
existing_files = {
    blob.name[len(prefix):]  # strip the prefix, keep the relative path
    for blob in blobs
    if blob.name.startswith(prefix)
}

print(f"Found {len(existing_files)} files already in the bucket")

Listing existing files in the bucket...
Found 92783 files already in the bucket


# Processing input JSON files

In [29]:
import json
from tqdm import tqdm

In [33]:
def process_article(file):
    """
    Flatten one parsed article JSON into a single text document.

    The content is the title, followed by the abstract and then the body text.
    Paragraphs are grouped by section name (in order of first appearance);
    each section is written as its heading, a blank line, its paragraphs
    (one per line), and a blank line.

    Args:
        file: Dictionary parsed from an article JSON file. 'paper_id' is
            required. 'metadata'/'title', 'abstract' and 'body_text' are
            optional and treated as empty when missing or None.

    Returns:
        Dictionary with 'paper_id' and 'content' keys.

    Raises:
        KeyError: If 'paper_id' is missing.
    """
    paper_id = file['paper_id']

    def _group_by_section(paragraphs):
        # Collect paragraph texts under their section name, preserving the
        # order in which sections first appear.
        grouped = {}
        for paragraph in paragraphs or []:
            section = paragraph.get('section') or ""
            grouped.setdefault(section, []).append(paragraph.get('text') or "")
        return grouped

    title = (file.get('metadata') or {}).get('title') or ""
    parts = [title]

    # Abstract first, then body. Either field may be absent, and an abstract
    # may span several paragraphs, so both go through the same grouping.
    for field in ('abstract', 'body_text'):
        for section, paragraphs in _group_by_section(file.get(field)).items():
            parts.append(section)
            # Keep a line break between paragraphs so that words at paragraph
            # boundaries are not glued together.
            parts.append("\n".join(paragraphs))

    # Every part (title, heading, section text) is followed by a blank line.
    body = "".join(part + "\n\n" for part in parts)

    return {"paper_id": paper_id, "content": body}

In [36]:
def process_batch(batch_blobs, bucket_name):
    """
    Download a batch of blobs and convert each one into a processed article.

    A blob that cannot be parsed as JSON, or whose JSON does not have the
    structure `process_article` expects, is reported and skipped so that one
    bad file does not abort the whole run.

    Args:
        batch_blobs: Iterable of blob objects exposing `download_as_text()`
            and `name`.
        bucket_name: Unused; kept so existing callers keep working.

    Returns:
        List of dictionaries returned by `process_article`, one per blob that
        was parsed and processed successfully.
    """
    processed_data = []

    for blob in tqdm(batch_blobs, desc="Processing files"):
        # Read content directly into memory
        content = blob.download_as_text()

        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"Error parsing {blob.name}: {e}")
            continue

        try:
            processed_data.append(process_article(data))
        except (KeyError, IndexError, TypeError) as e:
            # Valid JSON with an unexpected structure: report and move on.
            print(f"Error processing {blob.name}: {e!r}")

    return processed_data

# Vector DB

In [76]:
!pip install --upgrade pip
!pip install --upgrade langchain-text-splitters langchain-community langgraph langchain_huggingface
!pip install faiss-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0
    Uninstalling pip-25.0:
      Successfully uninstalled pip-25.0
Successfully installed pip-25.0.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [42]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used for both indexing and querying.
# Alternative that was tried: "sentence-transformers/all-mpnet-base-v2".
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [77]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Size the flat L2 index from a probe embedding so that it matches the
# vector length produced by the embedding model.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty in-memory store.
# persist_directory is intentionally not passed
# (see https://docs.langflow.org/components-vector-stores#inputs-8).
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [44]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunks of up to 2000 characters, with 400 characters shared between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
)

In [79]:
def vectorize_batch(batch, vectore_store, splitter):
    """
    Split a batch of articles into chunks and add them to a vector store.

    Args:
        batch: List of dictionaries, each with 'paper_id' and 'content' keys.
        vectore_store: Vector store that receives the chunks; must provide
            `add_texts(texts, metadatas=...)`. The parameter keeps its
            original spelling so existing callers are unaffected.
        splitter: Text splitter providing `split_text(text)`.

    Returns:
        None. Nothing is added when the batch yields no chunks.
    """
    texts = []
    metadatas = []

    # Process each article in the batch
    for paper in tqdm(batch, desc="Vectorizing files"):
        paper_id = paper['paper_id']
        content = paper['content']

        # Skip if missing required fields
        if not paper_id or not content:
            continue

        # Split the article into chunks and record each chunk with its metadata
        for i, chunk_text in enumerate(splitter.split_text(content)):
            texts.append(chunk_text)
            metadatas.append({
                'paper_id': paper_id,
                'chunk_id': f"{paper_id}_chunk_{i}",
                'source': paper_id
            })

    # Skip if no chunks were created
    if not texts:
        return

    print(f"Adding {len(texts)} chunks into vector DB")
    # Write to the store that was passed in, not to a notebook-level global.
    vectore_store.add_texts(texts, metadatas=metadatas)
    
    

# Runtime

In [None]:
%%time

# Materialise the blob listing up front so the total number of batches is known.
# NOTE(review): the saved output reports exactly 100000 files, the same value
# as max_results, so the listing may have been truncated — confirm.
all_blobs = list(bucket.list_blobs(prefix=prefix, max_results=100000))
total_files = len(all_blobs)
print(f"Found {total_files} files in the bucket")

batch_size = 500
total_batches = (total_files - 1) // batch_size + 1

# Download, parse and index the files one batch at a time.
for batch_number, batch_start in enumerate(range(0, total_files, batch_size), start=1):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    current_batch = all_blobs[batch_start:batch_start + batch_size]

    print(f"Processing batch {batch_number}/{total_batches} ({len(current_batch)} files)")

    processed_batch = process_batch(current_batch, bucket_name)
    vectorize_batch(processed_batch, vector_store, splitter)

    print(f"Successfully processed {len(processed_batch)} files in the current batch")

    # Drop this batch's parsed articles before loading the next batch.
    del processed_batch

# Persist the vector store under the local "faiss" folder.
vector_store.save_local("faiss")

Found 100000 files in the bucket
Processing batch 1/200 (500 files)


Processing files:  76%|███████▋  | 382/500 [00:13<00:05, 23.18it/s]

In [57]:
#!pwd
#/home/jupyter

In [61]:
#import os
#os.mkdir("faiss")

In [66]:
# Reload the persisted store from the "faiss" folder with the same embedding model.
# allow_dangerous_deserialization=True permits unpickling the saved docstore;
# only use it on index files you created yourself.
vew_vs = FAISS.load_local(
    "faiss",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [68]:
# Example natural-language question used to probe the reloaded index.
query = "What is the efficacy of various vaccines?"

In [71]:
# Retrieve the five chunks most similar to the query.
vew_vs.similarity_search(query, k=5)

[Document(id='7e58775b-4c3c-4eb8-9f8f-950d6b7f8f23', metadata={'article_id': '00046b27022615aaec3782ea69c56da3d2fd2ffa', 'chunk_id': '00046b27022615aaec3782ea69c56da3d2fd2ffa_chunk_10', 'source': '00046b27022615aaec3782ea69c56da3d2fd2ffa'}, page_content='weakened immune systems. These factors also cause different diseases like obesity, metabolic syndrome, type II diabetes, and immunemediated cancers. The reasons for developing these diseases include reduction of immune cell levels and their function, a weak affinity for antigen recognition, the increased time required for humoral immune responses, and defect of memory cells [42] . Moreover, the administration of immunosuppressive medicines [46] and habitation in lowincome countries with a low socioeconomic position lead to more mortality compared with high-income countries [47] . Other important factors that impact the ineffectiveness of the COVID-19 vaccine are the high rate of obesity because of increased secretion of IL-6 and decrea