In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pip install -r '/content/drive/MyDrive/LLMs_Project/requirements.txt'



In [4]:
import os
docs_path = '/content/drive/MyDrive/LLMs_Project/docs/'
#docs_path = '/docs/'
texts_path = '/content/drive/MyDrive/LLMs_Project/texts_extracted/'
#texts_path = '/texts_extracted/'
requirements_path = '/content/drive/MyDrive/LLMs_Project/requirements.txt'
print("Docs to be processed")
file_list = os.listdir(docs_path)
document_count = len(file_list)
print(file_list)
print(f"Total documents found: {document_count}")

Docs to be processed
['Laws of the Game 2025_26_single pages.pdf', 'FIFA Disciplinary Code_September 2025 edition_EN.pdf', 'FIFA Equipment Regulations_2025_EN.pdf', 'FWC26_Competition Regulations_EN.pdf']
Total documents found: 4


In [5]:
import pymupdf
import re
import spacy
from glob import glob
from langchain_text_splitters import SpacyTextSplitter

def clean_text(text: str) -> str:
    # Remove non-printable control characters
    text = re.sub(r'[\x00-\x1F\x7F]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def extract_text(file_path: str, destination_folder: str) -> str:
    text_lines = []
    print(f"Processing file: {file_path}")
    doc = pymupdf.open(file_path)
    for page in doc:
      text = page.get_text()
      text_lines.extend(text.splitlines())
      text_lines = [clean_text(line) for line in text_lines]
    with open(destination_folder + file_path.split("/")[-1].split(".")[0] + ".txt", "w") as file:
        file.write("\n".join(text_lines))
        print(f"Extracted text saved in: {destination_folder + file_path.split("/")[-1].split(".")[0] + '.txt'}")


def chunk_text(text: str, chunk_size=500, chunk_overlap=100):
    text_splitter = SpacyTextSplitter(
        pipeline="en_core_web_sm",  # Uses spaCy for sentence splitting
        chunk_size=chunk_size,      # Then groups sentences into chunks of this size
        chunk_overlap=chunk_overlap # Adds overlap between chunks
    )
    chunks = text_splitter.split_text(text)
    print(f"Number of chunks in: {len(chunks)}")
    return chunks

In [6]:
for file_path in glob(docs_path + "*.pdf", recursive=True):
    extract_text(file_path, texts_path)

Processing file: /content/drive/MyDrive/LLMs_Project/docs/Laws of the Game 2025_26_single pages.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/Laws of the Game 2025_26_single pages.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FIFA Disciplinary Code_September 2025 edition_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FIFA Disciplinary Code_September 2025 edition_EN.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FIFA Equipment Regulations_2025_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FIFA Equipment Regulations_2025_EN.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FWC26_Competition Regulations_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FWC26_Competition Regulations_EN.txt


In [7]:
chunks = []
for file_path in glob(texts_path + "*.txt", recursive=True):
    with open(file_path, "r") as file:
        text = file.read()
    content = chunk_text(text)
    metadata = {"source": file_path.split("/")[-1].split(".")[0]}
    for chunk in content:
        chunks.append({"chunk": chunk, "metadata": metadata})
print(f"Total number of chunks created: {len(chunks)}")



Number of chunks in: 334




Number of chunks in: 466




Number of chunks in: 234




Number of chunks in: 284
Total number of chunks created: 1318


In [8]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

In [None]:
# 0 - 334 FIFA Equipment Regulations_2025_EN
# 334 - 800 Laws of the Game 25/26
# 800 - 1034 FIFA Disciplinary Code
# 1034 - 1318 FWC26_Competition Regulations_EN

In [None]:
texts = [c["chunk"] for c in chunks]

284


In [None]:
embeddings = []
for i, text in enumerate(texts):
    print(f"Processing chunk: {i+1}/{len(texts)}")
    embedding = model.encode(text, show_progress_bar=True)
    embeddings.append(embedding)

Processing chunk: 1/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 2/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 3/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 4/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 5/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 6/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 7/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 8/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 9/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 10/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 11/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 12/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 13/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 14/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 15/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 16/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 17/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 18/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 19/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 20/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 21/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 22/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 23/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 24/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 25/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 26/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 27/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 28/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 29/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 30/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 31/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 32/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 33/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 34/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 35/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 36/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 37/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 38/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 39/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 40/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 41/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 42/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 43/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 44/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 45/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 46/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 47/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 48/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 49/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 50/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 51/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 52/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 53/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 54/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 55/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 56/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 57/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 58/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 59/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 60/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 61/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 62/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 63/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 64/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 65/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 66/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 67/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 68/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 69/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 70/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 71/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 72/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 73/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 74/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 75/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 76/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 77/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 78/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 79/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 80/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 81/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 82/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 83/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 84/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 85/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 86/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 87/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 88/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 89/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 90/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 91/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 92/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 93/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 94/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 95/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 96/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 97/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 98/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 99/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 100/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 101/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 102/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 103/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 104/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 105/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 106/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 107/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 108/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 109/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 110/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 111/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 112/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 113/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 114/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 115/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 116/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 117/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 118/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 119/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 120/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 121/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 122/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 123/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 124/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 125/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 126/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 127/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 128/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 129/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 130/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 131/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 132/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 133/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 134/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 135/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 136/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 137/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 138/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 139/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 140/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 141/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 142/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 143/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 144/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 145/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 146/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 147/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 148/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 149/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 150/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 151/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 152/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 153/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 154/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 155/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 156/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 157/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 158/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 159/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 160/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 161/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 162/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 163/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 164/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 165/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 166/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 167/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 168/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 169/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 170/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 171/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 172/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 173/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 174/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 175/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 176/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 177/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 178/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 179/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 180/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 181/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 182/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 183/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 184/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 185/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 186/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 187/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 188/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 189/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 190/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 191/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 192/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 193/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 194/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 195/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 196/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 197/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 198/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 199/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 200/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 201/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 202/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 203/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 204/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 205/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 206/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 207/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 208/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 209/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 210/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 211/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 212/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 213/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 214/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 215/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 216/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 217/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 218/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 219/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 220/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 221/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 222/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 223/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 224/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 225/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 226/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 227/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 228/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 229/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 230/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 231/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 232/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 233/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 234/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 235/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 236/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 237/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 238/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 239/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 240/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 241/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 242/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 243/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 244/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 245/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 246/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 247/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 248/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 249/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 250/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 251/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 252/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 253/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 254/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 255/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 256/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 257/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 258/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 259/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 260/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 261/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 262/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 263/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 264/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 265/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 266/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 267/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 268/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 269/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 270/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 271/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 272/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 273/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 274/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 275/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 276/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 277/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 278/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 279/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 280/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 281/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 282/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 283/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing chunk: 284/284


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
print(len(embeddings))
print(len(embeddings[0]))

284
1024


In [None]:
import json
with open('/content/drive/MyDrive/LLMs_Project/chunks.json', 'w') as fout:
    json.dump(chunks, fout)

In [51]:
import numpy as np
np.save("/content/drive/MyDrive/LLMs_Project/embedding.npy", embeddings)