In [1]:
# Mount Google Drive at /content/drive; every project path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install -r '/content/drive/MyDrive/LLMs_Project/requirements.txt'

Collecting pymupdf (from -r /content/drive/MyDrive/LLMs_Project/requirements.txt (line 1))
  Downloading pymupdf-1.27.1-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain-text-splitters (from -r /content/drive/MyDrive/LLMs_Project/requirements.txt (line 6))
  Downloading langchain_text_splitters-1.1.1-py3-none-any.whl.metadata (3.3 kB)
Downloading pymupdf-1.27.1-cp310-abi3-manylinux_2_28_x86_64.whl (24.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading langchain_text_splitters-1.1.1-py3-none-any.whl (35 kB)
Installing collected packages: pymupdf, langchain-text-splitters
Successfully installed langchain-text-splitters-1.1.1 pymupdf-1.27.1


In [None]:
import os

# Project folders on the mounted Google Drive.
docs_path = '/content/drive/MyDrive/LLMs_Project/docs/'
texts_path = '/content/drive/MyDrive/LLMs_Project/texts_extracted/'
sentences_path = '/content/drive/MyDrive/LLMs_Project/sentences_extracted/'
chunks_path = '/content/drive/MyDrive/LLMs_Project/chunks_extracted/'
requirements_path = '/content/drive/MyDrive/LLMs_Project/requirements.txt'

print("Docs to be processed")
# Count only PDFs: the extraction step globs "*.pdf", so other entries in the
# folder (sub-folders, checkpoints, stray files) must not inflate the total.
# Sorting keeps the listing stable between runs.
file_list = sorted(name for name in os.listdir(docs_path) if name.endswith(".pdf"))
document_count = len(file_list)
print(file_list)
print(f"Total documents found: {document_count}")

Docs to be processed
Total documents found: 4
['Laws of the Game 2025_26_single pages.pdf', 'FIFA Disciplinary Code_September 2025 edition_EN.pdf', 'FIFA Equipment Regulations_2025_EN.pdf', 'FWC26_Competition Regulations_EN.pdf']


In [36]:
import pymupdf
import re
import spacy
from glob import glob
from langchain_text_splitters import SpacyTextSplitter

def clean_text(text: str) -> str:
    """Strip control characters from ``text`` and collapse whitespace.

    Whitespace control characters (tab, newline, carriage return, vertical
    tab, form feed) are treated as word separators and become a single space.
    All other ASCII control characters are deleted.

    Args:
        text: Raw text, e.g. one line extracted from a PDF page.

    Returns:
        The cleaned text: runs of whitespace collapsed to one space and no
        leading or trailing whitespace.
    """
    # Delete only the non-whitespace control characters. Deleting \t, \n, \r
    # etc. here would glue neighbouring words together ("a\tb" -> "ab")
    # before the whitespace pass below could turn them into a space.
    text = re.sub(r'[\x00-\x08\x0E-\x1F\x7F]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_text(file_path: str, destination_folder: str) -> str:
    """Extract the text of a PDF and save it as a ``.txt`` file.

    Every page is read with PyMuPDF, split into lines, and each line is
    passed through ``clean_text``. The lines are written, newline-separated,
    to ``<destination_folder>/<PDF name without extension>.txt``.

    Args:
        file_path: Path of the PDF to read.
        destination_folder: Folder that receives the ``.txt`` file. It is
            created when it does not exist yet.

    Returns:
        The path of the text file that was written.
    """
    print(f"Processing file: {file_path}")

    # splitext/basename keep dots that are part of the name
    # ("rules.v2.pdf" -> "rules.v2"); splitting on the first "." would
    # truncate it to "rules".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)
    output_path = os.path.join(destination_folder, stem + ".txt")

    text_lines = []
    # The context manager closes the document even if reading a page fails.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            # Clean only the lines of the current page; re-cleaning the whole
            # accumulated list on every page is quadratic in the page count.
            text_lines.extend(
                clean_text(line) for line in page.get_text().splitlines()
            )

    with open(output_path, "w", encoding="utf-8") as file:
        file.write("\n".join(text_lines))
    print(f"Extracted text saved in: {output_path}")

    return output_path


def chunk_text(text: str, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks along spaCy sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        The chunks produced by ``SpacyTextSplitter.split_text``.
    """
    splitter_options = {
        "pipeline": "en_core_web_sm",   # spaCy pipeline used for sentence splitting
        "chunk_size": chunk_size,       # sentences are grouped into chunks of this size
        "chunk_overlap": chunk_overlap, # overlap added between chunks
    }
    pieces = SpacyTextSplitter(**splitter_options).split_text(text)
    print(f"Number of chunks in: {len(pieces)}")
    return pieces

In [29]:
# Extract the text of every PDF directly inside docs_path into texts_path.
# The pattern contains no "**", so no recursive flag is needed.
for file_path in glob(f"{docs_path}*.pdf"):
    extract_text(file_path, destination_folder=texts_path)


Processing file: /content/drive/MyDrive/LLMs_Project/docs/Laws of the Game 2025_26_single pages.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/Laws of the Game 2025_26_single pages.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FIFA Disciplinary Code_September 2025 edition_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FIFA Disciplinary Code_September 2025 edition_EN.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FIFA Equipment Regulations_2025_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FIFA Equipment Regulations_2025_EN.txt
Processing file: /content/drive/MyDrive/LLMs_Project/docs/FWC26_Competition Regulations_EN.pdf
Extracted text saved in: /content/drive/MyDrive/LLMs_Project/texts_extracted/FWC26_Competition Regulations_EN.txt


In [40]:

# Chunk every extracted text file and tag each chunk with its source document.
chunks = []
# sorted(): glob returns files in arbitrary order; a fixed order keeps the
# position of every chunk reproducible between runs.
for file_path in sorted(glob(texts_path + "*.txt")):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    content = chunk_text(text)
    # Keep the full file stem even when the name itself contains dots.
    source = os.path.splitext(os.path.basename(file_path))[0]
    for chunk in content:
        # A fresh dict per chunk, so editing one chunk's metadata later does
        # not silently change every other chunk of the same document.
        chunks.append({"chunk": chunk, "metadata": {"source": source}})



Number of chunks in: 234




Number of chunks in: 284




Number of chunks in: 334




Number of chunks in: 466


In [43]:
# Report how many chunks were collected across all documents.
print("Total number of chunks created:", len(chunks))

Total number of chunks created: 1318


In [1]:
# Requires transformers>=4.51.0

import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from typing import List

In [6]:
model_name = "Qwen/Qwen3-Embedding-0.6B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer once (the earlier default-padding load was immediately
# overwritten). With left padding every sequence ends in the last column of
# the batch, which is the fast path of last_token_pool.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left"
)


model = AutoModel.from_pretrained(model_name).to(device)
# The model is only used for inference, so put it in evaluation mode.
model.eval()

Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

Qwen3Model(
  (embed_tokens): Embedding(151669, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): Qwen3RM

In [3]:
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pick the hidden state of the last attended token of every sequence.

    Args:
        last_hidden_states: Hidden states indexed as [row, position, ...].
        attention_mask: Mask indexed as [row, position]; attended positions
            contribute to the per-row sum used to locate the last token.

    Returns:
        One hidden-state vector per row.
    """
    row_count = attention_mask.shape[0]

    # If the final column is attended in every row, the batch is left-padded
    # and the last position already holds each sequence's final token.
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]

    # Otherwise index each row at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(
        last_hidden_states.shape[0], device=last_hidden_states.device
    )
    return last_hidden_states[row_index, last_positions]

def get_embeddings(texts: List[str],
                   task_description: str = None,
                   max_length: int = 8192,
                   batch_size: int = 32) -> List[List[float]]:
    """Embed ``texts`` with the module-level ``model`` and ``tokenizer``.

    The texts are tokenized and run through the model in batches, pooled with
    ``last_token_pool`` and L2-normalized.

    Args:
        texts: Strings to embed.
        task_description: Optional instruction. When given, every text is
            wrapped as ``"Instruct: <task_description>\\nQuery:<text>"``
            before tokenization (the query format described on the
            Qwen3-Embedding model card). Leave as ``None`` for documents.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated by the tokenizer.
        batch_size: Number of texts sent through the model per forward pass.
            Bounds peak memory; it does not change which texts are embedded.

    Returns:
        One embedding (a list of floats) per input text, in input order.
        An empty list when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not texts:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if task_description:
        texts = [f"Instruct: {task_description}\nQuery:{text}" for text in texts]

    all_embeddings: List[List[float]] = []
    # Feeding every text through the model in a single forward pass makes
    # memory grow with the input size; fixed-size batches keep it bounded.
    for start in range(0, len(texts), batch_size):
        batch_dict = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

        with torch.no_grad():
            outputs = model(**batch_dict)

        embeddings = last_token_pool(
            outputs.last_hidden_state,
            batch_dict["attention_mask"]
        )

        # Normalize (for cosine similarity search)
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move each batch off the device right away so results accumulate
        # in host memory only.
        all_embeddings.extend(embeddings.cpu().tolist())

    return all_embeddings

In [48]:
# Embed every chunk and store the vector next to its text.
EMBED_BATCH_SIZE = 16  # texts per get_embeddings call; lower it if memory runs out

texts = [c["chunk"] for c in chunks]

# Call get_embeddings on small slices instead of once with every chunk, so a
# single call never has to hold the activations for the whole corpus.
embeddings = []
for start in range(0, len(texts), EMBED_BATCH_SIZE):
    embeddings.extend(get_embeddings(texts[start:start + EMBED_BATCH_SIZE]))

# Guard against silently mis-pairing chunks and vectors.
assert len(embeddings) == len(chunks), "expected exactly one embedding per chunk"

for chunk, emb in zip(chunks, embeddings):
    chunk["embedding"] = emb

: 

: 

: 

# Milvus
https://milvus.io/docs/full_text_search_with_milvus.md

In [None]:
from pymilvus import (
    MilvusClient,
    DataType,
    Function,
    FunctionType,
    AnnSearchRequest,
    RRFRanker,
)

In [None]:
# Connection settings for Milvus and the name of the collection used below.
# NOTE(review): "localhost" is the machine running this kernel — confirm a
# Milvus server is actually listening there on port 19530.
uri = "http://localhost:19530"
collection_name = "full_text_demo"
client = MilvusClient(uri=uri)

In [None]:
# Analyzer configuration passed to the "content" field of the schema:
# the "standard" tokenizer followed by a "lowercase" filter.
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}


In [None]:
# Collection schema: auto-generated string id, the chunk text (analyzed so it
# can feed BM25), a sparse vector, a dense vector and free-form JSON metadata.
schema = MilvusClient.create_schema()
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    auto_id=True,
    max_length=100,
)
schema.add_field(
    field_name="content",
    datatype=DataType.VARCHAR,
    max_length=65535,
    analyzer_params=analyzer_params,
    enable_match=True,  # Enable text matching
    enable_analyzer=True,  # Enable text analysis
)
schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(
    field_name="dense_vector",
    datatype=DataType.FLOAT_VECTOR,
    # Must equal the length of the vectors returned by get_embeddings:
    # Qwen/Qwen3-Embedding-0.6B has a hidden size of 1024 (not the 1536 of
    # text-embedding-3-small), otherwise inserts are rejected.
    dim=1024,
)
schema.add_field(field_name="metadata", datatype=DataType.JSON)

# BM25 function: reads "content" and writes its output into "sparse_vector".
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["content"],
    output_field_names="sparse_vector",
)

schema.add_function(bm25_function)


In [None]:
# One index per vector field: a BM25-scored sparse inverted index for
# "sparse_vector" and a FLAT index for "dense_vector".
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="sparse_vector",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
)
# IP = inner product. get_embeddings L2-normalizes its vectors, so the inner
# product of two of them equals their cosine similarity.
index_params.add_index(field_name="dense_vector", index_type="FLAT", metric_type="IP")

# Start from a clean slate: an existing collection with this name is dropped,
# together with everything stored in it, before it is recreated.
if client.has_collection(collection_name):
    client.drop_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)
print(f"Collection '{collection_name}' created successfully")


In [None]:
# Build one row per chunk. `documents` and `entities` are never defined in
# this notebook; the data lives in `chunks` (text under the "chunk" key) and
# `embeddings`. Only "content", "dense_vector" and "metadata" are supplied:
# "id" is auto-generated and "sparse_vector" is the output field of the BM25
# function declared on the schema.
assert len(chunks) == len(embeddings), "expected exactly one embedding per chunk"

entities = [
    {
        "content": chunk["chunk"],
        "dense_vector": embedding,
        "metadata": chunk.get("metadata", {}),
    }
    for chunk, embedding in zip(chunks, embeddings)
]

# Insert data
client.insert(collection_name, entities)
print(f"Inserted {len(entities)} documents")
