In [None]:
# Mount Google Drive at /content/drive so the project files stored there
# (e.g. the requirements file installed in the next cell) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install -r '/content/drive/MyDrive/LLMs_Project/requirements.txt'

In [None]:
import os
# Folder holding the source documents. The commented-out value is the Google
# Drive location; the active value is a directory at the filesystem root.
#docs_path = '/content/drive/MyDrive/LLMs_Project/docs/'
docs_path = '/docs/'
# Folder that receives the extracted .txt files (same Drive/root alternatives).
#texts_path = '/content/drive/MyDrive/LLMs_Project/texts_extracted/'
texts_path = '/texts_extracted/'
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
requirements_path = '/content/drive/MyDrive/LLMs_Project/requirements.txt'
print("Docs to be processed")
# Lists every entry in docs_path without filtering by extension, so the count
# below can exceed the number of *.pdf files the extraction cell processes.
file_list = os.listdir(docs_path)
document_count = len(file_list)
print(file_list)
print(f"Total documents found: {document_count}")

In [None]:
import pymupdf
import re
import spacy
from glob import glob
from langchain_text_splitters import SpacyTextSplitter

def clean_text(text: str) -> str:
    """Return ``text`` with control characters stripped and whitespace collapsed.

    Whitespace control characters (tab, newline, vertical tab, form feed,
    carriage return) are treated as separators and end up as a single space.
    Every other ASCII control character, including DEL, is deleted.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, with no leading or trailing whitespace and each run
        of whitespace replaced by one space.
    """
    # Delete only the non-whitespace control characters. \x09-\x0D (\t \n \v
    # \f \r) are deliberately excluded: deleting them would glue together the
    # words they separate ("a\tb" -> "ab").
    text = re.sub(r'[\x00-\x08\x0E-\x1F\x7F]', '', text)

    # Collapse each whitespace run (tabs and newlines included) to one space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_text(file_path: str, destination_folder: str) -> str:
    """Extract a document's text, clean it line by line and save it as ``.txt``.

    The output file is created inside ``destination_folder`` and is named
    after the part of the input file name that precedes its first dot.

    Args:
        file_path: Path of the document to read, using ``/`` separators.
        destination_folder: Folder path ending with a separator; the output
            file name is appended to it directly. The folder must exist.

    Returns:
        The cleaned text that was written, with lines joined by newlines.
    """
    print(f"Processing file: {file_path}")
    # Build the output path once; it is used for both the write and the log.
    output_path = destination_folder + file_path.split("/")[-1].split(".")[0] + ".txt"

    text_lines = []
    # The context manager closes the document even if reading a page fails.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            # Clean only this page's lines, rather than re-cleaning every
            # line accumulated so far on each iteration.
            text_lines.extend(
                clean_text(line) for line in page.get_text().splitlines()
            )

    extracted = "\n".join(text_lines)
    with open(output_path, "w") as file:
        file.write(extracted)
    # Logging the precomputed path avoids nesting double quotes inside the
    # f-string, which is a SyntaxError on Python versions before 3.12.
    print(f"Extracted text saved in: {output_path}")
    return extracted


def chunk_text(text: str, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks with a spaCy-based splitter.

    Args:
        text: The text to split.
        chunk_size: Chunk size handed to ``SpacyTextSplitter``.
        chunk_overlap: Overlap between chunks handed to ``SpacyTextSplitter``.

    Returns:
        The chunks produced by ``SpacyTextSplitter.split_text``.
    """
    splitter_options = {
        "pipeline": "en_core_web_sm",    # spaCy pipeline used to find sentences
        "chunk_size": chunk_size,        # sentences are grouped up to this size
        "chunk_overlap": chunk_overlap,  # overlap kept between neighbouring chunks
    }
    chunks = SpacyTextSplitter(**splitter_options).split_text(text)
    print(f"Number of chunks in: {len(chunks)}")
    return chunks

In [None]:
# extract_text opens its output file directly, so the destination folder has
# to exist before the first document is processed.
os.makedirs(texts_path, exist_ok=True)
# Sorted for a reproducible processing order (glob order depends on the
# filesystem). recursive=True was dropped: without "**" in the pattern it
# has no effect.
for file_path in sorted(glob(docs_path + "*.pdf")):
    extract_text(file_path, texts_path)

In [None]:
chunks = []
# Sorted so the chunk order (and the order of anything derived from it by
# position) is reproducible; glob order depends on the filesystem.
# recursive=True was dropped: without "**" in the pattern it has no effect.
for file_path in sorted(glob(texts_path + "*.txt")):
    with open(file_path, "r") as file:
        text = file.read()
    content = chunk_text(text)
    # Source label: the file name up to its first dot.
    source = file_path.split("/")[-1].split(".")[0]
    for chunk in content:
        # A fresh metadata dict per chunk: sharing one dict across a file's
        # chunks would make an edit to one chunk's metadata show up in all.
        chunks.append({"chunk": chunk, "metadata": {"source": source}})
print(f"Total number of chunks created: {len(chunks)}")

In [None]:
# Load the sentence-transformers model whose encode() method produces the
# chunk embeddings further down.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

In [None]:
# Plain list of the chunk strings, in the same order as `chunks`.
texts = [c["chunk"] for c in chunks]
# NOTE(review): 333 is hard-coded — presumably the number of chunks of the
# first document; doc_1 is not used by any other visible cell. TODO confirm.
doc_1 = texts[0:333]

In [None]:
# Encode every chunk text. Later cells pair embeddings[i] with chunks[i] by
# position, so this must be re-run whenever `chunks` is rebuilt.
embeddings = model.encode(texts, show_progress_bar=True)

In [None]:
# Sanity check: how many embeddings were produced, then the length of the
# first one (the schema's dense_vector field declares dim=1024).
print(len(embeddings))
print(len(embeddings[0]))

In [None]:
# Guard against stale notebook state: if `chunks` was rebuilt without
# re-running the encoding cell, pairing by position would attach vectors to
# the wrong chunks or only to some of them.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(chunks)} chunks; re-run the encoding cell."
    )
# Store each vector next to the chunk it was computed from.
for chunk_record, emb in zip(chunks, embeddings):
    chunk_record["embedding"] = emb

In [None]:
import json


def _to_jsonable(value):
    """``json.dump`` fallback: turn array-like values into plain lists."""
    if hasattr(value, "tolist"):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# The "embedding" entries are the arrays returned by model.encode, which the
# json module cannot serialise on its own (TypeError); the default hook
# converts them to lists of floats on the way out.
with open("/content/drive/MyDrive/LLMs_Project/embeddings.json", "w") as f:
    json.dump(chunks, f, default=_to_jsonable)

# Milvus
Reference: Milvus full-text search tutorial — https://milvus.io/docs/full_text_search_with_milvus.md

In [None]:
from pymilvus import (
    MilvusClient,
    DataType,
    Function,
    FunctionType,
    AnnSearchRequest,
    RRFRanker,
)

In [None]:
# Connection settings: Milvus server address and the collection this notebook
# creates, fills and queries.
# NOTE(review): localhost assumes a Milvus server reachable from the machine
# running the kernel — confirm this holds when running on Colab.
uri = "http://localhost:19530"
collection_name = "football_docs"
client = MilvusClient(uri=uri)

In [None]:
# Analyzer settings passed to the "content" field of the schema: the
# "standard" tokenizer followed by a "lowercase" filter.
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}


In [None]:
# Collection layout: a VARCHAR primary key, the chunk text, one sparse and one
# dense vector field, and a JSON metadata field.
schema = MilvusClient.create_schema()

field_specs = [
    dict(
        field_name="id",
        datatype=DataType.VARCHAR,
        is_primary=True,
        auto_id=True,
        max_length=100,
    ),
    dict(
        field_name="content",
        datatype=DataType.VARCHAR,
        max_length=65535,
        analyzer_params=analyzer_params,
        enable_match=True,  # Enable text matching
        enable_analyzer=True,  # Enable text analysis
    ),
    dict(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR),
    dict(
        field_name="dense_vector",
        datatype=DataType.FLOAT_VECTOR,
        dim=1024,  # Dimension for Qwen3-Embedding-0.6B
    ),
    dict(field_name="metadata", datatype=DataType.JSON),
]
# Fields are registered in the order listed above.
for field_spec in field_specs:
    schema.add_field(**field_spec)

# BM25 function: takes "content" as input and writes to "sparse_vector".
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["content"],
    output_field_names="sparse_vector",
)
schema.add_function(bm25_function)


In [None]:
# Index definitions: one index per vector field of the schema.
index_params = MilvusClient.prepare_index_params()
# Sparse field: inverted index using the BM25 metric.
index_params.add_index(
    field_name="sparse_vector",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
)
# Dense field: FLAT index using the IP metric.
index_params.add_index(field_name="dense_vector", index_type="FLAT", metric_type="IP")

# WARNING: an existing collection with this name is dropped before the new
# one is created, so re-running this cell discards previously inserted data.
if client.has_collection(collection_name):
    client.drop_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)
print(f"Collection '{collection_name}' created successfully")


In [None]:
# Build one row per chunk from the notebook's own data: `chunks` holds the
# text under "chunk" and `embeddings` holds the matching vectors by position.
# "sparse_vector" is not supplied; the schema's BM25 function declares it as
# its output field, with "content" as input.
entities = [
    {
        "content": chunk_record["chunk"],
        "dense_vector": embeddings[i],
        "metadata": chunk_record.get("metadata", {}),
    }
    for i, chunk_record in enumerate(chunks)
]

# Insert data
client.insert(collection_name, entities)
print(f"Inserted {len(entities)} documents")


In [None]:
query = "what is hybrid search"

# Embed the query with the same model that encoded the chunks (the notebook
# defines no `get_embeddings` helper). tolist() hands Milvus a plain list of
# floats.
# NOTE(review): the Qwen3-Embedding model card describes a dedicated query
# prompt (prompt_name="query") — consider enabling it after confirming.
query_embedding = model.encode([query])[0].tolist()

# Sparse leg: the raw query text is searched against "sparse_vector" (BM25).
sparse_search_params = {"metric_type": "BM25"}
sparse_request = AnnSearchRequest(
    [query], "sparse_vector", sparse_search_params, limit=5
)

# Dense leg: the query embedding is searched against "dense_vector" (IP).
dense_search_params = {"metric_type": "IP"}
dense_request = AnnSearchRequest(
    [query_embedding], "dense_vector", dense_search_params, limit=5
)

results = client.hybrid_search(
    collection_name,
    [sparse_request, dense_request],
    ranker=RRFRanker(),  # Reciprocal Rank Fusion for combining results
    limit=5,
    output_fields=["content", "metadata"],
)
# One query was sent, so the hits for it are the first element of the results.
hybrid_results = results[0]

print("\nHybrid Search (Combined):")
for i, result in enumerate(hybrid_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )


# Answer generation

In [None]:
# Concatenate the text of the retrieved hits, separated by blank lines, to
# form the context passed to the chat model.
context = "\n\n".join([doc["entity"]["content"] for doc in hybrid_results])

# Prompt: instructions, then the retrieved context, then the user's question.
prompt = f"""Answer the following question based on the provided context. 
If the context doesn't contain relevant information, just say "I don't have enough information to answer this question."

Context:
{context}

Question: {query}

Answer:"""

# NOTE(review): `openai_client` is not defined or imported in any visible
# cell; on a fresh kernel this call raises NameError unless the client is
# created elsewhere. Create it without hard-coding the API key.
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that answers questions based on the provided context.",
        },
        {"role": "user", "content": prompt},
    ],
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)
