In [1]:
from dotenv import load_dotenv
import os

# Run load_dotenv() first so the environment lookups below happen after it.
load_dotenv()

# Each key is None when its environment variable is not set.
gpt_key, lc_key, hf_key = (os.environ.get(var) for var in ("GPT", "LC", "HF"))



# Load documents page by page

In [2]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

def load_documents(data_dir="../data-futbol"):
    """Load the PDFs found in ``data_dir`` using ``PyPDFDirectoryLoader``.

    Args:
        data_dir: Directory handed to ``PyPDFDirectoryLoader``. Defaults to
            ``"../data-futbol"``, the location this notebook originally
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns; the rest of this
        notebook treats it as a list of ``Document`` objects.
    """
    return PyPDFDirectoryLoader(data_dir).load()

In [3]:
# Run the loader once and keep its result; `documents` is the input to the splitting step.
documents=load_documents()

# Split each page further into chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents: list[Document], chunk_size: int = 800, chunk_overlap: int = 80):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split (as produced by ``load_documents``).
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 800, the value this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.
            Defaults to 80, the original hard-coded value.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are matched literally, not as regular expressions.
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [5]:
# Split the loaded documents; `chunks` is what later cells tag with ids and store.
chunks = split_documents(documents)

# EMBEDDINGS

In [6]:
from sentence_transformers import SentenceTransformer

# Wrapper class to make SentenceTransformer compatible
class EmbeddingWrapper:
    """Adapter exposing a ``SentenceTransformer`` via ``embed_documents`` / ``embed_query``.

    Both methods return plain Python lists of floats (never numpy arrays), so
    the two entry points are consistent with each other.
    """

    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2'):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    # The vector store expects this method
    def embed_documents(self, texts):
        """Embed a batch of texts.

        Args:
            texts: Sequence of strings to embed.

        Returns:
            A list with one embedding (a list of floats) per input text.
        """
        embeddings = self.model.encode(texts)
        return embeddings.tolist()  # Ensure embeddings are a list, not an array

    # Method to embed a single query
    def embed_query(self, query):
        """Embed a single query string.

        Args:
            query: The text to embed.

        Returns:
            The embedding as a list of floats.
        """
        # encode() is given a one-element batch; take its only row and convert
        # it to a list so the return type matches embed_documents.
        return self.model.encode([query])[0].tolist()

def get_embedding_function():
    """Return the shared ``EmbeddingWrapper`` instance, creating it on first use.

    Constructing ``EmbeddingWrapper`` loads a SentenceTransformer model, so the
    instance is cached on this function object and reused by later calls
    instead of being rebuilt each time.

    Returns:
        The cached ``EmbeddingWrapper``.
    """
    cached = getattr(get_embedding_function, "_instance", None)
    if cached is None:
        cached = EmbeddingWrapper()
        get_embedding_function._instance = cached
    return cached

# Assign each chunk a unique ID so that the same chunk is never inserted into the vector database twice

In [7]:

def set_chunk_ids(chunks):
    """Tag every chunk with an id of the form ``"<source>:<page>:<index>"``.

    ``index`` counts the chunks seen so far for the same source/page pair,
    starting at 0. The count is kept per page, so ids stay unique even when
    chunks of one page are not adjacent in ``chunks``.

    Args:
        chunks: Iterable of objects with a ``metadata`` dict. The ``"source"``
            and ``"page"`` entries are read with ``.get``, so a missing entry
            shows up as ``None`` in the id.

    Returns:
        None. Each chunk's ``metadata["id"]`` is set in place.
    """
    next_index_by_page = {}
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # First chunk of a page gets index 0; later ones continue the count.
        chunk_index = next_index_by_page.get(page_id, 0)
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"
        next_index_by_page[page_id] = chunk_index + 1
    

In [8]:
# Writes metadata["id"] onto each chunk in place; the function returns nothing.
set_chunk_ids(chunks)

In [9]:
# Directory passed to Chroma as `persist_directory`, both when storing and when querying.
CHROMA_PATH = "chroma_dbs"

In [10]:
from langchain_chroma import Chroma
from langchain.schema.document import Document

def add_to_chroma(chunks: list[Document]):
    """Store in Chroma the chunks whose ids are not already present.

    A chunk is skipped when its ``metadata["id"]`` is already in the
    collection, or when an earlier chunk in ``chunks`` carries the same id.
    In the second case only the first occurrence is kept.

    Args:
        chunks: Documents that each have ``metadata["id"]`` set (see
            ``set_chunk_ids``).

    Raises:
        KeyError: If a chunk has no ``"id"`` entry in its metadata.
    """
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )
    # include=[] requests no extra fields; only the "ids" entry is read below.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])

    print(existing_ids)

    # Identify the new chunks
    new_chunks = []
    new_chunk_ids = []
    batch_ids = set()
    repeated_in_batch = 0
    for chunk in chunks:
        chunk_id = chunk.metadata["id"]
        if chunk_id in existing_ids:
            continue
        if chunk_id in batch_ids:
            # Forwarding the same id twice in one add_documents call is avoided;
            # NOTE(review): Chroma is expected to reject such a batch — confirm
            # for the installed version.
            repeated_in_batch += 1
            continue
        batch_ids.add(chunk_id)
        new_chunks.append(chunk)
        new_chunk_ids.append(chunk_id)

    if repeated_in_batch:
        print(f"⚠️ Skipped chunks with an id repeated in this batch: {repeated_in_batch}")

    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        db.add_documents(new_chunks, ids=new_chunk_ids)
    else:
        print("✅ No new documents to add")



In [11]:
# Store the chunks whose ids are not yet in the Chroma collection.
add_to_chroma(chunks)

set()
👉 Adding new documents: 124


# RESETTING THE DATABASE IF NEEDED (run this cell manually, then restart the Python kernel)

In [164]:
import shutil
import os

# Destructive step: remove the persisted Chroma directory when it is present.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

# Check the path again and report the outcome.
print(
    f"Failed to delete {CHROMA_PATH}."
    if os.path.exists(CHROMA_PATH)
    else f"{CHROMA_PATH} successfully deleted."
)


chroma_dbs successfully deleted.


# Connecting to the model API

In [12]:

from langchain_huggingface import HuggingFaceEndpoint
# Model repository id passed to HuggingFaceEndpoint below.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Extra parameters passed to the endpoint through `model_kwargs`.
# NOTE(review): "max_length" may not be the parameter that limits generated
# length for this endpoint (HuggingFaceEndpoint also exposes `max_new_tokens`)
# — confirm against the installed langchain_huggingface version.
model_kwargs = {
    "max_length": 128
}


# Shared LLM client used by query_rag; authenticates with the HF key read from the environment.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    temperature=0.5,
    model_kwargs=model_kwargs,
    huggingfacehub_api_token=hf_key,
)


In [13]:
from langchain_core.prompts import ChatPromptTemplate

def query_rag(query_text: str, k: int = 5):
    """Answer ``query_text`` using chunks retrieved from the Chroma store.

    The ``k`` chunks most similar to the query are joined into a context
    block, inserted into the prompt template together with the question, and
    the formatted prompt is passed to the module-level ``llm``.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve. Defaults to 5, the value this
            function originally hard-coded.

    Returns:
        None. The model's response is printed.
    """
    PROMPT_TEMPLATE = """
    Answer the question based only on the following context:
    {context}

    ---
    Answer the question based on the above context: {question}
    """
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Each result is a (document, score) pair; only the document text is used.
    results = db.similarity_search_with_score(query_text, k=k)
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Directing the prompt to the model
    response_text = llm.invoke(prompt)
    print(response_text)

In [15]:
# Example question; query_rag prints the model's answer.
query_rag("what is the importance of scotland in football")


    Scotland is important in football because it is where the new 'combinatory', 'positional' football was developed. This was due to the intelligent adaptability of men taking up a game dominated by players who were bigger and stronger, but less mobile. The new football was not necessarily connected to the highly organized technical division of labour with which players from an industrial working-class background on Clydeside would have been familiar. The early Scottish dribblers and 'ball-players' were celebrated as the heroes of this 'new' football.
