In [67]:
import os
from typing import List

from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Index, Pinecone


def get_text_from_pdf(pdf_file: str) -> List[Document]:
    """
    Load a PDF with PyMuPDF and return its contents as LangChain documents.

    Args:
        pdf_file (str): Path to the PDF file to read.

    Returns:
        List[Document]: A new list holding every Document produced by
            ``PyMuPDFLoader(pdf_file).load()``, in the loader's order.
    """
    # Copy the loader's results into a fresh list owned by the caller.
    return list(PyMuPDFLoader(pdf_file).load())

In [99]:
from dotenv import load_dotenv
import os

# Read the .env file into the process environment, then fetch the OpenAI key
# that is stored under the "api_key" variable. Fail fast when it is missing.
load_dotenv()
api_key = os.environ.get("api_key")
if api_key is None:
    raise ValueError("The api_key environment variable is not set.")

In [69]:
# Collect the PDF files found in the current working directory.
# os.listdir() returns entries in arbitrary order, so sort them: chunk IDs are
# assigned by position later on, and a stable file order keeps them reproducible.
pdf_files = sorted(f for f in os.listdir() if f.endswith(".pdf"))
pdf_files

['Action_and_Points_No_Bold.pdf.pdf',
 'How_to_Manage_Your_Squad.pdf.pdf',
 'Project - 1st Delivery.pdf',
 'Solution_Design_Group4.pdf.pdf',
 'Stats_Explained.pdf.pdf']

In [70]:
# Initialize a list to store all extracted documents
# Flatten the documents extracted from every PDF into a single list,
# keeping the order of pdf_files and, within a file, the loader's order.
docs: List[Document] = [
    document
    for pdf_path in pdf_files
    for document in get_text_from_pdf(pdf_path)
]

In [71]:
# Splitter that cuts documents on newlines into overlapping chunks.
# `separators` expects a list of strings; a bare "\n" string only works by
# accident (iterating a one-character string yields that same character).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],  # Split on newlines
    chunk_size=1000,  # Maximum size of each chunk
    chunk_overlap=250,  # Overlap between chunks to preserve context
    add_start_index=True,  # Include the starting index of each chunk
)

In [72]:
# Break every loaded document into chunks using the splitter configured above.
all_splits = text_splitter.split_documents(documents=docs)

In [75]:
# Smoke-test the embedding model on a handful of short sentences.
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)

sample_sentences = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = embeddings_model.embed_documents(sample_sentences)

# (number of vectors returned, length of the first vector)
len(embeddings), len(embeddings[0])

(5, 1536)

In [None]:
import os
import pinecone
import openai
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

In [90]:
import os

# Presumably selects a serverless Pinecone deployment.
# NOTE(review): no visible cell reads this flag — confirm it is still needed.
use_serverless = True

In [91]:
from pinecone import Pinecone

# Keep the Pinecone key under its own name: storing it in `api_key` would
# overwrite the OpenAI key that the OpenAI clients in this notebook read from
# that variable.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if pinecone_api_key is None:
    # Fail fast with a clear message, mirroring the OpenAI key check.
    raise ValueError("The PINECONE_API_KEY environment variable is not set.")

# Client used by the following cells to manage and connect to indexes.
pc = Pinecone(api_key=pinecone_api_key)

In [92]:
from pinecone import ServerlessSpec

# Serverless deployment target used when the index is created: AWS, us-east-1.
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",
)

In [93]:
# Name of the Pinecone index that the following cells delete, recreate and connect to.
index_name = "capstone-project"

In [94]:
# Start from a clean slate: drop the index when one with this name already exists.
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

In [95]:
import time

# Vector size of the index; it must match the embedding length
# (text-embedding-3-small returned 1536-long vectors in the smoke test above).
dimensions = 1536
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric="cosine",
    spec=spec,
)

# Poll until the index reports ready, but give up after a bounded wait instead
# of spinning forever if provisioning never completes.
ready_timeout_seconds = 300
ready_deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{ready_timeout_seconds} seconds."
        )
    time.sleep(1)

In [96]:
# Handle to the freshly created index, used to build the vector store.
index = pc.Index(name=index_name)

In [None]:
from dotenv import load_dotenv
import os

# Make sure `api_key` holds the OpenAI key (the "api_key" variable from .env)
# before the OpenAI-backed objects below are created.
load_dotenv()
api_key = os.environ.get("api_key")
if api_key is None:
    raise ValueError("The api_key environment variable is not set.")

In [101]:
# Initialize a Pinecone vector store with OpenAI embeddings
# Wrap the Pinecone index in a LangChain vector store backed by OpenAI embeddings.
store_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
vector_store = PineconeVectorStore(index=index, embedding=store_embeddings)

In [102]:
# Generate unique IDs for each chunk
# Use each chunk's position in all_splits, rendered as a string, as its ID.
ids = list(map(str, range(len(all_splits))))

# Embed the chunks and store them in the vector store under those IDs.
vector_store.add_documents(documents=all_splits, ids=ids)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38']

In [103]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from operator import itemgetter


def format_docs(documents) -> str:
    """Join the ``page_content`` of each document into one string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        str: The page contents in input order, separated by a blank line
            (``"\\n\\n"``); an empty string when ``documents`` is empty.
    """
    page_texts = [document.page_content for document in documents]
    return "\n\n".join(page_texts)

In [104]:
# Retriever limited to k=2 results with a score threshold of 0.5.
retriever_search_kwargs = {"k": 2, "score_threshold": 0.5}
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_kwargs,
)

In [107]:
# Query the retriever on its own to inspect which chunks it returns.
retrieval_query = "How many poits by scoring a goal?"
response = retriever.invoke(retrieval_query)

In [108]:
# Show the top retrieved chunk. The retriever filters by score threshold, so it
# can return an empty list; guard against that instead of raising IndexError.
if response:
    print(response[0].page_content)
else:
    print("No documents met the similarity score threshold.")


Action and Points
For playing up to 60 minutes: 1 point
For playing 60 minutes or more: 2 points
For each goal scored by goalkeeper: 10 points
For each goal scored by defender: 6 points
For each goal scored by midfielder: 5 points
For each goal scored by forward: 4 points
For each assist for a goal: 3 points
For a clean sheet by a goalkeeper or defender: 4 points
For a clean sheet by a midfielder: 1 point
For every 3 shots saved by goalkeeper: 1 point
For each penalty save: 5 points
For each penalty miss: -2 points
Bonus points for man of the match: 3 points
For every 2 goals conceded by goalkeeper or defender: -1 point
For each yellow card: -1 point
For each red card: -3 points
For each own goal: -2 points


In [110]:
# Chat model that writes the final answer.
llm = ChatOpenAI(api_key=api_key, model="gpt-4o-mini")


# Prompt with two inputs: the retrieved {context} and the user's {question}.
# It deliberately does not mention conversation history: the template has no
# history variable, so claiming the model has access to one would be false.
template = """Use the following pieces of context to answer the question at the end.
            If you don't know the answer, just say that you don't know, don't try to make up an answer.
            Use three sentences maximum and keep the answer as concise as possible.

{context}

Question: {question}
"""

custom_rag_prompt = PromptTemplate.from_template(template)

In [111]:
# Inputs for the prompt: the question is routed through the retriever and
# format_docs to build "context", and passed through unchanged as "question".
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Prompt -> chat model -> plain-string answer.
rag_chain = rag_inputs | custom_rag_prompt | llm | StrOutputParser()

In [112]:
# Run the full RAG chain on the same question used for the retriever check.
rag_question = "How many poits by scoring a goal?"
response = rag_chain.invoke(rag_question)

In [113]:
# Display the chain's answer (a plain string produced by StrOutputParser).
print(response)


The points awarded for scoring a goal depend on the player's position: 10 points for a goalkeeper, 6 points for a defender, 5 points for a midfielder, and 4 points for a forward.
