In [None]:
# Clear every user-defined name from the interactive namespace so the notebook
# starts from a clean slate (-f skips the confirmation prompt).
%reset -f

In [None]:
'''
Ensure you have the following packages installed. If not, install them using pip. Two things to note:
- 'DocArrayInMemorySearch' is a vector store that keeps your documents in memory on your local machine; use it if you don't have a Pinecone account.
- 'PyPDFLoader' is a document loader for PDF files. If you don't have any PDF files, you can use TextLoader to load text files instead.
'''

import os
import sys
import time
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
from langchain_community.llms import Ollama
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.embeddings import Embeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.document_loaders.base import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from operator import itemgetter
import torch

In [None]:
'''
This is a nifty little helper function I wrote to format the text. In a 
future iteration, I'll save the output directly to a text file. 
'''

def format_text(input_text, n=100):
    """Insert line breaks into ``input_text`` so it reads well on a console.

    A line break is placed after every colon, and every resulting line is then
    wrapped greedily at spaces so that it is at most ``n`` characters long.
    A single word longer than ``n`` is never split; it is placed on a line of
    its own.

    Args:
        input_text: The text to format.
        n: Maximum line length in characters (default 100).

    Returns:
        The formatted text, with lines separated by newline characters.
    """
    # First pass: break after each colon. One space directly following the
    # colon is dropped so the next line does not begin with a blank.
    broken_text = input_text.replace(': ', ':').replace(':', ':\n')

    # Second pass: wrap each existing line on its own, so newlines that are
    # already in the text (including the ones just added) reset the length
    # count exactly instead of being approximated.
    wrapped_lines = []
    for line in broken_text.split('\n'):
        words = line.split(' ')
        current_line = words[0]  # the first word always starts the line
        for word in words[1:]:
            # The "+ 1" is the space that would separate the two words.
            if len(current_line) + 1 + len(word) > n:
                wrapped_lines.append(current_line)
                current_line = word
            else:
                current_line += ' ' + word
        wrapped_lines.append(current_line)

    return '\n'.join(wrapped_lines)

In [None]:
def get_embeddings(text):
    """Embed ``text`` by mean-pooling the transformer's last hidden state.

    Relies on the module-level ``tokenizer`` and ``transformer_model`` objects,
    which must be defined before this function is called.

    Padding positions are excluded from the average by weighting each token
    with the tokenizer's attention mask. For a single string (no padding) this
    is the same as a plain mean over the sequence; for a list of strings it
    keeps shorter entries from being diluted by pad tokens.

    Args:
        text: A string (or list of strings) to embed. Input longer than the
            tokenizer's limit is truncated.

    Returns:
        A NumPy array holding the pooled embedding(s), with size-1 dimensions
        squeezed out.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = transformer_model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Shape the mask so it broadcasts over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().cpu().numpy()

def semantic_chunking(text, threshold=0.75):
    """Split ``text`` into chunks and merge neighbours that are semantically close.

    The text is first cut with a ``RecursiveCharacterTextSplitter``
    (chunk_size=512, chunk_overlap=50). Walking the pieces in order, each piece
    is compared (cosine similarity of ``get_embeddings`` vectors) with the
    chunk currently being built: above ``threshold`` it is appended to that
    chunk, which is then re-embedded; otherwise the current chunk is closed and
    the piece starts a new one.

    Args:
        text: The raw text to chunk.
        threshold: Similarity a piece must exceed to be merged (default 0.75).

    Returns:
        A list of merged chunk strings; empty if the splitter yields nothing.
    """
    pieces = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50).split_text(text)
    if not pieces:
        print("No chunks generated for the text")
        return []

    # Embed every initial piece once up front.
    piece_vectors = [get_embeddings(piece) for piece in pieces]

    result = []
    running_text, running_vector = pieces[0], piece_vectors[0]

    for piece, piece_vector in zip(pieces[1:], piece_vectors[1:]):
        score = torch.cosine_similarity(
            torch.tensor(running_vector), torch.tensor(piece_vector), dim=0
        ).item()

        if not (score > threshold):
            # Not similar enough: close the running chunk and start a new one.
            result.append(running_text)
            running_text, running_vector = piece, piece_vector
            continue

        # Similar enough: grow the running chunk and refresh its embedding.
        running_text = running_text + " " + piece
        running_vector = get_embeddings(running_text)

    # The chunk still being built at the end of the loop is always emitted.
    result.append(running_text)
    return result

In [None]:
def batch_process_documents(documents, batch_size=10):
    """Split ``documents`` into consecutive batches.

    Args:
        documents: A sliceable sequence of items.
        batch_size: Maximum number of items per batch (default 10).

    Returns:
        A list of slices of ``documents``; every batch holds ``batch_size``
        items except possibly the last, which holds the remainder.
    """
    batch_starts = range(0, len(documents), batch_size)
    return [documents[start:start + batch_size] for start in batch_starts]

In [None]:
'''
To build a .env file, you can start with a normal text file and rename it to .env. Make sure it's either 
in your working directory or you specify the path to the file. Load your OpenAI and Pinecone API keys into
your .env file and call it here. I've seen tutorials and documentation that leave "load_dotenv()" empty, but
for whatever reason, that did not work for me, so using Python, I specified the path to the .env file in 
between the quotation marks.
'''

print("Loading environment variables...")
# Location of the .env file. Set the RAG_ENV_FILE environment variable to use a
# different file without editing the notebook; otherwise the original path is used.
ENV_FILE = os.getenv(
    "RAG_ENV_FILE",
    r"C:/Users/jspri/Desktop/AI_ML_Folder/Python_Practice_Folder/Natural_Language_Processing/EDQP_RAG_Model/env_variables.env",
)
# load_dotenv returns False when nothing was loaded; say so now rather than
# failing later with an unexplained authentication error.
if not load_dotenv(ENV_FILE):
    print(f"Warning: no variables were loaded from '{ENV_FILE}'. Check that the file exists and is not empty.")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Only warn: which keys are actually required depends on the model and storage chosen later.
missing_keys = [
    key_name
    for key_name, key_value in (("OPENAI_API_KEY", OPENAI_API_KEY), ("PINECONE_API_KEY", PINECONE_API_KEY))
    if not key_value
]
if missing_keys:
    print(f"Warning: the following API keys are not set: {', '.join(missing_keys)}")

In [None]:
'''
Select your model and load the embeddings. You can choose from the following models:
- GPT-4o
- Llama3 70b
'''

# Choose exactly one model identifier; the alternative stays commented out.
MODEL = "llama3:70b-instruct"
#MODEL = "gpt-4o"

# Instantiate the language model and its matching embedding model.
print(f"Loading model and embeddings for {MODEL}")
if not MODEL.startswith("gpt"):
    # Any identifier that does not start with "gpt" is routed to Ollama.
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL, temperature=0.2, top_k=5, top_p=0.7)
else:
    # Identifiers starting with "gpt" use the OpenAI chat and embedding clients.
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
'''
Load your text, either from a text file or a PDF file. If you're using a PDF file, you can
use the PyPDFLoader. Chunk size is the maximum number of characters in each segment your text
is split into. Chunk overlap is the number of characters shared between consecutive segments.
Overlap is useful for ensuring that context is not lost at segment boundaries.
'''

# Load the transformer model and tokenizer that get_embeddings() uses for semantic chunking
transformer_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
transformer_model = AutoModel.from_pretrained(transformer_model_name)

print("Loading text documents...")
loader = PyPDFLoader(r"C:\Users\jspri\Desktop\AI_ML_Folder\Python_Practice_Folder\Natural_Language_Processing\Source_Documents\MANMED_CH_15.pdf")
text_documents = loader.load()

print("Performing semantic chunking on text documents...")
documents = []
for i, text_document in enumerate(text_documents):
    print(f"Chunking document {i + 1} of {len(text_documents)}...")
    # Extract the text content from the Document object
    semantic_chunks = semantic_chunking(text_document.page_content)
    if not semantic_chunks:
        print(f"No chunks generated for document {i + 1}")
        continue
    # Convert chunks back to Document objects. Each chunk gets its own copy of
    # the loaded document's metadata so its provenance is kept alongside the
    # text rather than discarded.
    for chunk in semantic_chunks:
        documents.append(
            Document(page_content=chunk, metadata=dict(text_document.metadata))
        )

print(f"Number of semantic chunks created: {len(documents)}")

In [None]:
'''
Load your Pinecone API key and specify your index name. The index name, which is 
either added on the website or in the code below, should be unique to your project.
'''

print("Setting up Pinecone...")
document_name = 'navmed-p117'
pc = Pinecone(api_key=PINECONE_API_KEY)

# Derive the index-name suffix from MODEL instead of hard-coding it: lowercase
# the identifier and replace every character that is not an ASCII letter or
# digit with '-', since Pinecone index names only allow lowercase
# alphanumerics and '-'.
# "llama3:70b-instruct" -> "llama3-70b-instruct"; "gpt-4o" -> "gpt-4o".
model_name = ''.join(
    ch if (ch.isascii() and ch.isalnum()) else '-' for ch in MODEL.lower()
).strip('-')

index_name = f'{document_name}-{model_name}'
print(f"Pinecone index: '{index_name}'")

In [None]:
'''
Pinecone is in the process of shifting to a fully serverless model. Since I'm on the east coast, 
I always default to AWS us-east-1. If you're on the west coast, you can change the region to 
the west region. The dimension is the number of dimensions in the vector space. The metric is
the distance metric used to calculate the similarity between vectors. 

Let's talk about the dimension for a minute. While I'm not going to get into an in-depth discussion
about it, all we're talking about is the vector's dimensionality. The dimensionality of OpenAI's
embeddings depends on which embedding model you choose. I used "text-embedding-ada-002", which has a
dimensionality of 1536. The open-source models noted previously all have a dimensionality of 4096,
but this depends on the model, so verify it for the one you select. A good thing about Pinecone is
that it will tell you if your dimensionality is incorrect. For example, when I used Llama2 for the
first time, I entered 5125 for the dimensionality, and Pinecone told me that it did not equal 4096,
which was a pretty clear indicator that the proper dimensionality was 4096.
'''

spec = ServerlessSpec(cloud='aws', region='us-east-1')

# Measure the vector size from the embedding model itself rather than guessing
# it from the model name. This is done before anything is deleted, so a failure
# of the embedding backend does not cost us the existing index.
embedding_dimension = len(embeddings.embed_query("dimension probe"))
print(f"Embedding dimension: {embedding_dimension}")

if index_name in pc.list_indexes().names():
    print(f"Deleting existing index: {index_name}")
    pc.delete_index(index_name)
print(f"Creating new index: {index_name}")
pc.create_index(index_name,
                dimension=embedding_dimension,
                metric='cosine',
                spec=spec)

# Poll until the index reports ready, but give up after a bounded wait so the
# cell cannot hang forever.
ready_timeout_seconds = 300
ready_deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {ready_timeout_seconds} seconds"
        )
    time.sleep(1)

In [None]:
'''
I included this as a check to ensure that the index was properly created
and that I wasn't loading duplicate documents into the index.
'''

# Get a client handle for the index named by index_name.
index = pc.Index(index_name)
print(f"Index '{index_name}' ready for use")
# Last expression in the cell, so the notebook displays the returned statistics.
index.describe_index_stats()

In [None]:
'''
parser: This is a string output parser, which is used to parse the output from the 
model as a normal text string instead of something far less readable.

template: This is the template that will be used to generate the prompt. The {context}
placeholder is filled in with the retrieved documents and the {question} placeholder with
the actual question. Note that you don't need to supply the context yourself, as it is
inserted into the template for you. One useful aspect of the template is that you can
modify it to fit your needs.

prompt: This passes the template into the prompt chain. 
'''

# Prompt text sent to the model; the {context} and {question} placeholders are
# filled in when the prompt is formatted.
template = """
Based on the context provided, answer the question 
with a detailed explanation. If the question is unclear or 
lacks sufficient context to provide an informed answer, 
respond with "I don't know" or ask for clarification. Spell
out all acronyms. Provide references for major points 
based on the context provided. 

Context: {context}

Question: {question}

Only use information from the designated documentation.
Ensure your answer is thorough and detailed, offering 
insights and explanations to support your conclusions.
"""

# Wrap the raw template text in a PromptTemplate.
prompt = PromptTemplate.from_template(template)

# String output parser used to turn the model's response into plain text.
parser = StrOutputParser()

In [None]:
'''
Upload the documents to the Pinecone index. 

Set the 'data_storage' variable corresponding to the following:
- 0: Upload documents to Pinecone
- 1: Use an existing Pinecone index
- 2: Use DocArrayInMemorySearch to store documents on your local machine
'''

# Print the active index and prompt the user for input
print(f"Active Index: {index_name}")

# Prompt the user to confirm the index name
user_input = input("Enter 1 to continue using this index name or 2 to exit: ")

# Check the user input
if user_input == "2":
    print("Exiting as requested by the user.")
    sys.exit(0)
elif user_input != "1":
    print("Invalid input. Exiting.")
    sys.exit(1)

data_storage = 0
if data_storage == 0:
    print(f"Uploading documents to Pinecone index {index_name}")
    datastore = PineconeVectorStore.from_documents(documents, 
                                                embedding=embeddings, 
                                                index_name=index_name)
    print("Finished uploading documents to Pinecone index.")

elif data_storage == 1:
    print(f"Using existing Pinecone index {index_name} ")
    datastore = PineconeVectorStore.from_existing_index(index_name, 
                                                   embeddings)
    print("Finished pulling documents from Pinecone index.")

elif data_storage == 2:
    print("Storing documents locally")
    datastore = DocArrayInMemorySearch.from_documents(documents, embeddings)
    print("Finished uploading documents to local storage.")

In [None]:
'''
Chain together the context, question, prompt, model, and parser. If you don't use a variable
in conjunction with chain.invoke, the output will be printed to the console. 
'''

retriever = datastore.as_retriever()


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt receives the Python repr of a list of
    Document objects (class names, escaped newlines) instead of readable text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming dict's "question" feeds both branches: one is routed through the
# retriever and flattened to text for {context}, the other is passed through
# unchanged for {question}.
chain = (
    {
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

questions = ["Please give a detailed summary of physical qualifications for submarine duty.",
             "Please give a detailed summary of physical qualifications for special warfare duty.",
             "Please give a detailed summary of the requirements for active duty separation and retirement.",
]

for question in questions:
    print(f"Question: {question}")
    answer = chain.invoke({'question': question})
    print(f"Answer: {format_text(answer, 100)}")
    print()