In [None]:
'''
Ensure you have the following installed. If not, install them using pip. Two things to note: 
- 'DocArrayInMemorySearch' is a method to use your local machine to store your documents if you don't have a Pinecone account.
- PyPDFLoader is a method to load PDF files. If you don't have any PDF files, you can use TextLoader to load text files.
'''

import os
import time
from dotenv import load_dotenv
from langchain_community.llms import Ollama
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

In [None]:
'''
To build a .env file, you can start with a normal text file and rename it to .env. Make sure it's either 
in your working directory or you specify the path to the file. Load your OpenAI and Pinecone API keys into
your .env file and call it here. I've seen tutorials and documentation that leave "load_dotenv()" empty, but
for whatever reason, that did not work for me, so using Python, I specified the path to the .env file in 
between the quotation marks.
'''

# Path to your .env file goes between the quotes (left blank in this template).
load_dotenv(r"")
# The name of the environment variable holding each key goes between the quotes
# (also left blank here). os.getenv returns None when the named variable is not set.
OPENAI_API_KEY = os.getenv("")
PINECONE_API_KEY = os.getenv("")

In [None]:
'''
Select your model and load the embeddings. You can choose from the following models:
- GPT-3.5-turbo
- Gemma 2B
- Mixtral 8x7B
- Llama2 7b
'''

# Active model. Swap in one of the commented-out Ollama tags to run locally instead.
MODEL = "gpt-3.5-turbo"
# MODEL = "gemma"
# MODEL = "mixtral"
# MODEL = "llama2:7b"

# Build the chat model and its matching embedding model as a pair: names that
# start with "gpt" use the OpenAI classes, everything else is handed to Ollama.
if not MODEL.startswith("gpt"):
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)
else:
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)

In [None]:
'''
Load your text, either in a text file or a PDF file. If you're using a PDF file, you can 
use the PyPDFLoader. Chunk size is the maximum number of characters in each segment your text 
will be split into. Chunk overlap is the number of characters shared between consecutive segments. 
Overlap is useful for ensuring that context is not lost at the boundary between two segments.
'''

# Read the source file into LangChain documents (path left blank in this template).
loader_txt = TextLoader(r"")
text_documents = loader_txt.load()

# Cut the loaded documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
documents = text_splitter.split_documents(text_documents)
print(f"Number of chunks: {len(documents)}")

In [None]:
'''
parser: This is a string output parser, which is used to parse the output from the 
model as a normal text string instead of something far less readable.

template: This is the template that will be used to generate the prompt. The {context} 
and {question} placeholders are filled in at run time with the retrieved text and your 
question. Note that you don't need to give the model any context yourself, as it is 
supplied through the template. You can also modify the template to fit your own needs.

prompt: This passes the template into the prompt chain. 
'''

# Prompt skeleton: {context} and {question} are substituted each time the chain runs.
template = """
Based on the context provided, answer the question 
with a detailed explanation. If the question is unclear or 
lacks sufficient context to provide an informed answer, 
respond with "I don't know" or ask for clarification.

Context: {context}

Question: {question}

Please ensure your answer is thorough and detailed, offering 
insights and explanations to support your conclusions.
"""
prompt = PromptTemplate.from_template(template)

# Converts the model's reply into a plain string at the end of the chain.
parser = StrOutputParser()

In [None]:
'''
Load your Pinecone API key and specify your index name. The index name, which is 
either added on the website or in the code below, should be unique to your project.
'''

# Pinecone client, authenticated with the key read from the environment earlier.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Name of the Pinecone index to work with (left blank in this template).
index_name = ''

In [None]:
'''
Pinecone is in the process of shifting to a fully serverless model. Since I'm on the east coast, 
I always default to AWS us-east-1. If you're on the west coast, you can change the region to 
the west region. The dimension is the number of dimensions in the vector space. The metric is
the distance metric used to calculate the similarity between vectors. 

Let's talk about the dimension for a minute. While I'm not going to get into an in-depth discussion
about it, all we're talking about is the vector's dimensionality. GPT's embeddings fluctuate based
on which one you choose. I used "text-embedding-ada-002", which has a dimensionality of 1536. The
open-source models noted previously all have dimensionality of 4096. A good thing about Pinecone is
that it will tell you if your dimensionality is incorrect. For example, when I used Llama2 for the 
first time, I entered 5125 for the dimensionality, and Pinecone told me that it did not equal 4096,
which was a pretty clear indicator that the proper dimensionality was 4096.
'''

# Serverless index hosted on AWS us-east-1; change cloud/region to suit your account.
spec = ServerlessSpec(cloud='aws', region='us-east-1')

# The index dimension must equal the length of the vectors produced by the selected
# embedding model. Measure it with a one-off probe instead of hard-coding 4096, which
# does not match the 1536-dimensional OpenAI embeddings used by the default MODEL.
embedding_dimension = len(embeddings.embed_query("dimension probe"))

# WARNING: destructive. An existing index with this name is deleted, together with
# everything stored in it, before a fresh one is created.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
pc.create_index(index_name,
                dimension=embedding_dimension,
                metric='cosine',
                spec=spec)

# Poll until the new index reports ready, but give up after a bounded wait instead
# of looping forever if provisioning never completes.
ready_deadline = time.monotonic() + 300
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(f"Pinecone index {index_name!r} was not ready after 300 seconds")
    time.sleep(1)

In [None]:
'''
I included this as a check to ensure that the index was properly created
and that I wasn't loading duplicate documents into the index.
'''

# Handle to the index named in index_name.
index = pc.Index(index_name)
# Last expression in the cell, so the notebook displays the returned statistics.
index.describe_index_stats()

In [None]:
'''
Upload the documents to the Pinecone index. 

### Only run this once per document set because it will duplicate the documents 
in the index if run multiple times without loading a new set of documents. ###
'''
# Embed the chunks and upload them to the index. index_name has to be passed by
# keyword: from_documents takes only the documents and the embedding model
# positionally and forwards every other option as a keyword argument, so a third
# positional argument raises TypeError.
pinecone = PineconeVectorStore.from_documents(documents,
                                              embeddings,
                                              index_name=index_name)

In [None]:
'''
### Use this block to run the model on the Pinecone index without loading new documents. ###

In a later update, I'll probably incorporate a conditional statement that combines the
two PineconeVectorStore blocks into one. This will allow you to either load new documents
or run the model on the existing documents in the index.
'''

# Attach to the index as it already stands, without loading new documents.
pinecone = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
'''
Chain together the context, question, prompt, model, and parser. If you don't use a variable
in conjunction with chain.invoke, the output will be printed to the console. 
'''

# The incoming question is sent to the retriever to fetch the context and is also
# passed through unchanged as the question; both feed the prompt, whose output goes
# to the model and then to the string parser.
chain = (
    RunnableParallel(context=pinecone.as_retriever(), question=RunnablePassthrough())
    | prompt
    | model
    | parser
)
# The question to ask goes between the quotes (left blank in this template).
stored_text = chain.invoke("")

In [None]:
'''
This is a nifty little helper function I wrote to format the text. In a 
future iteration, I'll save the output directly to a text file. 
'''

def format_text(input_text, n=100):
    """Break text after every colon and wrap each line at word boundaries.

    A newline is inserted after each ':' in the input, existing newlines are
    kept, and every resulting line is then wrapped so that it is at most ``n``
    characters long. Words are never split, so a single word longer than ``n``
    occupies a line of its own and is the only case in which a line can exceed
    ``n``. Runs of whitespace inside a line are collapsed to single spaces and
    whitespace at the start or end of a line is dropped.

    Args:
        input_text: The text to format.
        n: Maximum line length in characters, counting the spaces between words.

    Returns:
        The formatted text, with lines joined by '\n'. Blank lines in the input
        are preserved, and empty input yields an empty string.
    """
    wrapped_lines = []

    # Handle one logical line at a time so the length counter restarts after
    # every newline, whether it was already present or added after a colon.
    for raw_line in input_text.replace(':', ':\n').split('\n'):
        current_line = ''
        for word in raw_line.split():
            if not current_line:
                # First word on a line is always placed, even if longer than n;
                # this avoids emitting an empty line in front of an over-long word.
                current_line = word
            elif len(current_line) + 1 + len(word) > n:
                # The +1 is the joining space, so the finished line never exceeds n.
                wrapped_lines.append(current_line)
                current_line = word
            else:
                current_line += ' ' + word
        # Appended even when empty so blank lines from the input survive.
        wrapped_lines.append(current_line)

    return '\n'.join(wrapped_lines)

# Wrap the model's answer at 100 characters per line, then show it.
formatted_text = format_text(stored_text, n=100)
print(formatted_text)