### Before you start
- Create the virtual environment by running this command:
 python -m venv .venv

In [10]:
# Libs to install
!pip install langchain
!pip install python-dotenv
!pip install openai
!pip install pypdf
!pip install bs4
!pip install chromadb -q
!pip install unstructured[local-inference] -q
!pip install python-magic-bin
!pip install selenium
!pip install pydantic-settings
!pip install chromadb
!pip install tiktoken


Collecting tiktoken


[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading tiktoken-0.5.1-cp311-cp311-win_amd64.whl (759 kB)
     -------------------------------------- 759.8/759.8 kB 2.1 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1


### Libraries & GPT Settings

In [28]:
# Libraries
import os
import openai
import datetime
import magic

from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import SeleniumURLLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI


In [36]:
# GPT API settings
# Read the local .env file first so that a key stored there is already in the
# environment when it is looked up below.
# NOTE: load_dotenv() keeps variables that are already set in the process
# environment; pass override=True if the value from .env should win instead.
_ = load_dotenv(find_dotenv()) # read local .env file

# Stop here with a clear message when no key is available at all.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set: export it or add it to a local .env file"
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

# Pick the model by date: the "-0301" snapshot is used up to and including
# target_date, the unversioned model name afterwards.
current_date = datetime.datetime.now().date()
target_date = datetime.date(2024, 6, 12)

if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

## Functions

In [31]:
def chatWithGPT(prompt, model=llm_model):
    """
    chatWithGPT sends one user message to the ChatGPT API and returns the reply text
        :prompt: the user prompt, sent as the only message of the conversation
        :model: (optional) name of the GPT model to query, llm_model by default
        :return: the message content of the first choice in the API response
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

def getAllData(data_dirpath):
    """
    getAllData loads the PDF files of a directory and the web pages listed in its webURLs.txt
        :data_dirpath: Directory path holding the PDF files and a "webURLs.txt" file with one URL per line
        :return: returns a list with the documents from the PDF loader followed by those from the web page loader
    """
    # Read PDFs (every file matching the **/*.pdf glob under data_dirpath)
    pdf_loader = DirectoryLoader(data_dirpath, glob="**/*.pdf")

    # Read web URLs in .txt: strip the trailing newline of every line and skip
    # blank lines so that only usable URLs reach the loader.
    with open(os.path.join(data_dirpath, "webURLs.txt")) as f:
        urls = [line.strip() for line in f if line.strip()]

    webpages_loader = SeleniumURLLoader(urls=urls)

    documents = []
    for loader in (pdf_loader, webpages_loader):
        documents.extend(loader.load())

    return documents
    
def getChunkText(documents):
    """
    getChunkText splits the loaded documents into smaller chunks
        :documents: documents to split
        :return: the chunks returned by the text splitter
    """
    # Splitter settings: break on "." and measure chunk length with len().
    splitter_options = {
        "separator": ".",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_documents(documents)

## Main

### Loading data

In [20]:

# Load the PDF files found under "data" and the web pages listed in data/webURLs.txt.
documents = getAllData("data")


#### `documents` in the output

In [16]:
# Print the whole list of loaded documents for inspection.
print(documents)



### Splitting

In [21]:
# Split the loaded documents into chunks with getChunkText.
chunks = getChunkText(documents)

Created a chunk of size 1170, which is longer than the specified 1000
Created a chunk of size 1139, which is longer than the specified 1000
Created a chunk of size 1827, which is longer than the specified 1000
Created a chunk of size 2154, which is longer than the specified 1000
Created a chunk of size 1024, which is longer than the specified 1000
Created a chunk of size 2258, which is longer than the specified 1000
Created a chunk of size 1649, which is longer than the specified 1000


#### `chunks` in the output

In [18]:
# Print the first chunk (its page content and metadata) as a quick sanity check.
print(chunks[0])

page_content='THE COMPLETE FITNESS\n\nHANDBOOK\n\nTable of Contents\n\nIntroduction……………………………………………………………………… 3\n\n1\n\nPhysical Fitness…………………………………………………………….……. 4\n\nPrincipals………………………………………………………………… 4\n\nFitness Assessment……………….……………………………………. 5\n\nFitness Questionnaire …….……………………………………….………6\n\nDiagnostic Test Scorecard…………………………………………….12\n\nFlexibility……………………………………………………………….13\n\nProgram Description.……………………………………………………14\n\nThe Road to Fitness…………………………………………….15\n\nBuilding your Strength Program/Tracking……………………….17\n\nGet Stronger in 4 Weeks………………………………………….19\n\nPrepare for Airborne School………………………………………21\n\nPrepare for NALC & Air Assault School………………….…….23\n\nCardiovascular Endurance……………………………………………….52\n\nTraining Heart Rates………….………………………………….52\n\nInterval Work Outs…………………………………………….…55\n\nPace Chart…………………………………………………….…61\n\n2\n\nIntroduction\n\nCadet Command would like to thank the Dr. Todd A' metadata={'source': 'data\\fitness-handbook.pdf'}


### Embedding

In [22]:
# Embed every chunk with the OpenAI embeddings and store the result in Chroma.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
)

### Initialize LangChain - Conversational Retrieval Chain

In [33]:
# Start with an empty conversation and build the chain on top of the vector store retriever.
chat_history = []
chat_with_docs = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
)

In [37]:
# Ask one question, passing the conversation history along, and show the answer.
question = "Hi, I would like to do some exercise"
response = chat_with_docs(dict(question=question, chat_history=chat_history))
print(response["answer"])

AuthenticationError: Incorrect API key provided: sk-tdmoK***************************************4z65. You can find your API key at https://platform.openai.com/account/api-keys.