### Before you start
- (Optional) Create a virtual environment by running this command:
 python -m venv .venv
- Don't forget to add your OpenAI API key (`OPENAI_API_KEY`) to the .env file

In [None]:
# Libs to install
!pip install langchain
!pip install python-dotenv
!pip install openai
!pip install pypdf
!pip install bs4
!pip install unstructured[local-inference] -q
!pip install selenium
!pip install pydantic-settings
!pip install chromadb
!pip install tiktoken
!pip install fastapi nest-asyncio pyngrok uvicorn
!pip install rdflib SPARQLWrapper

### Libraries & GPT Settings

In [1]:
# Libraries
import os
import openai
import datetime
import uvicorn
import nest_asyncio

from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import SeleniumURLLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pyngrok import ngrok
from SPARQLWrapper import SPARQLWrapper, JSON


In [3]:
# GPT API settings
# Read the local .env file FIRST: a key defined only there is not visible to
# os.getenv()/os.environ until load_dotenv() has run.
_ = load_dotenv(find_dotenv())

# Fail early with an explicit message instead of a TypeError/KeyError later on.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this cell."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

current_date = datetime.datetime.now().date()

target_date = datetime.date(2024, 6, 12)

# The model name depends on whether today is past target_date.
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

## Functions

In [4]:
def chatWithGPT(prompt, model=llm_model):
    """
    Send one user message to the OpenAI chat completion API and return the reply text.
        :prompt: text sent as the only "user" message of the request
        :model: (optional) name of the GPT model, llm_model by default
        :return: the "content" of the first choice in the API response
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

def getAllData(data_dirpath):
    """
    This function loads all the data located in the given directory: every file
    matching "**/*.pdf" plus the web pages whose URLs are listed, one per line,
    in the "webURLs.txt" file of that directory.
        :data_dirpath: Directory path of the folder that contains all files to load
        :return: list with the documents returned by both loaders
    """
    # Read PDFs
    pdf_loader = DirectoryLoader(data_dirpath, glob="**/*.pdf")

    # Read web URLs in .txt. Strip the trailing newline of every line and skip
    # blank lines so that only clean, non-empty URLs reach the browser loader.
    urls_path = os.path.join(data_dirpath, "webURLs.txt")
    with open(urls_path) as f:
        urls = [line.strip() for line in f if line.strip()]

    webpages_loader = SeleniumURLLoader(urls=urls)

    documents = []
    for loader in (pdf_loader, webpages_loader):
        documents.extend(loader.load())

    return documents
    
def getChunkText(documents):
    """
    Break the pages into chunks with a CharacterTextSplitter that separates on ".",
    using a chunk size of 1000 and an overlap of 200, both measured with len().
        :documents: pages data
        :return: chunks of the data
    """
    splitter_options = {
        "separator": ".",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_documents(documents)

def chatWithDocs(question, chat_history):
    """
    Ask the conversational retrieval chain (chat_with_docs) a question and return its answer.
        :question: user prompt
        :chat_history: previous conversation, forwarded to the chain as "chat_history"
        :return: the "answer" entry of the chain response
    """
    # NOTE(review): this function does not add the new exchange to chat_history;
    # confirm whether the caller is expected to do it.
    chain_inputs = {"question": question, "chat_history": chat_history}
    return chat_with_docs(chain_inputs)["answer"]

def connectWithAnzograph(chat_history):
    """
    This function connects to AnzoGraph and inserts the chat's data in the knowledge graph.
    The insert query is generated by a ChatGPT prompt (to get the triplets).

    Connection settings are read from the environment (for example the .env file):
        ANZOGRAPH_URL       endpoint, defaults to "http://localhost:80/"
        ANZOGRAPH_USERNAME  user name, defaults to "admin"
        ANZOGRAPH_PASSWORD  password, required (never stored in the notebook)

        :chat_history: chat data
        :return: none
    """
    endpoint = os.getenv("ANZOGRAPH_URL", "http://localhost:80/")
    username = os.getenv("ANZOGRAPH_USERNAME", "admin")
    password = os.getenv("ANZOGRAPH_PASSWORD")

    # Check the credentials before spending an OpenAI request on the query text.
    if not password:
        print("An error ocurred: ANZOGRAPH_PASSWORD is not set")
        return

    prompt_to_gpt = "Convert the following text delimited by curl brackets into a SPARQL insert query without any prefix to a graph called routines_exercises: {{{" + str(chat_history) + "}}}"
    # NOTE(review): the query below is model-generated text executed as-is on the
    # graph; consider validating it before sending.
    gpt_response = chatWithGPT(prompt_to_gpt)

    try:
        sparql_wrapper = SPARQLWrapper(endpoint)

        # Use the public setter instead of assigning the attribute directly.
        sparql_wrapper.setMethod("POST")
        sparql_wrapper.setCredentials(username, password)
        sparql_wrapper.setQuery(gpt_response)
        sparql_wrapper.setReturnFormat(JSON)

        results = sparql_wrapper.query().convert()

        # NOTE(review): presumably an INSERT response may lack a "results" key even
        # on success — verify against the server before trusting this message.
        if results and "results" in results:
            print("Connected to AnzoGraph successfully!")
        else:
            print("Failed to connect to AnzoGraph. Check the server URL and credentials.")

    except Exception as e:
        # Best-effort: report the problem without interrupting the notebook.
        print("An error ocurred: " + str(e))
    
    

## Main

### Loading data

In [5]:

# Load the PDFs found under the "data" folder plus the pages listed in data/webURLs.txt
documents = getAllData("data")


#### documents in the output

In [None]:
# Sanity check: show one loaded document (index 5 requires at least six documents)
print(documents[5])

### Splitting

In [6]:
# Split the loaded documents into chunks; the splitter warns when a chunk exceeds the configured size
chunks = getChunkText(documents)

Created a chunk of size 2851, which is longer than the specified 1000
Created a chunk of size 1488, which is longer than the specified 1000
Created a chunk of size 2645, which is longer than the specified 1000
Created a chunk of size 2234, which is longer than the specified 1000
Created a chunk of size 4316, which is longer than the specified 1000
Created a chunk of size 1524, which is longer than the specified 1000
Created a chunk of size 2181, which is longer than the specified 1000
Created a chunk of size 1079, which is longer than the specified 1000
Created a chunk of size 1380, which is longer than the specified 1000
Created a chunk of size 1521, which is longer than the specified 1000
Created a chunk of size 2118, which is longer than the specified 1000
Created a chunk of size 2533, which is longer than the specified 1000
Created a chunk of size 1529, which is longer than the specified 1000
Created a chunk of size 2744, which is longer than the specified 1000
Created a chunk of s

#### chunks in output

In [None]:
# Sanity check: show one chunk (index 22 requires at least 23 chunks)
print(chunks[22])

### Embedding

In [7]:
# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

### Initialize Langchain - Conversation Retrieval Chain

In [8]:
# Conversation history handed to chatWithDocs on every call; it starts empty
chat_history = []
# Conversational retrieval chain that answers questions using the vector store as retriever
chat_with_docs = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
)

#### Tests

In [None]:
# Manual test: an exercise-related question sent through the retrieval chain
chatWithDocs("Hi, I would like to do some exercise. I want to gain legs muscles", chat_history)

In [None]:
# Manual test: a question in Spanish ("what is the Turing machine")
# NOTE(review): presumably checks how the chain reacts to an off-topic prompt — confirm
chatWithDocs("que es la maquina de turing", chat_history)

### POST request

In [9]:
# FastAPI application that exposes the document chat over HTTP
app = FastAPI()

# CORS is left fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive for a publicly tunnelled server — confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request body accepted by the /chat_with_docs endpoint
class Prompt(BaseModel):
    # Text of the question that is forwarded to chatWithDocs
    user_prompt: str

# POST /chat_with_docs: answers the received prompt using the shared chat_history
# NOTE(review): chatWithDocs is a synchronous call inside an async handler, so it
# presumably blocks the event loop while the chain runs — confirm this is acceptable.
@app.post("/chat_with_docs")
async def Post_prompt(prompt: Prompt):
    answer = chatWithDocs(prompt.user_prompt, chat_history)
    return {"response": answer}

In [10]:
# Port shared by the local uvicorn server and the ngrok tunnel that exposes it
SERVER_PORT = 8000

ngrok_tunnel = None
try:
    ngrok_tunnel = ngrok.connect(SERVER_PORT)
    print('Public URL:', ngrok_tunnel.public_url)
    # Lets uvicorn start its event loop inside the notebook's already running loop
    nest_asyncio.apply()
    # Blocks until the server is stopped (e.g. by interrupting the kernel)
    uvicorn.run(app, port=SERVER_PORT)
except Exception as e:
    print("An error ocurred: " + str(e))
finally:
    # Close the tunnel so that re-running this cell does not leave stale tunnels open
    if ngrok_tunnel is not None:
        ngrok.disconnect(ngrok_tunnel.public_url)


t=2023-09-15T14:56:28-0400 lvl=warn msg="ngrok config file found at legacy location, move to XDG location" xdg_path=C:\\Users\\mayor\\AppData\\Local/ngrok/ngrok.yml legacy_path=C:\\Users\\mayor\\.ngrok2\\ngrok.yml
INFO:     Started server process [9420]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://524b-200-58-68-43.ngrok.io
