<a href="https://colab.research.google.com/github/NahuelCostaCortez/InteligeciaNegocio/blob/main/DemoQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@markdown # Descargar librerías necesarias
!pip install langchain
!pip install chromadb
!pip install gradio
!pip install Pillow==9.0.0
!pip install pypdf

Collecting langchain
  Downloading langchain-0.0.338-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.65-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langch

Collecting pypdf
  Downloading pypdf-3.17.1-py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.6/277.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.17.1


## UTILS

In [8]:
import gradio as gr
import os
from langchain import PromptTemplate, HuggingFaceHub
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.chains import ConversationalRetrievalChain

# Prompt template used by load_chain for the answer-generation step.
# It exposes two placeholders, {context} and {question}, which load_chain
# declares as the template's input variables. The instructions are written in
# English while the question/answer labels are in Spanish.
PRE_PROMPT = """Use the following context to answer the question at the end. If you don't know the answer, just say you don't know, don't try to make up an answer.

{context}

Pregunta: {question}
Respuesta:"""

def extract_text_from_pdf(pdf_file):
    """
    Load a PDF and cut its content into small chunks.

    Args:
        pdf_file (str): Path to the PDF file on disk.

    Returns:
        list: The chunks produced by ``CharacterTextSplitter.split_documents``
            (LangChain document objects carrying the text, not plain strings).
    """
    # First pass: let the PDF loader read the file and do its own splitting.
    pdf_pages = PyPDFLoader(pdf_file).load_and_split()

    # Second pass: re-split on the '. \n' separator with a 300-character
    # chunk size.
    splitter = CharacterTextSplitter(separator='. \n', chunk_size=300)
    chunks = splitter.split_documents(pdf_pages)
    return chunks

def load_chain(text, model_name, temperature):
    """
    Build a conversational retrieval chain over the given documents.

    The documents are embedded with ``HuggingFaceHubEmbeddings`` and indexed in
    a Chroma vector store; the resulting retriever is combined with a Hugging
    Face Hub model and the ``PRE_PROMPT`` template.

    Args:
        text (list): Document chunks to index (as returned by
            ``extract_text_from_pdf``).
        model_name (str): Hugging Face Hub repository id of the model to use.
        temperature (float): Sampling temperature forwarded to the model.

    Returns:
        ConversationalRetrievalChain: The chain, configured to also return the
            source documents used for each answer.
    """
    # Index the documents and expose the vector store as a retriever.
    doc_retriever = Chroma.from_documents(text, HuggingFaceHubEmbeddings()).as_retriever()

    # Remote language model hosted on the Hugging Face Hub.
    llm = HuggingFaceHub(repo_id=model_name, model_kwargs={"temperature": temperature})

    # Prompt used when combining the retrieved documents with the question.
    qa_prompt = PromptTemplate(template=PRE_PROMPT, input_variables=["context", "question"])

    return ConversationalRetrievalChain.from_llm(
        llm,
        doc_retriever,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )

def predict(pipeline, question, chat_history):
    """
    Ask the chain a question and return only the answer text.

    Args:
        pipeline (callable): Chain object; it is called with a dict holding the
            ``"question"`` and ``"chat_history"`` keys.
        question (str): The user's question.
        chat_history (list): Previous conversation turns, passed through as is.

    Returns:
        The value stored under the ``'answer'`` key of the chain's result.
    """
    chain_inputs = {"question": question, "chat_history": chat_history}
    return pipeline(chain_inputs)['answer']

def get_answer(file_name, model_name, temperature, pre_prompt, question, chat_history):
  """
  Answer a question about a PDF, building the whole chain from scratch.

  Args:
      file_name (str): Path to the PDF file.
      model_name (str): Hugging Face Hub repository id of the model to use.
      temperature (float): Sampling temperature forwarded to the model.
      pre_prompt (str): Accepted but not used by this function; the chain is
          always built with the module-level PRE_PROMPT inside load_chain.
      question (str): The user's question.
      chat_history (list): Previous conversation turns.

  Returns:
      The answer produced by the chain for ``question``.
  """
  # Extract and index the PDF, then query the freshly built chain.
  chain = load_chain(extract_text_from_pdf(file_name), model_name, temperature)
  return predict(chain, question, chat_history)

## DEMO

In [None]:
# Gradio UI: upload a PDF, pick a model, and chat about the document.
with gr.Blocks() as demo:
    # PDF upload
    pdf_file = gr.File(label="Cargar pdf", file_types=['.pdf'])
    # Model selection
    model_id = gr.Dropdown(label="model", choices=["OpenAssistant/oasst-sft-1-pythia-12b", "google/flan-ul2", "bigscience/bloomz"], value="OpenAssistant/oasst-sft-1-pythia-12b")
    temperature = gr.Slider(0, 2, value=0, label="Grado de alucinación")
    # Hugging Face API key
    API = gr.Textbox(label="API KEY", placeholder="Inserta tu API KEY de HuggingFace ")
    # Read-only status line reporting whether the embeddings were created
    status = gr.Textbox(label="Status", placeholder="", interactive=False)
    # Chat panel
    chat_history = gr.Chatbot()
    # Textbox for the user's question
    msg = gr.Textbox(label="Pregunta:")
    # Button that clears both the question box and the chat
    clear = gr.ClearButton([msg, chat_history])

    def _uploaded_path(uploaded):
        """Return the filesystem path of an uploaded file (file object or plain path)."""
        return getattr(uploaded, "name", uploaded)

    def pdf_changes(pdf_file, model_id, temperature, API):
        """
        Index the uploaded PDF and report the outcome in the status box.

        Returns the status text: an empty string when the file was removed,
        a reminder when no API key was entered, or "Ready" once the chain
        has been built and stored in the global ``qa``.
        """
        global qa
        # The change event also fires when the file is cleared; nothing to index then.
        if pdf_file is None:
            return ""
        # An untouched textbox yields "" (not " "), so test for any blank value.
        if not API or not API.strip():
            return "Inserta primero la API key"
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = API.strip()
        text = extract_text_from_pdf(_uploaded_path(pdf_file))
        qa = load_chain(text, model_id, temperature)
        return "Ready"

    def get_pdf_text_and_predict(pdf_file, model_name, temperature, question, chat_history):
        """
        Answer ``question`` about the uploaded PDF and append the turn to the chat.

        Returns a pair ``(new_msg_value, chat_history)``: the question box is
        cleared and the chat gains the (question, answer) turn.
        """
        chat_history = chat_history or []
        if pdf_file is None:
            # Without a document there is nothing to query; tell the user in the chat.
            chat_history.append((question, "Carga primero un pdf"))
            return "", chat_history
        # The Chatbot component hands turns back as lists, while the chain
        # only accepts (question, answer) tuples.
        previous_turns = [tuple(turn) for turn in chat_history]
        # NOTE(review): get_answer re-extracts and re-embeds the PDF on every
        # question; the chain cached in ``qa`` by pdf_changes is not reused here.
        answer = get_answer(_uploaded_path(pdf_file), model_name, temperature, PRE_PROMPT, question, previous_turns)
        chat_history.append((question, answer))
        # Clear the question box; the answer is shown in the chat panel.
        return "", chat_history

    # Re-index whenever the uploaded file changes.
    pdf_file.change(pdf_changes, inputs=[pdf_file, model_id, temperature, API], outputs=[status], queue=False)
    # Pressing enter in the question box calls the handler with the inputs below
    # and writes its result back to [msg, chat_history].
    msg.submit(get_pdf_text_and_predict, inputs=[pdf_file, model_id, temperature, msg, chat_history], outputs=[msg, chat_history])

demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://60835ab95907fda936.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


You're using a different task than the one specified in the repository. Be sure to know what you're doing :)
You're using a different task than the one specified in the repository. Be sure to know what you're doing :)
