<a href="https://colab.research.google.com/github/Hasibur-ridoy/Ask-anything-about-NSU/blob/main/RAG_with_GPT_generator_with_gui.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Use playground mode**

**File>Open in playground mode**

For Text to csv [click this](https://cutt.ly/texttocsv)


# Install the dependencies

Install the packages. In Colab, restart the runtime after installing them for the first time.


In [None]:
# Install the latest release of Haystack (the `farm-haystack` package) in your own environment
! pip install farm-haystack

# Upgrade pip, then install Haystack straight from its GitHub repository
# with the `colab` and `faiss` extras.
# NOTE(review): this runs after the release install above, so presumably the
# git build is the one that ends up active — confirm that both installs are needed.
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

# `openai` is used by the GPT generator and `gradio` by the GUI further down.
# NOTE(review): none of the installs in this cell pin a version — consider pinning
# them so the notebook keeps working when new releases come out.
!pip install openai
!pip install gradio

Collecting farm-haystack
  Downloading farm_haystack-1.25.5-py3-none-any.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.3/770.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boilerpy3 (from farm-haystack)
  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)
Collecting events (from farm-haystack)
  Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Collecting httpx (from farm-haystack)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting lazy-imports==0.3.1 (from farm-haystack)
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting posthog (from farm-haystack)
  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting prompthub-py==4.0.0 (from farm-haystack)
  

# Import the packages

In [None]:
import json
import openai
import logging
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.utils import fetch_archive_from_http

from haystack.pipelines import GenerativeQAPipeline, DocumentSearchPipeline
from haystack.utils import print_answers

from pprint import pprint, PrettyPrinter
from typing import Dict, Any, List, Optional
from collections import defaultdict
import gradio as gr


from haystack.schema import Document, Answer, SpeechAnswer
from haystack.document_stores.sql import DocumentORM
logger = logging.getLogger(__name__)

PydanticSchemaGenerationError: Unable to generate pydantic-core schema for <class 'pandas.core.frame.DataFrame'>. Set `arbitrary_types_allowed=True` in the model_config to ignore this error or implement `__get_pydantic_core_schema__` on your type to fully support it.

If you got this error by calling handler(<some type>) within `__get_pydantic_core_schema__` then you likely need to call `handler.generate_schema(<some type>)` since we do not call `__get_pydantic_core_schema__` on `<some type>` otherwise to avoid infinite recursion.

For further information visit https://errors.pydantic.dev/2.7/u/schema-for-unknown-type

# Dataset

In [None]:
# Path of the CSV file inside the Colab runtime folder
doc_dir = "/content/curated_dataset_100.csv"

# Read the CSV (pass sep="\t" for a tab-separated file, sep="," for a comma-separated one)
# and apply minimal cleaning: every missing value becomes an empty string.
df = pd.read_csv(doc_dir, sep=",").fillna(value="")

print(df.head())

# Cast data into Haystack Document Objects


In [None]:
# Pull the two columns out as plain lists, then pair each title with its text
titles = list(df["title"].values)
texts = list(df["text"].values)
documents: List[Document] = [
    Document(content=body, meta={"name": heading or ""})
    for heading, body in zip(titles, texts)
]

# FAISSDocumentStore, DensePassageRetriever and RAGenerator

In [None]:
# FAISS document store built with the "Flat" index factory string.
# `return_embedding` is switched on so the generator doesn't have to perform re-embedding.
document_store = FAISSDocumentStore(
    return_embedding=True,
    faiss_index_factory_str="Flat",
)

# DPR retriever: encodes the documents, encodes the question and queries the documents
retriever = DensePassageRetriever(
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    document_store=document_store,
    embed_title=True,
    use_gpu=True,
)



In [None]:
# RAG generator loaded from the "facebook/rag-token-nq" checkpoint
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    top_k=1,
    num_beams=2,
    min_length=2,
    # max_length=200,
    embed_title=True,
    use_gpu=True,
)

# Update the document

We write documents to the DocumentStore, first by deleting any remaining documents then calling write_documents(). The update_embeddings() method uses the retriever to create an embedding for each document.

In [None]:
# Start from an empty store: delete any documents that are already in it
document_store.delete_documents()

# Write the `documents` list built above to the document store
document_store.write_documents(documents)

# Add the document embeddings to the index; the retriever passed in here
# is what creates the embedding for each document
document_store.update_embeddings(retriever=retriever)

# Function for answer using RAG generator

Custom function that extracts the answer (and, optionally, the retrieved passage) from the pipeline results

In [None]:

def print_ans(results: dict, passage: bool = False):
    """
    Extract the first answer (and optionally the first passage) from pipeline results.

    Despite its name, this helper only prints the question header; the answer
    itself is returned to the caller rather than printed.

    :param results: Results that the pipeline returned. Must contain an
        ``answers`` list; ``documents`` is used when it is present.
    :param passage: If True, the second element of the returned tuple is the
        content of the first document; otherwise it is the placeholder ``" "``.
    :return: ``(answer, passage_text)`` as two strings. ``answer`` is ``""`` when
        the ``answers`` list is empty, and ``passage_text`` is ``""`` when
        ``passage`` is True but no documents are available.
    :raises ValueError: If ``results`` does not contain the ``answers`` key.
    """
    if "answers" not in results:
        raise ValueError(
            "The results object does not seem to come from a Reader: "
            f"it does not contain the 'answers' key, but only: {results.keys()}.  "
            "Try print_documents or print_questions."
        )

    # The question header is optional; the answer is returned whether or not
    # the results carry the original query.
    if "query" in results:
        print(f"\nQuestion: {results['query']}\nAnswer:")

    answers = results["answers"]
    documents = results.get("documents") or []

    # Guard against empty lists instead of failing with an IndexError
    ans = str(answers[0].answer) if answers else ""
    if not passage:
        return ans, " "

    docs = str(documents[0].content) if documents else ""
    return ans, docs


Wrap the question and answering in a function

In [None]:
def bolo_with_rag(question, passage:bool =False):
    """
    Answer ``question`` with the RAG generator.

    A GenerativeQAPipeline is built from the module-level ``generator`` and
    ``retriever``, run on the question, and its output is handed to
    ``print_ans``.

    :param question: The question to ask.
    :param passage: If truthy, ``print_ans`` is asked to include the passage.
    :return: Whatever ``print_ans`` returns for the pipeline output.
    """
    import warnings

    # Silence every warning before running the pipeline
    warnings.filterwarnings('ignore')

    rag_pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)
    run_params = {"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}}
    rag_output = rag_pipeline.run(query=question, params=run_params)

    return print_ans(rag_output, True) if passage else print_ans(rag_output)

# Function for answer using GPT


Custom function, written by Jawad, for printing the passage

In [None]:
def custom_print_doc(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False, string_out: bool = False):
    """
    Print the content of the retrieved documents, or return the first one as a string.

    :param results: Pipeline results holding `Document` objects under the ``documents`` key.
    :param max_text_len: Accepted for signature compatibility; currently not used.
    :param print_name: Accepted for signature compatibility; currently not used.
    :param print_meta: Accepted for signature compatibility; currently not used.
    :param string_out: If True, nothing is printed and the content of the first
        document is returned as a string (``""`` when there are no documents).
    :return: The first document's content when ``string_out`` is True, otherwise None.
    :raises ValueError: If any entry under ``documents`` is not a `Document`.
    """
    documents = results["documents"]

    # Verify that the input contains Documents under the `documents` key
    if any(not isinstance(doc, Document) for doc in documents):
        raise ValueError(
            "This results object does not contain `Document` objects under the `documents` key. "
            "Please make sure the last node of your pipeline makes proper use of the "
            "new Haystack primitive objects, and if you're using Haystack nodes/pipelines only, "
            "please report this as a bug."
        )

    if string_out:
        # Only the first document is handed back; an empty result yields ""
        # so callers can safely concatenate the return value into a prompt.
        return str(documents[0].content) if documents else ""

    for doc in documents:
        # Print each document's own content (the results dict has no "content" key)
        print(doc.content)

Custom helper functions that return the part of a string after or before a given delimiter

In [None]:
def substring_after(s, delim):
    """Return the part of ``s`` that follows the first ``delim`` ("" if ``delim`` is absent)."""
    _head, _sep, tail = s.partition(delim)
    return tail
def substring_before(s,delim):
    """Return the part of ``s`` that precedes the first ``delim`` (all of ``s`` if ``delim`` is absent)."""
    head, _sep, _tail = s.partition(delim)
    return head

Take the passage retrieved by the DPR retriever (the same one RAG uses), send it to GPT together with the question, and wrap the whole process in a function

In [None]:
def bolo_with_gpt(question,passage: bool= False):
    """
    Answer ``question`` with an OpenAI completion based on the first retrieved passage.

    The retriever is run through a DocumentSearchPipeline, the first passage it
    returns is placed in the prompt together with the question, and the text of
    the first completion choice is returned.

    The API key is read from the ``OPENAI_API_KEY`` environment variable (or an
    ``openai.api_key`` that was already configured); it is never stored in the
    notebook.

    :param question: The question to ask. Only the text before the first ``?`` is used.
    :param passage: If truthy, the passage used in the prompt is returned as well.
    :return: ``(answer, passage_text)``; ``passage_text`` is ``" "`` when
        ``passage`` is falsy.
    :raises RuntimeError: If no OpenAI API key is configured.
    """
    import os

    # Secrets must not be committed with the notebook. Any key that was ever
    # hardcoded here is public and should be revoked.
    api_key = os.environ.get("OPENAI_API_KEY") or getattr(openai, "api_key", None)
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key configured: set the OPENAI_API_KEY environment variable."
        )
    openai.api_key = api_key

    res = DocumentSearchPipeline(retriever).run(query=question, params={"Retriever": {"top_k": 5}})
    # Fall back to an empty passage so the prompt can always be built
    passage1 = custom_print_doc(res, string_out=True) or ""

    # Get the question text before the question mark; strip it so the suffix
    # below is always separated from it by exactly one space.
    q = substring_before(question, "?").strip()
    prompt = (
        "Answer the question from the given passage.\n"
        f"Question: {q} in North South University?\n"
        f"Passage: {passage1}"
    )

    # NOTE(review): `openai.Completion` is the pre-1.0 interface of the openai
    # package and "text-davinci-002" is a legacy model — confirm both are still
    # available in the environment this notebook targets.
    response = openai.Completion.create(engine="text-davinci-002", prompt=prompt, temperature= 0.15, max_tokens=128)

    # Read the completion text directly; stripping removes the blank lines the
    # model may emit first without discarding answers that have none.
    new_string = str(response["choices"][0]["text"]).strip()

    if passage:
        return new_string, passage1
    return new_string, " "

# Go Nuts
Use bolo_with_rag function for answers with rag generator

and bolo_with_gpt function for answers with gpt generator

**Use playground mode**

**File>Open in playground mode**

In [None]:
def question_answer(choice, question, passage):
    """
    Route a question to the selected generator.

    :param choice: ``"GPT"`` to answer with ``bolo_with_gpt`` or ``"RAG"`` to
        answer with ``bolo_with_rag``.
    :param question: The question to ask.
    :param passage: ``"Yes"`` to request the passage as well, ``"No"`` otherwise.
        ``None`` (nothing selected) is treated as ``"No"``.
    :return: Whatever the selected generator function returns.
    :raises ValueError: If ``choice`` or ``passage`` is not one of the values above.
    """
    generators = {"GPT": bolo_with_gpt, "RAG": bolo_with_rag}
    passage_flags = {"Yes": True, "No": False, None: False}

    # Fail loudly on unexpected selections instead of silently returning None
    if choice not in generators:
        raise ValueError(
            f"Unknown generator {choice!r}; expected one of {sorted(generators)}."
        )
    if passage not in passage_flags:
        raise ValueError(
            f"Unknown passage option {passage!r}; expected 'Yes' or 'No'."
        )

    return generators[choice](question, passage_flags[passage])


In [None]:
# Example call: ask the GPT generator and request the passage as well
question_answer(choice="GPT", question="Who is the vice-chancellor ?", passage="Yes")

In [None]:


# Gradio front end: three inputs (generator, question, passage toggle) are passed
# to `question_answer`, whose two return values fill the two output boxes.
guii = gr.Interface(
    question_answer,
    inputs=[
        gr.Radio(["RAG", "GPT"], label="Choose a generator for model"),
        gr.Textbox(label="Write your question: "),
        gr.Radio(["Yes", "No"], label="Print Passage?"),
    ],
    outputs=[
        # The keyword for the box height is `lines`, not `line`
        gr.Text(lines=2, label="Answer"),
        # `value` sets the initial content (the former `defualt` keyword was a typo)
        gr.Textbox(label="Passage", value=" "),
    ],
)

guii.launch(debug=True)