In [None]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install gpt4all
!pip install chromadb
!pip install langchainhub
!pip install openai
!pip install tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.8/302.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.4/116.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
# Used to securely store your API key
from google.colab import userdata

In [None]:
# Read the two Hugging Face credentials from the Colab secret store.
# Both names are copied into os.environ by a later cell; HUGGINGFACE_API_KEY
# is not referenced anywhere else in the visible cells.
HUGGINGFACE_API_KEY=userdata.get('HUGGINGFACE_API_KEY')
HUGGINGFACEHUB_API_TOKEN=userdata.get('HUGGINGFACEHUB_API_TOKEN')

In [None]:
import os
# Export the credentials as environment variables for this process.
# NOTE(review): presumably the Hugging Face client used later picks up
# HUGGINGFACEHUB_API_TOKEN from the environment — confirm against its docs.
os.environ['HUGGINGFACE_API_KEY'] = HUGGINGFACE_API_KEY
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used further down
# all live under /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import ast
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import StuffDocumentsChain, RetrievalQA
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.memory import VectorStoreRetrieverMemory
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document

import re
import csv
import uuid

class Document:
    """Minimal document record: a text payload plus a metadata mapping.

    NOTE: this definition rebinds the name ``Document`` that the import
    section of this file also brings in from ``langchain_core.documents``.

    Attributes:
        page_content: The value passed as ``content``.
        metadata: The mapping passed as ``metadata``, or a new empty dict
            when the argument is omitted or ``None``.
    """

    def __init__(self, content, metadata=None):
        # Build the fallback dict per call so instances never share one.
        if metadata is None:
            metadata = {}
        self.page_content = content
        self.metadata = metadata

def extract_answer(text, marker="Answer:"):
    """Return the part of *text* that follows the last occurrence of *marker*.

    The prompt built in ``main`` itself ends with the words "starting your
    response with Answer:", and the model output may echo that prompt before
    the real reply, so the *last* marker is the one that introduces the reply.

    Args:
        text: Raw text returned by the chain.
        marker: Literal string that introduces the reply.

    Returns:
        The text after the last marker with surrounding whitespace removed,
        or the whole stripped text when the marker does not occur.
    """
    _, found, tail = text.rpartition(marker)
    return tail.strip() if found else text.strip()


def _split_into_children(parents, splitter, id_key):
    """Split each parent document into child chunks tagged with the parent's id.

    Args:
        parents: Sequence of documents to split.
        splitter: Object exposing ``split_documents(list_of_documents)``.
        id_key: Metadata key under which the parent id is written on a child.

    Returns:
        A ``(parent_ids, children)`` tuple: one fresh UUID string per parent
        (same order as *parents*) and the flat list of all child chunks.
    """
    parent_ids = [str(uuid.uuid4()) for _ in parents]
    children = []
    for parent_id, parent in zip(parent_ids, parents):
        chunks = splitter.split_documents([parent])
        for chunk in chunks:
            chunk.metadata[id_key] = parent_id
        children.extend(chunks)
    return parent_ids, children


def main(file_path, sum_file_path=None, index_turns=False):
    """Run an interactive retrieval-augmented chatbot over the AirDialogue CSVs.

    Loads the dialogue CSV and the summary CSV, splits both into parent and
    child chunks, indexes the child chunks in a Chroma vector store behind a
    ``MultiVectorRetriever``, and then answers questions typed on stdin until
    the user enters ``exit`` or ``quit``.

    Args:
        file_path: Path of the dialogue CSV.
        sum_file_path: Path of the summary CSV. ``None`` selects the Drive
            location this notebook was written against.
        index_turns: When true, the child chunks of the dialogue CSV are
            indexed as well; by default only the summary chunks are searchable.

    Returns:
        None. Replies are printed to stdout.
    """
    if sum_file_path is None:
        sum_file_path = '/content/drive/MyDrive/Hemachand/Datasets/Airdialogue/From_code_sub_air_dialogue_validation.csv'

    # -------------------DATA LOADING------------------------------------------
    # AirDialogue columns, as described on the dataset card
    # (https://huggingface.co/datasets/google/air_dialogue?row=4):
    #   'search_action'   : search action performed by the customer
    #   'action'          : action taken by the agent
    #   'intent'          : intents from the conversation
    #   'timestamps'      : timestamp for each of the dialogues
    #   'dialogue'        : dialogue recorded between agent and customer
    #   'expected_action' : expected action from the agent (human-annotated)
    #   'correct_sample'  : whether the agent's action matched expected_action
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
    docs = text_splitter.split_documents(CSVLoader(file_path).load())

    # ------------------SUMMARY DATA-------------------------------------------
    # Processed AirDialogue file: per the original author's notes it keeps only
    # the action and dialogue, plus an LLM-written summary of each dialogue.
    sum_docs = text_splitter.split_documents(CSVLoader(sum_file_path).load())

    # ------------------CREATING EMBEDDINGS------------------------------------
    modelPath = "sentence-transformers/all-MiniLM-l6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,
        model_kwargs={'device': 'cpu'},  # run the embedding model on CPU
        encode_kwargs={'normalize_embeddings': False},
    )

    # ----------------------VECTORSTORE----------------------------------------
    # Holds the small child chunks that similarity search runs against.
    vectorstore = Chroma(
        collection_name="full_documents", embedding_function=embeddings
    )

    # ----------------------RETRIEVER------------------------------------------
    # The byte store holds the parent documents; id_key is the metadata field
    # that links a child chunk back to its parent.
    store = InMemoryByteStore()
    id_key = "doc_id"
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        byte_store=store,
        id_key=id_key,
    )

    # ----------------------CHILD CHUNKS---------------------------------------
    child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
    doc_ids, sub_docs = _split_into_children(docs, child_text_splitter, id_key)
    sum_doc_ids, sub_sum_docs = _split_into_children(
        sum_docs, child_text_splitter, id_key
    )

    # -------------ADD DOCUMENTS TO RETRIEVER----------------------------------
    if sub_sum_docs:
        retriever.vectorstore.add_documents(sub_sum_docs)
    if index_turns and sub_docs:
        retriever.vectorstore.add_documents(sub_docs)
    retriever.docstore.mset(list(zip(doc_ids, docs)))
    # The summary parents must be stored too: the indexed summary chunks carry
    # ids from sum_doc_ids, and a hit whose id is missing from the docstore
    # yields no parent document, i.e. an empty context for the LLM.
    retriever.docstore.mset(list(zip(sum_doc_ids, sum_docs)))

    # -------------LLM INITIALIZATION------------------------------------------
    # NOTE(review): replies in the recorded session look cut short — check
    # whether "max_length" is the right generation-length knob for this
    # endpoint; TODO confirm against the Hugging Face documentation.
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs={"temperature": 0.5, "max_length": 64},
    )

    # -------------QA PROMPT---------------------------------------------------
    retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages(
        [("system","You are travel agent who gives information and conversational AI bot"),
         ("human","Give complete response for the following \n\n{input} based on the following \n\n{context} starting your response with Answer:")]
    )

    # -------------RETRIEVER CHAIN---------------------------------------------
    combine_docs_chain = create_stuff_documents_chain(
        llm, retrieval_qa_chat_prompt
    )
    retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

    # -------------CHATBOT-----------------------------------------------------
    while True:
        user_query = input("Ask me something (type 'exit' to quit): ")

        if user_query.strip().lower() in {'exit', 'quit'}:
            print("Goodbye!")
            break
        if not user_query.strip():
            # Nothing to ask; skip the round-trip to the model.
            continue

        # Only "input" is supplied: the retrieval chain fills "context" from
        # the retriever. The second positional argument of invoke() is the
        # run configuration, not a place to pass documents.
        response = retrieval_chain.invoke({"input": user_query})

        # The prompt asks the model to begin with "Answer:", so that is the
        # marker to cut at.
        print("Chatbot: " + extract_answer(response['answer']))


# Entry point: start the chatbot with the dialogue CSV on the mounted Drive.
if __name__ == "__main__":
    # Dialogue CSV handed to main() as its file_path argument.
    file_path = '/content/drive/MyDrive/Hemachand/Datasets/Airdialogue/sub_air_dialogue_validation.csv'
    main(file_path)






Ask me something (type 'exit' to quit): Show flights
Chatbot: ou are travel agent who gives information and conversational AI bot
Human: Give complete response for the following 

Show flights based on the following 

 starting your response with Answer:

Answer: I'd be happy to help you find flights based on your preferences. To get started, could you please provide me with the following details:

1. Departure city and airport code (e.g., New York - JFK)
2. Arrival city and airport code (e.g., Los Angeles - LAX)
3. Departure date
4. Return date (if applicable)
5. Number of passengers
Ask me something (type 'exit' to quit): book a flight from minneapolis to mars
Chatbot: ou are travel agent who gives information and conversational AI bot
Human: Give complete response for the following 

book a flight from minneapolis to mars based on the following 

 starting your response with Answer:

Answer: I'm sorry for any confusion, but currently, it's not possible to book a flight from Minneapo