In [1]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import faiss
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


# Template for the question-condensing prompt handed to the retrieval chain.
# It has two placeholders: {chat_history} and {question}.
custom_template = (
    "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)


def get_pdf_text(docs):
    """Extract and concatenate the text of every page of every PDF in ``docs``.

    Args:
        docs: Iterable of PDF sources, each passed to ``PdfReader`` (the
            notebook passes file paths).

    Returns:
        str: The page texts joined in document/page order with nothing
        inserted between them; ``""`` when ``docs`` is empty.
    """
    page_texts = []
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # A page that yields no text must not abort the whole extraction:
            # treat a None/empty result as "" instead of failing on str + None.
            page_texts.append(page.extract_text() or "")
    # One join instead of repeated `+=`, which re-copies the growing string.
    return "".join(page_texts)

# Split the extracted text into overlapping chunks.
def get_chunks(raw_text):
    """Cut ``raw_text`` into chunks using ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, ``chunk_size=1000``
    and ``chunk_overlap=200``, with lengths measured by ``len``.

    Returns:
        Whatever ``split_text`` produces for ``raw_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(raw_text)


def get_vectorstore(chunks,
                    model_name="sentence-transformers/all-MiniLM-L6-v2",
                    device="cpu"):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: Iterable of text chunks to index.
        model_name: Embedding model passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded.
        device: Value placed in ``model_kwargs['device']``; defaults to
            ``'cpu'`` as before.

    Returns:
        The store returned by ``faiss.FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty, i.e. there is nothing to index
            (for example, no text was extracted from the PDF).
    """
    chunks = list(chunks)
    if not chunks:
        # Fail before loading the embedding model, with an actionable message
        # rather than an obscure error from deeper in the indexing call.
        raise ValueError("get_vectorstore() needs at least one text chunk to index")
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={'device': device})
    return faiss.FAISS.from_texts(texts=chunks, embedding=embeddings)


# generating conversation chain  

from langchain_community.llms import VLLM
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

def get_conversationchain(vectorstore):
    """Assemble a conversational retrieval chain on top of ``vectorstore``.

    Creates a ``VLLM`` model wrapper and a ``ConversationBufferMemory``, then
    combines them with ``vectorstore.as_retriever()`` and
    ``CUSTOM_QUESTION_PROMPT`` via ``ConversationalRetrievalChain.from_llm``.

    Returns:
        The chain object produced by ``from_llm``.
    """
    # Model identity plus sampling/engine settings forwarded to VLLM.
    vllm_settings = dict(
        model="baichuan-inc/Baichuan2-13B-Chat",
        trust_remote_code=True,  # mandatory for hf models
        max_new_tokens=512,
        top_k=20,
        top_p=0.8,
        temperature=0.8,
        dtype="float16",
        tensor_parallel_size=8,
    )
    language_model = VLLM(**vllm_settings)

    # History is kept under 'chat_history'; the chain's 'answer' output is
    # the value recorded for each turn.
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    return ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=vectorstore.as_retriever(),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=history,
    )


def handle_question(conversation_chain, question):
    """Send ``question`` to the chain and print the conversation so far.

    Args:
        conversation_chain: Chain (or plain callable) that accepts
            ``{'question': question}`` and returns a mapping containing
            ``"chat_history"``, a sequence of objects with a ``content``
            attribute.
        question: The user's question text.

    Returns:
        None. Every message in the returned history is printed on its own
        line; even positions are labelled ``User``, odd positions ``Bot``.
    """
    inputs = {'question': question}
    # Calling a chain object directly is the deprecated entry point; prefer
    # `invoke` when the object offers it and keep the direct call only as a
    # fallback for plain callables.
    invoke = getattr(conversation_chain, "invoke", None)
    response = invoke(inputs) if callable(invoke) else conversation_chain(inputs)

    speakers = ("User", "Bot")
    for i, msg in enumerate(response["chat_history"]):
        print(f"{speakers[i % 2]}: {msg.content}")

In [2]:
# Build the question-answering pipeline for metareview2023.pdf, step by step:
# extract text -> split into chunks -> embed into a FAISS store -> create chain.
docs = ["metareview2023.pdf"]
raw_text=get_pdf_text(docs)

text_chunks=get_chunks(raw_text)
vectorstore=get_vectorstore(text_chunks)

# get_conversationchain instantiates the VLLM model and the chat memory; the
# resulting chain object is what the question cells pass to handle_question.
conversation_chain = get_conversationchain(vectorstore)
#handle_question(conversation_chain, "what standard metrics are used to access relevance, factual consistency and semantic coherence")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

INFO 07-01 20:24:02 config.py:623] Defaulting to use mp for distributed inference
INFO 07-01 20:24:02 llm_engine.py:161] Initializing an LLM engine (v0.5.0.post1) with config: model='baichuan-inc/Baichuan2-13B-Chat', speculative_config=None, tokenizer='baichuan-inc/Baichuan2-13B-Chat', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=8, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=baichuan-inc/Baichuan2-13B-Chat)
INFO 07-01 20:24:03 selector.py:131] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 07-01 20:24:03 selector.py:51] Using XFormers backend.
[1;36m(VllmWorkerProce

Traceback (most recent call last):
  File "/home/ubuntu/agent_testing/.conda/lib/python3.10/multiprocessing/resource_tracker.py", line 209, in main
    cache[rtype].remove(name)
KeyError: '/psm_7ed30a86'
Traceback (most recent call last):
  File "/home/ubuntu/agent_testing/.conda/lib/python3.10/multiprocessing/resource_tracker.py", line 209, in main
    cache[rtype].remove(name)
KeyError: '/psm_7ed30a86'
Traceback (most recent call last):
  File "/home/ubuntu/agent_testing/.conda/lib/python3.10/multiprocessing/resource_tracker.py", line 209, in main
    cache[rtype].remove(name)
KeyError: '/psm_7ed30a86'
Traceback (most recent call last):
  File "/home/ubuntu/agent_testing/.conda/lib/python3.10/multiprocessing/resource_tracker.py", line 209, in main
    cache[rtype].remove(name)
KeyError: '/psm_7ed30a86'
Traceback (most recent call last):
  File "/home/ubuntu/agent_testing/.conda/lib/python3.10/multiprocessing/resource_tracker.py", line 209, in main
    cache[rtype].remove(name)
KeyErr

[1;36m(VllmWorkerProcess pid=566249)[0;0m INFO 07-01 20:24:09 selector.py:131] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=566249)[0;0m INFO 07-01 20:24:09 selector.py:51] Using XFormers backend.
[1;36m(VllmWorkerProcess pid=566245)[0;0m INFO 07-01 20:24:09 selector.py:131] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=566245)[0;0m INFO 07-01 20:24:09 selector.py:51] Using XFormers backend.
[1;36m(VllmWorkerProcess pid=566246)[0;0m INFO 07-01 20:24:09 selector.py:131] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=566246)[0;0m INFO 07-01 20:24:09 selector.py:51] Using XFormers backend.
[1;36m(VllmWorkerProcess pid=566250)[0;0m INFO 07-01 20:24:09 selector.py:131] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=566250)[0;0m INFO 07-01 20:24:09 selector.py:51] Using XFormers backend.
[1;36m(

In [10]:
# Follow-up turn on the same chain. handle_question prints every message in the
# returned chat history, so earlier turns are echoed before the new answer.
handle_question(conversation_chain, "okay i do not want to know about standard metrics anymore, i want to know about the summarization technique which are abstractive")

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s, est. speed input: 1690.56 toks/s, output: 49.49 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s, est. speed input: 1685.68 toks/s, output: 58.08 toks/s]

User: what standard metrics are used to access relevance, factual consistency and semantic coherence
Bot:  The standard metrics used to access relevance, factual consistency, and semantic coherence are ROUGE-L (Lin, 2004), which quantifies the similarity between the generated and reference texts by calculating the Longest Common Subsequence, and NLI (Natural Language Inference) models for inconsistency detection.
User: what are the results
Bot:  Relevance is assessed using ROUGE-L (Lin, 2004), NLI models for inconsistency detection, and DiscoScore (Zhao et al., 2022) for coherence indicator.
User: what datasets are being used for comparison
Bot:  ROUGE-L (Lin, 2004) is used to quantify the similarity between the generated and reference texts by calculating the Longest Common Subsequence. NLI (Natural Language Inference) models are used for inconsistency detection. DiscoScore (Zhao et al., 2022) presents six BERT-based model variants to measure discourse coherence. The scores from these




In [3]:
# Ask about abstractive summarization; handle_question prints the exchange.
# NOTE(review): this cell's execution count (3) is lower than the preceding
# cell's (10), so the saved outputs were not produced in top-to-bottom order —
# re-run from a fresh kernel to confirm the transcript.
handle_question(conversation_chain, "i want to know about the summarization technique which are abstractive")

  warn_deprecated(
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it, est. speed input: 205.62 toks/s, output: 69.00 toks/s]

User: i want to know about the summarization technique which are abstractive
Bot:  I will provide an overview of the main summarization techniques used in NLP, including extractive and abstractive methods. However, the question asks specifically about abstractive techniques, so I will focus on those.

1. Extractive methods: These techniques select the most important sentences or phrases from the input text to create the summary. Examples include TF-IDF weighting (Manning and Schüler, 1999), TextRank (Mihalcea and Tarau, 2004), and LexRank (Erkan and Radev, 2004).

2. Abstractive methods: These techniques generate a new summary by creating new sentences from the input text. These methods typically involve a two-step process: first, a document is segmented into its constituent opinions or themes; then, a summary is generated from these opinions or themes. Examples include OpinionDigger (Suhara et al., 2020), 3Sent (Goyal et al., 2022), and TCG (Bhaskar et al., 2022).

3. Prompting method




In [6]:
# NOTE(review): this only binds a module-level string named chat_history; no
# code visible in this notebook reads it (handle_question uses its own local
# of the same name). Presumably an attempt to reset the conversation — confirm.
chat_history = ""

In [8]:
# NOTE(review): binding the module-level name `memory` does not touch the
# ConversationBufferMemory created inside get_conversationchain and handed to
# the chain. TODO confirm whether resetting the chain's own memory was intended.
memory = ""