In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install langchain_community chromadb rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from tqdm import tqdm

## Text Preprocessing

In [None]:
# Use Simple Wikipedia as the retrieval corpus
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

def get_texts_splitted():
    """Load the Simple Wikipedia parquet file and cut it into text chunks.

    Every entry of the ``text`` column is joined with a single space into one
    long string before splitting, so a chunk may straddle two entries.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` configured
        with ``chunk_size=1000`` and ``chunk_overlap=100``.
    """
    corpus_df = pd.read_parquet("hf://datasets/rahular/simple-wikipedia/data/train-00000-of-00001-090b52ccb189d47a.parquet")
    corpus = ' '.join(list(corpus_df['text']))
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_text(corpus)

In [None]:
import re

def preprocess_text(text):
    """Remove every character that is neither a word character nor whitespace.

    Letter case is left untouched; underscores survive because ``\\w``
    matches them.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with all punctuation/symbol characters deleted.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

## Retriever

In [None]:
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Sentence-embedding model shared by the Chroma index build (embed) and the
# Chroma query path (get_chroma_retriever). The device is pinned to 'cuda'.
embedding_function = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2', model_kwargs={'device':'cuda'})

  embedding_function = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2', model_kwargs={'device':'cuda'})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# https://github.com/chroma-core/chroma/issues/1049#issuecomment-1699859480

def split_list(input_list, chunk_size):
    """Partition ``input_list`` into consecutive slices of ``chunk_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``chunk_size``; an empty input gives an empty result.

    Args:
        input_list: sequence to partition (anything that supports slicing).
        chunk_size: maximum number of items per slice.

    Returns:
        A list of slices, in the original order.
    """
    return [
        input_list[start:start + chunk_size]
        for start in range(0, len(input_list), chunk_size)
    ]

In [None]:
def embed(batch_size=41000, persist_directory='./embed'):
    """Embed the whole corpus into a persistent Chroma collection.

    The corpus chunks come from ``get_texts_splitted`` and are inserted in
    batches of ``batch_size`` via ``split_list``.

    Args:
        batch_size: number of chunks passed to each ``Chroma.from_texts`` call.
            The default of 41000 is the value this notebook has always used
            (presumably chosen to stay below Chroma's per-insert batch limit;
            see the chroma issue linked above ``split_list`` - TODO confirm).
        persist_directory: directory where Chroma stores the index. The default
            matches the directory read by ``get_chroma_retriever``.
    """
    # Imported locally: the notebook-level import of Chroma sits in a later
    # cell, so this keeps embed() usable regardless of cell execution order.
    from langchain_community.vectorstores import Chroma

    texts_splitted = get_texts_splitted()
    if not texts_splitted:
        # Nothing to index; avoid the IndexError from peeking at element 0.
        print(0)
        return
    # Quick sanity check: number of chunks and a peek at the first one.
    print(len(texts_splitted), texts_splitted[0])
    for batch in tqdm(split_list(texts_splitted, batch_size)):
        Chroma.from_texts(batch, embedding_function, persist_directory=persist_directory)

In [None]:
from langchain_community.vectorstores import Chroma

def get_chroma_retriever():
    """Open the Chroma store persisted in ``./embed`` as a dense retriever.

    Returns:
        The store's retriever, created with ``search_kwargs={'k': 5}``.
    """
    store = Chroma(embedding_function=embedding_function, persist_directory='./embed')
    return store.as_retriever(search_kwargs={'k': 5})

In [None]:
from langchain_community.retrievers import BM25Retriever

def get_bm25_retriever():
    """Build a sparse BM25 retriever over the punctuation-stripped corpus.

    Each chunk from ``get_texts_splitted`` is cleaned with ``preprocess_text``
    (progress shown through tqdm) before indexing.

    Returns:
        A ``BM25Retriever`` created with ``k=8``.
    """
    cleaned_chunks = [preprocess_text(chunk) for chunk in tqdm(get_texts_splitted())]
    return BM25Retriever.from_texts(cleaned_chunks, k=8)

In [None]:
# Choose the retriever used by the chain built further down.
# Dense (Chroma) retrieval is active; to compare against sparse retrieval,
# comment out the next line and enable the BM25 line instead.
retriever = get_chroma_retriever()
# retriever = get_bm25_retriever()

## Prompt

In [None]:
from langchain.prompts import ChatPromptTemplate

# Few-shot prompt for short-form trivia answers.
# Template variables: {context} and {input}; the 'input' name matches the key
# of the dicts passed to retrieval_chain.invoke in this notebook.
# The template ends with an open "Answer: " line.
prompt = ChatPromptTemplate.from_template(
"""
You are a Trivia expert with all the knowledge in the world. Below are the answers given by yourself. These are the guidelines followed:

- Answer in a single word or a short phrase.
- No extra information, explanations, or notes.
- Do not include parenthetical statements.
- Avoid any special characters or tags, such as "less than" and "greater than" symbols.
- The answer must be directly relevant to the question.
- Do not use phrases like "None of the above" unless it is the actual answer.

Here are some examples:

Question: What is the chemical symbol for gold?
Answer: Au

Question: What is the capital of Australia?
Answer: Canberra

Question: Which artist is known for the painting "The Starry Night"?
Answer: Vincent van Gogh

Context:
{context}

Question: {input}
Answer: """)

## LLM

In [None]:
from langchain_core.language_models.llms import LLM as BaseLLM
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

class LLM(BaseLLM):
    """LangChain LLM wrapper around a locally loaded Hugging Face chat model.

    The tokenizer and model are loaded in ``__init__``; each call wraps the
    prompt as a single user message, applies the tokenizer's chat template and
    decodes greedily (``do_sample=False``).
    """

    model_name: str = None

    class Config:
        # Extra attributes (tokenizer, model, terminators, hf_token) are set
        # on the instance in __init__, so they must be allowed here.
        arbitrary_types_allowed = True
        extra = 'allow'

    def __init__(self, model_name, hf_token):
        """Load tokenizer and model for ``model_name``.

        Args:
            model_name: Hugging Face model id or local path.
            hf_token: access token forwarded to ``from_pretrained``.
        """
        super().__init__()
        self.model_name = model_name
        self.hf_token = hf_token
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=self.hf_token
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=self.hf_token
        )

        # Generation stops on the tokenizer's EOS token or on Llama-3's
        # end-of-turn token. Tokenizers that do not know "<|eot_id|>" return
        # None or their unk id for it; such values (and duplicates) are dropped
        # so that only real stop ids reach generate().
        eos_id = self.tokenizer.eos_token_id
        eot_id = self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        if eot_id == self.tokenizer.unk_token_id:
            eot_id = None
        self.terminators = [
            token_id
            for token_id in dict.fromkeys([eos_id, eot_id])
            if token_id is not None
        ]

        # Reuse the existing EOS token for padding when the tokenizer has no
        # pad token. Registering brand-new "<pad>"/"</s>" tokens would create
        # ids outside the model's embedding matrix (it is never resized here)
        # and would replace the model's real EOS token.
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _call(self, prompt, stop=None, **kwargs):
        """Generate a completion for ``prompt``.

        Args:
            prompt: full prompt text; sent as one user message.
            stop: optional list of stop strings. The returned text is cut at
                the earliest occurrence of any non-empty stop string.
            **kwargs: accepted for LangChain compatibility; not used.

        Returns:
            The decoded completion (special tokens removed), without the prompt.
        """
        messages = [{"role": "user", "content": prompt}]

        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)

        outputs = self.model.generate(
            input_ids,
            max_new_tokens=500,
            eos_token_id=self.terminators,
            pad_token_id=self.tokenizer.pad_token_id,
            # Greedy decoding; sampling knobs are explicitly unset.
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )

        # generate() returns prompt + completion; keep only the new tokens.
        response = outputs[0][input_ids.shape[-1]:]
        text = self.tokenizer.decode(response, skip_special_tokens=True)

        # Honour caller-supplied stop sequences instead of silently ignoring them.
        for stop_sequence in stop or ():
            if not stop_sequence:
                continue
            cut = text.find(stop_sequence)
            if cut != -1:
                text = text[:cut]
        return text

    @property
    def _identifying_params(self):
        """Parameters that identify this LLM instance (the token is excluded)."""
        return {"name": "LLM", "model_name": self.model_name}

    @property
    def _llm_type(self):
        """Short type tag reported to LangChain."""
        return "llm"

In [None]:
# Take the Hugging Face access token from the HF_TOKEN environment variable so
# that a real credential never has to be pasted into the notebook. The
# placeholder is kept as the fallback when the variable is not set.
llm = LLM(
    model_name='meta-llama/Meta-Llama-3-8B-Instruct',
    hf_token=os.environ.get('HF_TOKEN', 'your_huggingface_token'),
)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



## Chain

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain

# Combine the LLM and the prompt into a "stuff documents" chain.
# NOTE(review): per LangChain's docs this chain formats the supplied documents
# into the prompt's {context} variable - confirm against the installed version.
document_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
from langchain.chains import create_retrieval_chain
# Full RAG pipeline: the selected retriever feeding the document chain.
# Invoked below with {'input': <question>}; the result's 'answer' field is
# what clean_output reads.
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
def clean_output(output):
    """Extract the first non-blank line of the generated answer.

    Surrounding whitespace is stripped *before* the first-line cut, so a
    generation that begins with one or more newlines still yields its first
    line of text instead of an empty string.

    Args:
        output: mapping with an ``'answer'`` key holding the generated text.

    Returns:
        The first line of the answer, stripped of surrounding whitespace
        (``''`` when the answer is empty or whitespace only).
    """
    answer_text = output['answer'].strip()
    first_line = answer_text.split('\n', 1)[0]
    return first_line.strip()

In [None]:
# End-to-end smoke test: push a single question through retrieval + generation.
query = 'Who was the man behind Chipmunks?'
# NOTE(review): `input` shadows the Python builtin of the same name at notebook scope.
input = {'input': preprocess_text(query)}

response = retrieval_chain.invoke(input)
# Bare last expression so the notebook displays the cleaned answer.
clean_output(response)

'Ross Bagdasarian'

# Evaluation

In [None]:
# Use TriviaQA as Evaluation Dataset
import json
# JSON text is UTF-8; state the encoding explicitly so that loading does not
# depend on the platform's default locale encoding.
with open('./unfiltered-web-dev.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['Data']

In [None]:
def index_to_input(index):
    """Build the chain input for the evaluation example at ``index``.

    Args:
        index: position of the example in ``data``.

    Returns:
        ``{'input': question}`` where the question has been passed through
        ``preprocess_text``.
    """
    question = data[index]['Question']
    return {'input': preprocess_text(question)}

In [None]:
def index_to_answer(index):
    """Return the ground-truth answer (``['Answer']['Value']``) of example ``index``."""
    return data[index]['Answer']['Value']

In [None]:
def query_one(index):
    """Run the retrieval chain on one evaluation example.

    Args:
        index: position of the example in ``data``.

    Returns:
        Tuple ``(ground_truth_answer, cleaned_model_output)``.
    """
    chain_result = retrieval_chain.invoke(index_to_input(index))
    ground_truth = index_to_answer(index)
    return ground_truth, clean_output(chain_result)

In [None]:
# Print (ground truth, generated output) pairs for the first 20 examples
# as a qualitative check before computing metrics.
for i in range(20):
    print(query_one(i))

('David Seville', 'Ross Bagdasarian')
('Scorpio', 'Scorpio')
('Sunset Boulevard', 'Sunset')
('Campbell-Bannerman', 'Asquith')
('Exile', 'Exile')
('Cancer', 'Tuberculosis')
('Octopussy', 'All Time High')
('18 million', '25')
('Utah', 'Utah')
('Lauren Bacall', 'Lauren Bacall')
('Nikkei', 'Nikkei')
('Moonwalk', 'Moonwalk')
('1930s', '1950s')
('Hit the ball closer to the hole', 'Practice')
('In 1912, in Stockholm', '1936')
('Boxing rings were originally circular', 'Tradition')
('$85,000', 'Ten')
('Eighteen--two bears (one walking, one seated), a bison, camel, cougar, elephant, giraffe, gorilla, hippopotamus, hyena , kangaroo, lion, monkey, rhinoceros, seal, sheep, tier, and zebra', '8')
('Kilimanjaro', 'Kilimanjaro')
('Green', 'Green')


In [None]:
def process_string(s):
    """Normalise a string for scoring: strip punctuation via ``preprocess_text``, then lower-case it."""
    return preprocess_text(s).lower()

In [None]:
import torch
from nltk.translate import bleu
from rouge import Rouge
rouge = Rouge()

def evaluation(answer, output):
    """Score one generated answer against its ground truth.

    Both strings are normalised with ``process_string`` before scoring.

    Args:
        answer: ground-truth answer (the reference).
        output: model-generated answer (the hypothesis).

    Returns:
        ``torch.Tensor`` with four values:
        ``[exact match, BLEU-1, ROUGE-1 F, ROUGE-L F]``.
        When either normalised string has no tokens, the three n-gram scores
        are reported as 0.
    """
    answer = process_string(answer)
    output = process_string(output)
    reference_tokens = answer.split()
    hypothesis_tokens = output.split()

    em_score = 1 if answer == output else 0

    if not reference_tokens or not hypothesis_tokens:
        # Normalisation can leave nothing behind (e.g. a punctuation-only
        # string). rouge raises on an empty hypothesis/reference, so score the
        # n-gram metrics as 0 instead of aborting a long evaluation run.
        return torch.Tensor([em_score, 0.0, 0.0, 0.0])

    # nltk's bleu takes (references, hypothesis, weights): the ground truth is
    # the reference and the model output is the hypothesis being judged.
    bleu_score = bleu([reference_tokens], hypothesis_tokens, (1,))
    # rouge.get_scores takes (hypothesis, reference).
    rouge_score = rouge.get_scores(output, answer)[0]
    rouge_1_score = rouge_score['rouge-1']['f']
    rouge_l_score = rouge_score['rouge-l']['f']
    return torch.Tensor([em_score, bleu_score, rouge_1_score, rouge_l_score])

In [None]:
# Example Evaluation
# query_one returns (answer, output); unpack it straight into evaluation so
# each example goes through retrieval + generation once rather than twice.
for i in range(5):
    print(evaluation(*query_one(i)))

tensor([0., 0., 0., 0.])
tensor([1., 1., 1., 1.])
tensor([0.0000, 0.5000, 0.6667, 0.6667])
tensor([0., 0., 0., 0.])
tensor([1., 1., 1., 1.])


In [None]:
def main(num_of_tests=None):
    """Evaluate the retrieval chain on the first ``num_of_tests`` examples.

    Args:
        num_of_tests: how many examples from the start of ``data`` to run.
            ``None`` (the default) means all of ``data``; the size is looked up
            when the function is called, so reloading ``data`` is respected.

    Returns:
        ``torch.Tensor`` with the mean of each metric returned by
        ``evaluation`` over the evaluated examples.

    Raises:
        ValueError: if ``num_of_tests`` is less than 1 (there would be nothing
            to average).
    """
    if num_of_tests is None:
        num_of_tests = len(data)
    if num_of_tests < 1:
        # Fail before any inference instead of crashing in torch.stack([]).
        raise ValueError(f'num_of_tests must be at least 1, got {num_of_tests}')

    print('Create Input...')
    inputs = [index_to_input(i) for i in range(num_of_tests)]
    print('Running Inference...')
    raw_outputs = [retrieval_chain.invoke(chain_input) for chain_input in tqdm(inputs)]
    print('Cleaning Output...')
    outputs = [clean_output(raw) for raw in raw_outputs]
    print('Evaluating...')
    scores = [
        evaluation(index_to_answer(i), outputs[i])
        for i in range(num_of_tests)
    ]
    # One row per example; average each metric column.
    return torch.mean(torch.stack(scores), dim=0)

In [None]:
# Evaluate on the first 500 examples of the loaded dataset; the displayed
# tensor holds the per-metric means computed by main().
main(500)