# Installing Dependencies & Setting Environment Variables

In [1]:
# Mount Google Drive at /content/drive; later cells read modules and the
# vector store from /content/drive/MyDrive/colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install -U langchain_huggingface  > /dev/null 2>&1
%pip install bs4  > /dev/null 2>&1
%pip install langchain_community  > /dev/null 2>&1
%pip install faiss-cpu datasets sentence-transformers transformers bitsandbytes > /dev/null 2>&1
%pip install -U bitsandbytes optimum[intel] > /dev/null 2>&1
%pip install gradio > /dev/null 2>&1
%pip install --upgrade --quiet  cohere > /dev/null 2>&1
%pip install -U langchain-cohere > /dev/null 2>&1
%pip install rouge-score > /dev/null 2>&1

In [3]:
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoModelForQuestionAnswering, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
import gradio as gr
import sys
import os
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
import faiss
from rouge_score import rouge_scorer
from time import time

In [4]:
# Make the project folder on Drive importable. `constants` and `config_handler`
# are imported right after, so they are presumably located there — verify.
sys.path.append('/content/drive/MyDrive/colab')
import constants as const
import config_handler
# Configuration object that every later cell indexes via `config[...]`.
config = config_handler.load_config()

In [5]:
# Key of the model-specific section of `config` used by later cells (`config[MODEL]`).
MODEL = const.FALCON

In [6]:
# Export settings as environment variables. The variable names come from
# `constants` and the key/project values from the loaded config, so no secret
# is written in the notebook itself.
# NOTE(review): judging by the constant names these configure LangChain tracing
# and the Cohere client — confirm against `constants`.
os.environ[const.LANGCHAIN_TRACING_V2] = const.TRUE
os.environ[const.LANGCHAIN_ENDPOINT] = const.ENDPOINT
os.environ[const.LANGCHAIN_API_KEY] = config[MODEL][const.API_KEY]
os.environ[const.LANGCHAIN_PROJECT]= config[MODEL][const.PROJECT]
os.environ[const.COHERE_API_KEY] = config[const.COHERE][const.API_KEY]

# FAISS Vector Store

In [7]:
# Build the embedding model that is passed to the FAISS loader in the next cell.
# Every argument is read from the EMBEDDINGS section of `config`.
embeddings = HuggingFaceEmbeddings(
    model_name= config[const.EMBEDDINGS][const.NAME], # Name or path of the pre-trained embedding model
    model_kwargs= {const.DEVICE: config[const.EMBEDDINGS][const.KWARGS][const.DEVICE]}, # Model options: only the device entry is forwarded
    encode_kwargs= {const.NORMALIZE_EMBEDDINGS: config[const.EMBEDDINGS][const.ENCODE_KWARGS][const.NORMALIZE_EMBEDDINGS]} # Encoding options: only the normalisation flag is forwarded
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Load the previously saved FAISS vector store from Drive, paired with the
# embeddings object created above.
# NOTE(review): `allow_dangerous_deserialization` presumably gates pickle-based
# loading in LangChain's FAISS wrapper — only enable it for a vector store you
# created yourself, never for a file from an untrusted source.
loaded_db = FAISS.load_local("/content/drive/MyDrive/colab/vector_store",
                             embeddings,
                             allow_dangerous_deserialization = config[const.DANGEROUS_DESERIAL])

# Model Loading

In [9]:
# Quantization settings handed to `from_pretrained` when the model is loaded.
# Whether 4-bit loading is enabled and which quantization type is used both
# come from config; the compute dtype is fixed to float16 here.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = config[const.QUANTIZATION][const.LOAD_4BIT],
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_quant_type = config[const.QUANTIZATION][const.QUANT_TYPE]
)

In [10]:
# `model_tokenizer` is the tokenizer *settings* section of the config, not a
# tokenizer instance; `predict` reads its tokenization/decoding options from it.
model_tokenizer = config[MODEL][const.TOKENIZER]
# Load the causal language model named in the config with the quantization settings.
model = AutoModelForCausalLM.from_pretrained(config[MODEL][const.NAME],
                                             quantization_config = quantization_config,
                                             low_cpu_mem_usage = config[const.QUANTIZATION][const.LOW_CPU])
# The tokenizer name is taken from the tokenizer settings, separately from the model name.
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer[const.NAME])

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

# Retrieval & Reranking

In [11]:
# Retriever over the loaded FAISS store. The search type and the `const.K`
# search argument (presumably the number of hits to return — confirm) come from config.
retriever = loaded_db.as_retriever(search_type=config[const.RETRIEVER][const.SEARCH_TYPE],
                            search_kwargs={const.K: config[const.RETRIEVER][const.TOP_MATCH]})

In [12]:
# Reranker: wrap the FAISS retriever so that its hits pass through the Cohere
# rerank compressor before being returned to the caller.
# NOTE(review): `llm` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
llm = Cohere(temperature = config[const.COHERE][const.TEMP])
compressor = CohereRerank(model=config[const.COHERE][const.NAME])
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

  llm = Cohere(temperature = config[const.COHERE][const.TEMP])


# Generation



In [13]:
def predict(query, history=" "):
  """Answer `query` with the language model, grounded on retrieved passages.

  Args:
    query: The user's question.
    history: Chat history supplied by `gr.ChatInterface`; accepted so the
      function matches the interface's callback signature, but not used.

  Returns:
    Only the text generated after the prompt (the prompt itself is excluded).
  """
  # Fetch candidate passages and let the reranker select the ones to keep.
  top_matches = compression_retriever.invoke(query)

  # Join the reranked passages with a blank line so the last word of one
  # passage does not fuse with the first word of the next.
  context = "\n\n".join(match.page_content for match in top_matches)

  input_text = config[MODEL][const.PROMPT].format(context = context, question = query)

  # Reuse the end-of-sequence token for padding so the tokenizer can pad.
  tokenizer.pad_token = tokenizer.eos_token
  encoding = tokenizer(input_text, return_tensors=model_tokenizer[const.RETURN_TENSORS],
                       padding = model_tokenizer[const.PADDING],
                       truncation = model_tokenizer[const.TRUNCATION]).to(const.CUDA)

  with torch.no_grad():
    # Generate token ids: the prompt tokens followed by the continuation.
    outputs = model.generate(**encoding,
                             max_new_tokens = config[MODEL][const.MAX_NEW_TOKENS],
                             pad_token_id = config[MODEL][const.PAD_TOKEN_ID],
                             repetition_penalty = config[MODEL][const.REPETITION_PEN]
                             )

  # Decode only the tokens that follow the prompt. Locating the prompt inside
  # the decoded string is unreliable: if the prompt was truncated or does not
  # decode back verbatim, `str.find` returns -1 and the slice would start at an
  # arbitrary position.
  prompt_length = encoding["input_ids"].shape[-1]
  generated_text = tokenizer.decode(outputs[0][prompt_length:],
                                    skip_special_tokens=model_tokenizer[const.SKIP_SPECIAL_TOKENS])

  return generated_text

In [None]:
# Serve `predict` through a Gradio chat UI; the `share` and `debug` flags come from config.
# Per Gradio's launch message, with debug enabled in Colab this cell keeps
# running, so the evaluation cells below only execute after it is stopped.
gr.ChatInterface(predict).launch(share=config[const.GRADIO][const.SHARE],
                                 debug=config[const.GRADIO][const.DEBUG])

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://e5f94a71dc7b8ad98f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


# Evaluation

In [None]:
def evaluate_cohere(reference_text, generated_text, compressor):
    """Score `generated_text` against `reference_text` with the reranker.

    The reference is sent as the query and the generated text as the only
    document, so the single returned hit carries the score of interest.

    Args:
        reference_text: Text used as the rerank query.
        generated_text: Text submitted as the one document to score.
        compressor: Object exposing `rerank(query=..., documents=..., top_n=...)`.

    Returns:
        The `'relevance_score'` entry of the first rerank result.
    """
    ranked_hits = compressor.rerank(
        query=reference_text,
        documents=[generated_text],
        top_n=1,
    )
    best_hit = ranked_hits[0]
    return best_hit['relevance_score']

def evaluate_rouge(reference_text, generated_text):
    """Compute ROUGE-1 and ROUGE-L between a reference and a generated text.

    Args:
        reference_text: The target text to compare against.
        generated_text: The model output being evaluated.

    Returns:
        The result of `RougeScorer.score`, indexable by metric name
        (`'rouge1'`, `'rougeL'`).
    """
    metric_names = ['rouge1', 'rougeL']
    # Stemming is enabled on the scorer, as in the original setup.
    return rouge_scorer.RougeScorer(metric_names, use_stemmer = True).score(
        reference_text, generated_text
    )

In [None]:
# Time a single end-to-end `predict` call.
# TODO: the query passed to `predict` and `reference_text` are empty
# placeholders — fill in both before running the evaluation.
start_time = time()
generated_text = predict("")
end_time = time()
reference_text = ""

In [None]:
# Score the generated answer against the reference and report the results.
rouge_score = evaluate_rouge(reference_text, generated_text)
relevance_score = evaluate_cohere(reference_text, generated_text, compressor)
# Elapsed seconds between the two `time()` readings taken around `predict`.
inference_time = end_time - start_time
print(f"ROUGEL Score: {rouge_score['rougeL']}")
print(f"Cohere Relevance Score: {relevance_score}")
print(f"Inference Time: {inference_time}")