# Imports

In [1]:
!pip install -q chromadb sentence-transformers

In [3]:
!pip install -q chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
!pip install -q langchain




In [102]:
from huggingface_hub import login
import logging
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration,
    BertTokenizer, BertForQuestionAnswering, DistilBertTokenizer, DistilBertForQuestionAnswering,
    GPTNeoForCausalLM, pipeline
)
import torch
from chromadb import Client, Settings
from sentence_transformers import SentenceTransformer
import gc

In [3]:
project_path = '/content/drive/MyDrive/Colab Notebooks/RAG'

# GPU

In [4]:

print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))


False


# Hugging Face Login

In [5]:
# Log in using your Hugging Face access token
access_token = "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"
login(access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# LLM


---

###key functions of LLM class:


*   **load_llm_local**: tries to load an llm from google drvie
*   **load_llm_online**: loads the llm from hugging face
*   **select_device**: if gpu is available it will select it
*   **generate_text**: it can generate text based on given prompt

In [8]:
class LLM:
    model_classes = {
        'gpt2': (GPT2Tokenizer, GPT2LMHeadModel, 'gpt2'),
        't5': (T5Tokenizer, T5ForConditionalGeneration, 't5-small'),
        'bert': (BertTokenizer, BertForQuestionAnswering, 'bert-large-uncased-whole-word-masking-finetuned-squad'),
        'distil-bert': (DistilBertTokenizer, DistilBertForQuestionAnswering, 'distilbert-base-cased-distilled-squad'),
        'gpt-neo': (GPT2Tokenizer, GPTNeoForCausalLM, 'EleutherAI/gpt-neo-1.3B')
    }
    def __init__(self, llm_type: str, load_online=False, save_model=False):
        self.device = self.select_device()
        self.tokenizer, self.model = self.load_llm(llm_type, load_online, save_model)
        self.model.to(self.device)
        logging.basicConfig(level=logging.INFO)
        logging.info(f"Model {llm_type} loaded and moved to {self.device}.")

    def load_llm(self, llm_type: str, load_online: bool, save_model: bool):

        tokenizer_class, model_class, model_path = self.model_classes[llm_type]

        if not load_online:
            model_path = f"{project_path}/models/{model_path}"

        tokenizer = tokenizer_class.from_pretrained(model_path)
        model = model_class.from_pretrained(model_path)

        if save_model:
            tokenizer.save_pretrained(f'{project_path}/models/{model_path}')
            model.save_pretrained(f'{project_path}/models/{model_path}')

        return tokenizer, model


    @staticmethod
    def select_device() -> str:
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def generate_text(self, input_text: str, context: str = '') -> str:
        raise NotImplementedError("The generate_text method should be implemented by the subclass.")

    def free_memory(self):
        del self.model
        del self.tokenizer
        gc.collect()
        torch.cuda.empty_cache()

## GPT2

In [91]:
class GPT2(LLM):
    def __init__(self, load_online=False, save_model=False):
        super().__init__('gpt2', load_online, save_model)

    def generate_text(self, input_text: str, context: str = '') -> str:

        prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        outputs = self.model.generate(
            inputs,
            max_length=80,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True
        )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.replace(prompt, '').strip()

        return response.split('\n')[0]

## t5

In [10]:
class T5(LLM):
    def __init__(self, load_online=False, save_model=False):
        super().__init__('t5', load_online, save_model)

    def generate_text(self, input_text: str, context: str = None) -> str:

        # prompt = f"question: {input_text} context: {context}"
        prompt = f"question: {input_text} context: {context}" if context else f"question: {input_text}"
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)
        outputs = self.model.generate(input_ids, max_length=50, num_beams=1, early_stopping=False)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return response

## BERT

In [103]:
class BERT(LLM):
    def __init__(self, load_online=False, save_model=False):
        super().__init__('bert', load_online, save_model)

    def generate_text(self, input_text: str, context: str = '') -> str:

        nlp = pipeline('question-answering', model=self.model, tokenizer=self.tokenizer)
        result = nlp(question=input_text, context=context)
        return result['answer']

## Distil BERT

In [12]:
class DistilBERT(LLM):
    def __init__(self, load_online=False, save_model=False):
        super().__init__('distil-bert', load_online, save_model)

    def generate_text(self, input_text: str, context: str = '') -> str:
        nlp = pipeline('question-answering', model=self.model, tokenizer=self.tokenizer)
        result = nlp(question=input_text, context=context)
        return result['answer']

## GPT-Neo

In [16]:
class NeoGPT(LLM):
    def __init__(self, load_online=False, save_model=False):
        super().__init__('gpt-neo', load_online, save_model)

    def generate_text(self, input_text: str, context: str = None) -> str:

        prompt = f"question: {input_text} context: {context}" if context else f"question: {input_text}"

        inputs = self.tokenizer.encode(prompt, return_tensors='pt')

        outputs = self.model.generate(
            inputs,
            max_length=100,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            no_repeat_ngram_size=2,
            do_sample=True
        )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

## test LLM class

In [92]:
llm = GPT2()

In [21]:
# Example usage
question = "What is the capital of Iran?"
context = 'The capital of Iran is Tehran.'

response = llm.generate_text(question, context)
print("response:", response)

response: Tehran is the capital of Iran.


# Collection

In [93]:
class Collection:
    def __init__(self, collection_name: str, transformer_type: str = 'all-MiniLM-L6-v2', load_online=False, save_transformer=False):
        self.client = Client(Settings())
        existing_collections = [col.name for col in self.client.list_collections()]
        if collection_name in existing_collections:
            self.client.delete_collection(collection_name)
        self.collection = self.client.get_or_create_collection(collection_name)
        self.vectorizer = self.load_sentence_transformer(transformer_type, load_online, save_transformer)

    def load_sentence_transformer(self, transformer_type: str, load_online: bool, save_transformer: bool):
        transformer_path = f'{project_path}/models/{transformer_type}' if not load_online else transformer_type
        vectorizer = SentenceTransformer(transformer_path)

        if save_transformer:
            vectorizer.save(f'{project_path}/models/{transformer_type}')

        return vectorizer

    def add_contexts(self, context_data: list):
        vectors = self.vectorizer.encode(context_data)
        ids = [f"context_{i}" for i in range(len(context_data))]
        self.collection.add(ids=ids, embeddings=vectors.tolist(), documents=context_data)
        print("Documents added to ChromaDB.")

    def retrieve_contexts(self, question: str, top_n: int = 1):
        question_vector = self.vectorizer.encode([question])[0].tolist()
        results = self.collection.query(query_embeddings=[question_vector], n_results=top_n)
        results = results['documents'][0]
        return results[:top_n]

## test Collection class

In [38]:
# tranformer types:

# default => all-MiniLM-L6-v2
# paraphrase-MiniLM-L6-v2
# paraphrase-xlm-r-multilingual-v1
# stsb-roberta-large

In [94]:
collection = Collection('rag')

context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia."
]
collection.add_contexts(context_data)

Documents added to ChromaDB.


In [73]:
response = collection.retrieve_contexts('amazon', top_n=2)

print(response)

['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia.', 'The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.']


# RAG

In [104]:
class RAG:
    def __init__(self, llm: LLM, collection: Collection):
        self.llm = llm
        self.collection = collection

    def generate_response(self, query: str, top_n: int=1) -> str:
        retrieved_contexts = self.collection.retrieve_contexts(query, top_n)
        retrieved_contexts = '\n'.join(retrieved_contexts)
        response = self.llm.generate_text(query, retrieved_contexts)
        return response


## test RAG class

In [105]:
llm = BERT()
collection = Collection('rag')

context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia."
]
collection.add_contexts(context_data)

Documents added to ChromaDB.


In [108]:
rag = RAG(llm, collection)

In [110]:
query = "tell me about china?"
response = rag.generate_response(query, top_n=3)
print(response)

The Great Wall of China is one of the greatest wonders of the world.


In [34]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
ef = SentenceTransformerEmbeddingFunction(model_name='all-MiniLM-L6-v2')

chroma_client = chromadb.Client()
collection_name = "marmikpandya"
try:
    chroma_client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")
except Exception as e:
    print(f"Collection {collection_name} does not exist or could not be deleted: {e}")

# Create the collection
collection = chroma_client.create_collection(name=collection_name, embedding_function=ef)
# collection = chroma_client.create_collection(name="marmikpandya", embedding_function=ef)
context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia."
]
collection.add(
    documents=context_data,
    # metadatas=[{"response": out} for out in dataset["train"]["output"]],
    ids=[str(i) for i in range(len(context_data))]
)
chroma_client = chromadb.Client()
ef_lc = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="marmikpandya",
    embedding_function=ef_lc,
)
retriever = langchain_chroma.as_retriever()

from langchain.chains import create_retrieval_chain

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

Deleted existing collection: marmikpandya


TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class '__main__.LLM'>

In [8]:
context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia."
]

# collection = Collection(collection_name="qa_contexts", model_name='all-MiniLM-L6-v2')
# collection = Collection(collection_name="qa_contexts", model_name='paraphrase-MiniLM-L6-v2')
collection = Collection(collection_name="qa_contexts", model_name='paraphrase-xlm-r-multilingual-v1')
# collection = Collection(collection_name="qa_contexts", model_name='stsb-roberta-large')

# Add contexts to the collection
collection.add_contexts(context_data)

# Retrieve a context based on a question
question = "What is the capital of France?"
context = collection.retrieve_contexts(question)
print(f"Retrieved context: {context}")


Documents added to ChromaDB.
Retrieved context: ['The capital of France is Paris. It is known for its art, culture, and cuisine.']


In [19]:
!pip install -q langchain_chroma langchain_community

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import logging
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertForMaskedLM, DistilBertTokenizer, DistilBertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM, GPTNeoForCausalLM
import torch
from chromadb import Client, Settings
from langchain.chains import create_retrieval_chain
# from langchain.llms import OpenAI
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings