# Imports

In [5]:
!pip install -q chromadb sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
from huggingface_hub import login
import logging
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration,
    BertTokenizer, BertForMaskedLM, DistilBertTokenizer, DistilBertForMaskedLM,
    RobertaTokenizer, RobertaForMaskedLM, GPTNeoForCausalLM
)
import torch
from chromadb import Client, Settings
from sentence_transformers import SentenceTransformer

In [7]:
project_path = '/content/drive/MyDrive/Colab Notebooks/RAG'

# GPU

In [8]:

print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))


False


# Hugging Face Login

In [8]:
# Log in using your Hugging Face access token
access_token = "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"
login(access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# LLM


---

###key functions of LLM class:


*   **load_llm_local**: tries to load an llm from google drvie
*   **load_llm_online**: loads the llm from hugging face
*   **select_device**: if gpu is available it will select it
*   **generate_text**: it can generate text based on given prompt

In [9]:
class LLM:
    def __init__(self, llm_type: str, load_online=False, configs=None):
        self.configs = configs
        self.device = self.select_device()
        if not load_online:
            self.tokenizer, self.model = self.load_llm_local(llm_type)
        else:
            self.tokenizer, self.model = self.load_llm_online(llm_type)
        self.model.to(self.device)
        logging.basicConfig(level=logging.INFO)
        logging.info(f"Model {llm_type} loaded and moved to {self.device}.")

    def load_llm_local(self, llm_type: str):
        try:
            if llm_type == 'gpt2':
                model_path = f"{project_path}/models/gpt2"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPT2LMHeadModel.from_pretrained(model_path)
            elif llm_type == 'distilgpt2':
                model_path = f"{project_path}/models/distilgpt2"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPT2LMHeadModel.from_pretrained(model_path)
            elif llm_type == 't5-small':
                model_path = f"{project_path}/models/t5-small"
                tokenizer = T5Tokenizer.from_pretrained(model_path)
                model = T5ForConditionalGeneration.from_pretrained(model_path)
            elif llm_type == 'bert':
                model_path = f"{project_path}/models/bert"
                tokenizer = BertTokenizer.from_pretrained(model_path)
                model = BertForMaskedLM.from_pretrained(model_path)
            elif llm_type == 'distilbert':
                model_path = f"{project_path}/models/distilbert"
                tokenizer = DistilBertTokenizer.from_pretrained(model_path)
                model = DistilBertForMaskedLM.from_pretrained(model_path)
            elif llm_type == 'roberta':
                model_path = f"{project_path}/models/roberta"
                tokenizer = RobertaTokenizer.from_pretrained(model_path)
                model = RobertaForMaskedLM.from_pretrained(model_path)
            elif llm_type == 'gpt-neo':
                model_path = f"{project_path}/models/gpt-neo"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPTNeoForCausalLM.from_pretrained(model_path)
            else:
                raise ValueError(f"Unsupported model type: {llm_type}")
            return tokenizer, model
        except Exception as e:
            logging.error(f"Error loading model {llm_type} from {model_path}: {e}")
            raise
    def load_llm_online(self, llm_type: str):
        try:
            if llm_type == 'gpt2':
                model_path = "gpt2"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPT2LMHeadModel.from_pretrained(model_path)

            elif llm_type == 'distilgpt2':
                model_path = "distilgpt2"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPT2LMHeadModel.from_pretrained(model_path)

            elif llm_type == 't5-small':
                model_path = "t5-small"
                tokenizer = T5Tokenizer.from_pretrained(model_path)
                model = T5ForConditionalGeneration.from_pretrained(model_path)

            elif llm_type == 'bert':
                model_path = "bert-base-uncased"
                tokenizer = BertTokenizer.from_pretrained(model_path)
                model = BertForMaskedLM.from_pretrained(model_path)

            elif llm_type == 'distilbert':
                model_path = "distilbert-base-uncased"
                tokenizer = DistilBertTokenizer.from_pretrained(model_path)
                model = DistilBertForMaskedLM.from_pretrained(model_path)

            elif llm_type == 'roberta':
                model_path = "roberta-base"
                tokenizer = RobertaTokenizer.from_pretrained(model_path)
                model = RobertaForMaskedLM.from_pretrained(model_path)

            elif llm_type == 'gpt-neo':
                model_path = "EleutherAI/gpt-neo-125M"
                tokenizer = GPT2Tokenizer.from_pretrained(model_path)
                model = GPTNeoForCausalLM.from_pretrained(model_path)

            else:
                raise ValueError(f"Unsupported model type: {llm_type}")

            # save the model after download
            tokenizer.save_pretrained(f'{project_path}/models/{llm_type}')
            model.save_pretrained(f'{project_path}/models/{llm_type}')
            return tokenizer, model
        except Exception as e:
            logging.error(f"Error downloading model {llm_type} from {model_path}: {e}")
            raise

    @staticmethod
    def select_device() -> str:
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def generate_text(self, input_text: str) -> str:
        try:
            input_ids = self.tokenizer.encode(input_text, return_tensors='pt').to(self.device)
            if self.tokenizer.pad_token_id is None:
                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long().to(self.device)

            default_configs = {
                'max_length': 100,
                'num_return_sequences': 1,
                'do_sample': True,
                'top_k': 90,
                'top_p': 0.95,
                'temperature': 0.3,
                'attention_mask': attention_mask,
                'pad_token_id': self.tokenizer.eos_token_id
            }

            if self.configs:
                default_configs.update(self.configs)

            output = self.model.generate(input_ids, **default_configs)

            return self.tokenizer.decode(output[0], skip_special_tokens=True)
        except Exception as e:
            logging.error(f"Error generating text for input '{input_text}': {e}")
            return "Error generating text."


In [87]:
llm = LLM('gpt2', )

In [91]:
output = llm.generate_text('question: who are you, answer is mohammad')
print(output)

question: who are you, answer is mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am mohammad.

Answer: I am moh


# Collection

In [34]:
class Collection:
    def __init__(self, collection_name: str, model_name: str = 'all-MiniLM-L6-v2', load_online=False):
        self.client = Client(Settings())
        existing_collections = [col.name for col in self.client.list_collections()]
        if collection_name in existing_collections:
            self.client.delete_collection(collection_name)
        existing_collections = [col.name for col in self.client.list_collections()]
        self.collection = self.client.get_or_create_collection(collection_name)
        if not load_online:
            self.vectorizer = SentenceTransformer(f'{project_path}/models/{model_name}')
        else:
            self.vectorizer = SentenceTransformer(model_name)
            self.vectorizer.save(f'{project_path}/models/{model_name}')

    def add_contexts(self, context_data: list):
        vectors = self.vectorizer.encode(context_data)
        ids = [f"context_{i}" for i in range(len(context_data))]
        self.collection.add(ids=ids, embeddings=vectors.tolist(), documents=context_data)
        print("Documents added to ChromaDB.")

    def retrieve_context(self, question: str):
        question_vector = self.vectorizer.encode([question])[0].tolist()
        results = self.collection.query(query_embeddings=[question_vector], n_results=1)
        return results['documents'][0]

In [36]:
context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South Asia."
]

# collection = Collection(collection_name="qa_contexts", model_name='all-MiniLM-L6-v2')
# collection = Collection(collection_name="qa_contexts", model_name='paraphrase-MiniLM-L6-v2')
collection = Collection(collection_name="qa_contexts", model_name='paraphrase-xlm-r-multilingual-v1')
# collection = Collection(collection_name="qa_contexts", model_name='stsb-roberta-large')

# Add contexts to the collection
collection.add_contexts(context_data)

# Retrieve a context based on a question
question = "What is the capital of France?"
context = collection.retrieve_context(question)
print(f"Retrieved context: {context}")


Documents added to ChromaDB.
Retrieved context: ['The capital of France is Paris. It is known for its art, culture, and cuisine.']


In [115]:
from transformers import BertTokenizer, BertModel
import torch

class BERTBasedModel:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def encode_text(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
        outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

# Usage example
if __name__ == "__main__":
    bert_model = BERTBasedModel()
    text = "The capital of France is Paris."
    encoded_text = bert_model.encode_text(text)
    print(encoded_text)


[-2.13676035e-01 -3.91903222e-01 -2.66700089e-01 -3.90870571e-02
 -3.93290162e-01 -9.54794586e-02  8.90238136e-02  1.03771603e+00
 -2.21996307e-01 -2.70034432e-01 -3.49999458e-01 -4.28629965e-01
 -7.70025030e-02  1.71783954e-01 -1.31053180e-01 -1.23675130e-02
 -1.39211595e-01  1.52360499e-01  3.39183770e-02 -4.00302485e-02
 -2.77385622e-01  4.52033371e-01  4.41524871e-02  4.77774471e-01
  5.29628515e-01 -6.12427406e-02 -2.49680728e-01  5.03534414e-02
  5.06222434e-02 -2.36037657e-01  3.21265273e-02  1.99337065e-01
 -2.26213992e-01 -6.91059530e-02  5.07191896e-01  9.36603248e-02
  2.01888323e-01 -6.04904629e-02 -1.24802321e-01  1.13808803e-01
 -2.64049262e-01 -6.49127007e-01  1.19680561e-01 -2.82466352e-01
 -2.95890179e-02 -4.63031411e-01  1.74842656e-01  5.03693402e-01
  5.76818287e-01  2.61461318e-01 -6.11727893e-01  3.30407768e-01
 -9.77744162e-03  4.70626354e-01  6.99345469e-01  7.66546547e-01
 -4.58161354e-01 -5.63782990e-01 -7.32800007e-01  1.02959208e-01
 -1.81162879e-01  3.28731

In [116]:
from gensim.models import Word2Vec
import numpy as np

class WordEmbeddingModel:
    def __init__(self):
        self.model = Word2Vec.load("path/to/word2vec.model")

    def encode_text(self, text):
        tokens = text.lower().split()
        vectors = [self.model.wv[token] for token in tokens if token in self.model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(self.model.vector_size)

# Usage example
if __name__ == "__main__":
    word2vec_model = WordEmbeddingModel()
    text = "The capital of France is Paris."
    encoded_text = word2vec_model.encode_text(text)
    print(encoded_text)


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/word2vec.model'

# RAG

In [None]:
class RAG:
  def __init__(self):
    pass



In [None]:
!pip uninstall -q chromadb

Proceed (Y/n)? y


In [None]:
!pip install -q chromadb

In [None]:
!pip install -q chromadb sentence-transformers

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
Installing collected packages: nvidia-cudnn-cu12, nvidia-cusolver-cu12, sentence-transformers
Successfully installed nvidia-cudnn-cu12-8.9.2.26 nvidia-cusolver-cu12-11.4.5.107 sentence-transformers-3.0.0


In [None]:
from chromadb import Client, Settings
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
client = Client(Settings())

# Define a collection for storing contexts
collection = client.get_or_create_collection("qa_contexts")

# Initialize the sentence transformer model for vectorization
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# Sample context data
context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America."
]

# Vectorize the context data
vectors = vectorizer.encode(context_data, convert_to_tensor=True)

# Generate unique IDs for each context
ids = [f"context_{i}" for i in range(len(context_data))]

# Add vectors and context data to ChromaDB
collection.add(ids=ids, embeddings=vectors.tolist(), documents=context_data)

# Now let's define functions to retrieve context and answer questions
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def retrieve_context(question):
    # Vectorize the question
    question_vector = vectorizer.encode([question], convert_to_tensor=True)[0]
    # Find the most similar context in the collection
    results = collection.query(query_embeddings=[question_vector.tolist()], n_results=1)
    return results['documents'][0]

def answer_question_t5(question):
    context = retrieve_context(question)
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate answer
    with torch.no_grad():
        output = model.generate(input_ids, max_length=150)

    # Decode the generated text
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.strip()

# Example usage
question = "What is the capital of France?"
print("Generated Answer:", answer_question_t5(question))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Answer: Paris


In [None]:
from chromadb import Client, Settings
from sentence_transformers import SentenceTransformer

# Initialize ChromaDB client
client = Client(Settings())

# Define a collection for storing contexts
collection = client.get_or_create_collection("qa_contexts")

# Initialize the sentence transformer model for vectorization
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# Sample context data
context_data = [
    "The capital of France is Paris. It is known for its art, culture, and cuisine.",
    "The Great Wall of China is one of the greatest wonders of the world.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.",
    "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South asia."
]

# Vectorize the context data
vectors = vectorizer.encode(context_data)

# Generate unique IDs for each context
ids = [f"context_{i}" for i in range(len(context_data))]

# Add vectors and context data to ChromaDB
collection.add(ids=ids, embeddings=vectors.tolist(), documents=context_data)

print("Documents added to ChromaDB.")





Documents added to ChromaDB.


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def retrieve_context(question):
    # Vectorize the question
    question_vector = vectorizer.encode([question])[0].tolist()
    # Find the most similar context in the collection
    results = collection.query(query_embeddings=[question_vector], n_results=1)
    return results['documents'][0]

def answer_question_t5(question):
    context = retrieve_context(question)
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate answer
    with torch.no_grad():
        output = model.generate(input_ids, max_length=150)

    # Decode the generated text
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.strip()

# Example usage
question = "the"
print("Generated Answer:", answer_question_t5(question))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'ids': [['context_3']], 'distances': [[1.8912122249603271]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South asia.']], 'uris': None, 'data': None}
['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South asia.']
Generated Answer: ['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South asia


In [None]:
def rag_system(question, top_n=1):
    context = retrieve_context(question)
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate answer
    with torch.no_grad():
        output = model.generate(input_ids, max_length=150)

    # Decode the generated text
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.strip()

# Example usage
question = "What is the capital of France?"
response = rag_system(question)
print("Generated Answer:", response)


{'ids': [['context_0']], 'distances': [[0.44983717799186707]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['The capital of France is Paris. It is known for its art, culture, and cuisine.']], 'uris': None, 'data': None}
Generated Answer: Paris


In [None]:
# Example usage
question = "the"
response = rag_system(question)
print("Generated Answer:", response)

{'ids': [['context_2']], 'distances': [[1.9055213928222656]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America.']], 'uris': None, 'data': None}
Generated Answer: ['The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America
