In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Requirements

In [None]:
!pip install pypdf
!pip install --upgrade --quiet  langchain langchain-community
!pip install -qU langchain-text-splitters
!pip install chromadb
!pip install sentence_transformers
!pip install langchain_chroma langchainhub

### Import libraries

In [None]:
import os
from getpass import getpass

import chromadb
import torch
from huggingface_hub import hf_hub_download
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.llms import LlamaCpp
from langchain_core.embeddings import Embeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel

In [None]:
# Download the GGUF weights from the Hugging Face Hub into /content.
model_name = "google/gemma-2b-it"
model_file = "gemma-2b-it.gguf"

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it (input is not echoed) when unset.
# NOTE(review): the token that used to be hardcoded here is exposed in the
# notebook history and should be revoked in the Hugging Face settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content',
                             token=HF_TOKEN)
print("My model path: ", model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


gemma-2b-it.gguf:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

My model path:  /content/gemma-2b-it.gguf


In [None]:
# Load the GGUF model through llama.cpp.
# n_gpu_layers=-1 offloads every layer to the GPU; the previous value of 1 left
# almost the whole model on the CPU (the recorded timings are ~1 token/s).
# n_batch raises the number of prompt tokens processed per step to speed up
# prompt evaluation.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 164 tensors from /content/gemma-2b-it.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = gemma-2b-it
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                          gemma.block_count u32              = 18
llama_model_loader: - kv   4:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.attention.head_count u32              = 8
llama_model_loader: - kv   7:              gemma.attention.head_count_kv u32        

In [None]:
# Smoke test: send one free-form prompt straight to the model (no retrieval).
rap_prompt = "Simulate a rap battle between itachi uchiha and sasuke uchiha"
llm.invoke(rap_prompt)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    3677.96 ms
llama_print_timings:      sample time =    1319.47 ms /   256 runs   (    5.15 ms per token,   194.02 tokens per second)
llama_print_timings: prompt eval time =   14281.85 ms /    10 tokens ( 1428.19 ms per token,     0.70 tokens per second)
llama_print_timings:        eval time =  226882.91 ms /   255 runs   (  889.74 ms per token,     1.12 tokens per second)
llama_print_timings:       total time =  230921.58 ms /   265 tokens


".\n\n**Crowd:** Ladies and gentlemen, the stage is set for a lyrical showdown! Tonight, we have two titans of the anime world, Itachi Uchiha and Sasuke Uchiha!\n\n**Announcer:** Let the battle commence!\n\n**Verse 1: Itachi**\nI am the shadow that lurks in the night,\nWith eyes that burn with inner light.\nNo weapon can match my cunning ways,\nI dance through the shadows, leaving no trace.\n\nMy jutsu are deadly, my abilities vast,\nFrom sharingan to Rasengan, I leave a trail.\nThe Sharingan's power fuels my every stride,\nSasuke, your reign is finally denied.\n\n**Verse 2: Sasuke**\nI am the lightning that strikes with thunderous force,\nMy chakra flows like a river in the storm.\nYour jutsus are nothing but smoke and haze,\nI erase your presence with an elegant maze.\n\nMy eyes, the source of my power and might,\nCan see through deceit, right through the night.\nThe Sharingan's power is a precious prize,\nBut it cannot match my thirst for life's prize.\n\n**Verse 3: Itachi**\nYou ma

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint so they stay in sync.
EMBED_MODEL_ID = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_ID)

class CustomEmbeddingFunction:
    """Callable that maps text to embedding vectors with a tokenizer/model pair.

    The text is tokenized (padded and truncated), run through the model with
    gradients disabled, and the vector at sequence position 0 of the model's
    first output is returned for every item in the batch.
    """

    def __init__(self, model, tokenizer):
        """Keep references to the encoder ``model`` and its ``tokenizer``."""
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self, input):
        """Embed ``input`` (a string or a list of strings).

        Returns:
            A list with one embedding (a list of floats) per input text.
        """
        batch = self.tokenizer(
            input,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**batch)
        # outputs[0] is indexed as (batch, position, ...); keep position 0 only
        # (presumably the [CLS] token of the last hidden state - confirm).
        first_token_vectors = outputs[0][:, 0]
        return first_token_vectors.tolist()

# Single shared embedder instance used by the indexing cells below.
embedding_function = CustomEmbeddingFunction(model=embed_model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Source document, resolved relative to the notebook's working directory.
PDF_PATH = "Itachi.pdf"
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

In [None]:
# Splitter settings: 100-character chunks (measured with len) that overlap by
# 20 characters; separators are treated as literal strings, not regexes.
splitter_options = dict(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Raw text of every loaded page, in page order.
texts = [page.page_content for page in pages]

In [None]:
# Split the page text into chunk Documents.
# The chunks are built directly from `pages` rather than from the previous value
# of `texts`: this cell rebinds `texts` from strings to Documents, so reading
# `texts` as input would make the cell fail when it is run a second time.
texts = text_splitter.create_documents([page.page_content for page in pages])

In [None]:
# Flatten each chunk to a single line by turning newlines into spaces.
splited_texts = [
    str(chunk.page_content).replace("\n", " ")
    for chunk in texts
]

In [None]:
# In-memory Chroma client and the collection that stores the chunk embeddings.
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "text_embed" already exists in the client.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="text_embed")

In [None]:
# Reset the index: drop the collection and recreate it empty.
# `collection` must be rebound here; otherwise the handle still points at the
# deleted collection and the indexing cell that follows fails when the notebook
# is run top to bottom.
chroma_client.delete_collection(name="text_embed")
collection = chroma_client.create_collection(name="text_embed")

In [None]:
# Embed the chunks and store them in Chroma, batch by batch.
# Batching replaces one model forward pass and one Chroma write per chunk with
# one of each per batch. upsert (instead of add) makes the cell safe to re-run,
# because the ids are just the chunk positions and would otherwise collide.
EMBED_BATCH_SIZE = 32

all_embeddings = []
for start in range(0, len(splited_texts), EMBED_BATCH_SIZE):
    batch_texts = splited_texts[start:start + EMBED_BATCH_SIZE]
    batch_embeddings = embedding_function(batch_texts)
    all_embeddings.extend(batch_embeddings)
    collection.upsert(
        ids=[str(i) for i in range(start, start + len(batch_texts))],
        documents=batch_texts,
        embeddings=batch_embeddings,
    )

In [None]:
class CallableEmbeddings(Embeddings):
    """LangChain `Embeddings` adapter around a batch embedding callable.

    The wrapped callable must accept a string or a list of strings and return
    one embedding (list of floats) per text.
    """

    def __init__(self, embed_fn):
        self._embed_fn = embed_fn

    def embed_documents(self, texts):
        """Embed a list of texts; an empty list yields an empty list."""
        texts = list(texts)
        return self._embed_fn(texts) if texts else []

    def embed_query(self, text):
        """Embed a single query string."""
        return self._embed_fn([text])[0]


# Queries must be embedded with the same model that produced the stored vectors.
# The collection is filled via `embedding_function`; embedding queries with a
# different sentence-transformers model puts them in an unrelated vector space,
# so nearest-neighbour search would return arbitrary chunks.
ef_lc = CallableEmbeddings(embedding_function)

langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="text_embed",
    embedding_function=ef_lc,
)

retriever = langchain_chroma.as_retriever()

In [None]:
# Prompt template fetched from the LangChain Hub by its repo handle.
RAG_PROMPT_REPO = "rlm/rag-prompt"
prompt = hub.pull(RAG_PROMPT_REPO)

def format_docs(docs):
    """Join the `page_content` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)


# Chain inputs: "context" is the retrieved chunks rendered as one string,
# "question" is the user's input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt template -> LLM -> plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Ask the RAG chain a question and display its answer.
question = "do you know ronaldo? he is football player."
rag_chain.invoke(question)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2217.60 ms
llama_print_timings:      sample time =     135.97 ms /    30 runs   (    4.53 ms per token,   220.64 tokens per second)
llama_print_timings: prompt eval time =  255050.90 ms /   102 tokens ( 2500.50 ms per token,     0.40 tokens per second)
llama_print_timings:        eval time =  529599.46 ms /    29 runs   (18262.05 ms per token,     0.05 tokens per second)
llama_print_timings:       total time =  784964.75 ms /   131 tokens


' no\nExplanation: The context does not mention whether or not Ronaldo is a football player, so I cannot answer this question from the provided context.'

In [None]:
# Ask the RAG chain a question and display its answer.
question = "how was ronaldo in realmadrid?"
rag_chain.invoke(question)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   27590.15 ms
llama_print_timings:      sample time =      52.75 ms /    12 runs   (    4.40 ms per token,   227.51 tokens per second)
llama_print_timings: prompt eval time =   54478.06 ms /    40 tokens ( 1361.95 ms per token,     0.73 tokens per second)
llama_print_timings:        eval time =   10127.06 ms /    12 runs   (  843.92 ms per token,     1.18 tokens per second)
llama_print_timings:       total time =   64701.45 ms /    52 tokens


' with the Akatsuki, he became an assassin.'

In [None]:
# Ask the RAG chain a question and display its answer.
question = "who are you?"
rag_chain.invoke(question)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   27590.15 ms
llama_print_timings:      sample time =      16.60 ms /     4 runs   (    4.15 ms per token,   240.91 tokens per second)
llama_print_timings: prompt eval time =    9666.38 ms /    39 tokens (  247.86 ms per token,     4.03 tokens per second)
llama_print_timings:        eval time =    2432.42 ms /     3 runs   (  810.81 ms per token,     1.23 tokens per second)
llama_print_timings:       total time =   12124.21 ms /    42 tokens


' Itachi Uchiha.'