In [10]:
# https://medium.com/@mayadakhatib/rag-a-simple-practical-example-using-llama-index-and-huggingface-fab3e5aa7442

In [12]:
%pip install llama-index
%pip install docx2txt

%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor
%pip install optimum
%pip install auto-gptq
%pip install -U accelerate bitsandbytes datasets peft transformers



In [11]:
from google.colab import drive
drive.mount('/content/drive')
input_dir = '/content/drive/MyDrive/colab_input_data/docs/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
# initialize the LLM
# Parameters that can be tuned later to test different models and different chunk sizes
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = None
Settings.chunk_size = 256
Settings.chunk_overlap = 15

LLM is explicitly disabled. Using MockLLM.


In [14]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [15]:

embeddings = embed_model.get_text_embedding("Hello World!")
print(len(embeddings))
print(embeddings[:5])

384
[-0.0032757006119936705, -0.011690812185406685, 0.041559189558029175, -0.03814816474914551, 0.024183034896850586]


In [16]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Settings

documents = SimpleDirectoryReader(
    input_dir=input_dir, required_exts=[".docx"]
).load_data()

In [19]:
from llama_index.core.retrievers import VectorIndexRetriever

index = VectorStoreIndex.from_documents(documents)

# set number of docs to retreive
top_k = 2

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

In [20]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)

# query documents - testing
query = "Where Paul Yudkin Studied ?"
response = query_engine.query(query)
#print(response)
context = "Context:\n"
for i in range(top_k):
    context = context + response.source_nodes[i].text + "\n\n"

print(len(context))

1971


In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             trust_remote_code=False,
                                             revision="main",
                                             device_map="cuda:0"
                                             )
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Mistral-7B-Instruct-v0.2-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_pr

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [22]:
prompt_template_w_context = lambda context, comment: f"""[INST]MayaGPT, functioning as a virtual data science consultant on Medium, communicates in clear, accessible language, escalating to technical depth upon request. \
It reacts to feedback aptly and ends responses with its signature '-MayaGPT'. \
MayaGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, \
thus keeping the interaction natural and engaging.
If MayaGPT CANNOT answer a question or DOESN'T find proper information, MayaGPT will simple respond that it doesn't know.

{context}
Please respond to the following comment. Use the context above if it is helpful.

{comment}
[/INST]
"""

In [23]:
comment = "Where Paul Studied ?"
prompt = prompt_template_w_context(context, comment)

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<s> [INST]MayaGPT, functioning as a virtual data science consultant on Medium, communicates in clear, accessible language, escalating to technical depth upon request. It reacts to feedback aptly and ends responses with its signature '-MayaGPT'. MayaGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, thus keeping the interaction natural and engaging.
If MayaGPT CANNOT answer a question or DOESN'T find proper information, MayaGPT will simple respond that it doesn't know.

Context:
Paul Yudkin

Tel Aviv | 054-4442821 | paul.yudkin@gmail.com
[GitHub/LinkedIn link, if applicable]

Summary

Algorithm developer specializing in computer vision and deep learning. Proven expertise in designing and implementing algorithms for facial recognition, fine-grained recognition, image segmentation, and generative models. Strong track record of delivering innovative solutions across industries.

Technic