<a href="https://colab.research.google.com/github/InduwaraGayashan001/Generative-AI/blob/main/Mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install -q langchain-community
!pip install sentence_transformers
!pip install llama-index==0.9.39

In [11]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt

# Load Data

In [5]:
!mkdir Data

In [9]:
documents = SimpleDirectoryReader("/content/Data/").load_data()

In [10]:
len(documents)

11

# Prompt

In [12]:
system_prompt = """You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
"""
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Load the Model

In [None]:
!huggingface-cli login

In [None]:
import torch
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
    model_name="mistralai/Mistral-7B-Instruct-v0.1",
    device_map="auto",
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
)


In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext
from llama_index.embeddings import LangchainEmbedding

embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model
)

# Q/A

In [17]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("What is Multi-Head Attention?")

In [21]:
from IPython.display import Markdown, display
display(Markdown(f"<b>{response}</b>"))

<b>Multi-Head Attention is a technique used in the Transformer model for sequence-to-sequence tasks. It involves performing multiple attention functions in parallel, each with different projections of the queries, keys, and values. This allows the model to jointly attend to information from different representation subspaces at different positions. The attention function is scaled by 1√dk
, where dk is the reduced dimension of each head.</b>