In [None]:
!pip install transformers optimum auto-gptq

In [None]:
# Log in to the 🤗 Hub before running the cells below; an authenticated
# session is needed to download the model.
# NOTE(review): presumably required because the mistralai checkpoint is gated — confirm.
from huggingface_hub import login
login()

In [None]:
# Simplest route: let the 🤗 Transformers `pipeline` helper handle tokenisation,
# chat formatting and generation in one call.
from transformers import pipeline

MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
SYSTEM_PROMPT = "You are a helpful Lead Machine Learning Engineer that gets user started with Mistral7B"
USER_PROMPT = "How can I run Mistral7b on the free tier version of Google Colab?"

# Chat-style input: one system turn followed by the user's question.
messages = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(role="user", content=USER_PROMPT),
]

# `device_map="auto"` lets the library decide where to place the weights;
# each reply is capped at 256 newly generated tokens.
pipe = pipeline(
    task="text-generation",
    model=MODEL_ID,
    device_map="auto",
    max_new_tokens=256,
)

# Last expression of the cell, so the notebook displays the generated chat.
pipe(messages)

In [None]:
# For faster inference, you can also use an optimised (GPTQ-quantised) model
# from the 🤗 Community instead of the original checkpoint.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hub repository of the quantised checkpoint.
model_name_or_path = "thesven/Mistral-7B-Instruct-v0.3-GPTQ"

# Fast (Rust-backed) tokenizer that ships with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Let the library place the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)

# Padding is a tokenizer / generation setting; assigning `model.pad_token`
# only creates an unused attribute. Reuse EOS as the pad token when the
# tokenizer does not define one, and hand the id to `generate()` below.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Build the prompt with the tokenizer's own chat template instead of
# hand-writing control tags, so the text matches the format the model was
# trained on. The system instruction is folded into the user turn so the
# prompt does not depend on the checkpoint's template accepting a separate
# `system` role.
system_prompt = "You are a helpful Lead Machine Learning Engineer that gets user started with Mistral7B"
user_prompt = "How can I run Mistral7b on the free tier version of Google Colab?"
chat = [{"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}]
prompt_template = tokenizer.apply_chat_template(
    chat,
    tokenize=False,
    add_generation_prompt=True,
)

# The rendered template already contains the special tokens, so do not let the
# tokenizer prepend a second BOS. Move the tensors to wherever the model lives
# rather than assuming CUDA.
encoded = tokenizer(
    prompt_template,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)
input_ids = encoded.input_ids

# Low-temperature nucleus / top-k sampling, capped at 256 new tokens. The
# attention mask and pad token id are passed explicitly so `generate()` does
# not have to guess them.
output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    temperature=0.1,
    do_sample=True,
    top_p=0.95,
    top_k=40,
    max_new_tokens=256,
)

# Decode only the newly generated tokens (the prompt occupies the first
# `input_ids.shape[-1]` positions) and drop special tokens from the text.
generated_ids = output[0, input_ids.shape[-1]:]
print(tokenizer.decode(generated_ids, skip_special_tokens=True))
