In [15]:
 # !pip install peft transformers accelerate bitsandbytes langchain
# !pip install langchain
# !pip install --upgrade ipympl

In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

from datasets import load_dataset
from peft import LoraConfig, PeftModel   # try QLora

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


### Loading Mistral Model

In [2]:
# Tokenizer: Not Quantized Model

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Add a quantized model

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="D:\\DemoProjects\\GenerativeAI\\Labs\\data\\base_models\\")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="D:\\DemoProjects\\GenerativeAI\\Labs\\data\\base_models\\")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Trainable Parameters 

In [4]:
def trainable_model_parameters(model):
    trainable_model_params = 0
    total_model_params = 0
    for _, param in model.named_parameters():
        total_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable Model Params: {trainable_model_params}\ntotal Model Params: {total_model_params}\npercentage of trainable Params: {100*(trainable_model_params/total_model_params):.3f}%"    

In [5]:
print(trainable_model_parameters(model))

trainable Model Params: 7241732096
total Model Params: 7241732096
percentage of trainable Params: 100.000%


### Mistral Text Generation Pipeline

In [6]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000
)

In [7]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [11]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

inputs_not_chat = tokenizer.encode_plus("[INST] I want to fly to Paris. Tell me about its weather? [/INST]", return_tensors="pt")['input_ids']

In [13]:
generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["<s> [INST] I want to fly to Paris. Tell me about its weather? [/INST] Paris, the capital city of France, is known for its mild and changeable weather. The climate in Paris is generally considered to be oceanic, which means that it experiences mild winters and cool summers. The average temperature in Paris hovers around 12 degrees Celsius (54 degrees Fahrenheit) throughout the year, with temperatures ranging from 1 degree Celsius (34 degrees Fahrenheit) in winter to 19 degrees Celsius (66 degrees Fahrenheit) in summer.\n\nParis experiences four distinct seasons. In the spring (March to May), the city comes alive with blooming flowers and mild temperatures. In the summer (June to August), temperatures can rise, and the city experiences long days with extended hours of sunlight. Autumn (September to November) brings crisp weather and falling leaves, while winter (December to February) can be wet and chilly, with occasional snowfall.\n\nIt's always a good idea to check the current weathe

In [None]:
prompt_template = """
[INST]
Instruction: Generate brief summary of the weather for the {city}:

{context}

### QUESTION:
{question} [/INST]
"""

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question", "city"],
    template=prompt_template,
)

In [None]:
# we can also use simple generatting flow
original_model.to('cuda')

inputs = tokenizer(prompt_template, return_tensors="pt").to(device)
outputs = original_model.generate(**inputs, max_new_tokens=5000, pad_token_id=tokenizer.eos_token_id, do_sample=True)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [None]:
city = input("Enter your destination city: ")
weather_data = get_weather(OPENWEATHERMAP_API_KEY, city)

# retrieved_results = retrieve_vector_db(query)[0][0]

rag_chain = ( 
 {"context": lambda x: weather_data, "question": RunnablePassthrough(), "city":lambda x: city}
    | llm_chain
)
rag_chain.invoke(f"I am traveling to {city}. Tell me about its weather?")['text']