In [15]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

In [16]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TextStreamer, BitsAndBytesConfig, TextIteratorStreamer
import torch
import gradio as gr
import threading

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [18]:
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [19]:
def load_model(model_name):

  quant_config = BitsAndBytesConfig(
    load_in_4bit=True, # Load model in 4-bit
    bnb_4bit_quant_type="nf4", # Use NF4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16, # Compute in bfloat16
    bnb_4bit_use_double_quant=True, # Enable nested quantization
  )

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer.pad_token = tokenizer.eos_token
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config= quant_config)
  return tokenizer, model

In [20]:
tokenizer, model = load_model(LLAMA)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"}
]

user_input = "What is Generative AI and Agentic AI?"

max_tokens = 2000

In [22]:
def generate_stream_model(tokenizer, model, user_input, max_tokens=2000):

  global messages

  messages.append({"role": "user", "content": user_input})

  inputs = tokenizer.apply_chat_template(messages, return_tensors='pt').to('cuda')

  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,
      decode_kwargs={"skip_special_token":True}
  )

  thread = threading.Thread(
      target = model.generate,
      kwargs={'inputs':inputs, 'max_new_tokens':max_tokens, 'streamer':streamer}
  )
  thread.start()

  for text_chunk in streamer:
    filtered_chunk = text_chunk.replace("<|eot_id|>","")
    print(filtered_chunk, end="")

In [23]:
generate_stream_model(tokenizer, model, user_input)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|start_header_id|>assistant<|end_header_id|>

Generative AI and Agentic AI are two related but distinct concepts in the field of Artificial Intelligence (AI).

**Generative AI:**
Generative AI refers to a type of AI that can generate new, original content, such as images, videos, music, text, or even entire systems. This is achieved through the use of algorithms that learn patterns and relationships in existing data, and then use this knowledge to create new, synthetic content that is similar in style and structure.

Examples of Generative AI include:

* Image generation: using neural networks to generate realistic images of objects, scenes, or people.
* Music generation: using algorithms to create new music that sounds similar to existing music.
* Text generation: using language models to generate new text, such as articles, stories, or even entire books.

**Agentic AI:**
Agentic AI, also known as "agent-based" AI, refers to a type of AI that can act autonomously and make decisions i

In [24]:
def generate_gradio_opt(user_input):

  global tokenizer, model, messages, max_tokens

  messages.append({"role": "user", "content": user_input})

  inputs = tokenizer.apply_chat_template(messages, return_tensors='pt').to('cuda')

  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,
      decode_kwargs={"skip_special_token":True}
  )

  thread = threading.Thread(
      target = model.generate,
      kwargs={'inputs':inputs, 'max_new_tokens':max_tokens, 'streamer':streamer}
  )
  thread.start()

  gather_reply = ""

  for text_chunk in streamer:

    filtered_chunk = text_chunk.replace("<|eot_id|>","")

    gather_reply += filtered_chunk

    yield gather_reply


  messages.append({"role": "assistant", "content": gather_reply})


In [25]:
with gr.Blocks() as demo:
  gr.Markdown("# Chat with LLAMA")
  with gr.Row():
    with gr.Column():
      user_input = gr.Textbox(label="Your Message Here", placeholder="Type Something here....")
      output_box = gr.Markdown(label="AI Response", min_height=50)
      send_button = gr.Button("Send")

  send_button.click(fn=generate_gradio_opt, inputs=user_input, outputs=output_box)


demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://dd92736f1dd88d1b12.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://dd92736f1dd88d1b12.gradio.live


