In [10]:
%pip install torch transformers huggingface_hub accelerate  gradio bitsandbytes timm sentencepiece diffusers
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Collecting diffusers
  Downloading diffusers-0.32.1-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.32.1-py3-none-any.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ------ --------------------------------- 0.5/3.2 MB 4.2 MB/s eta 0:00:01
   ------------------- -------------------- 1.6/3.2 MB 3.8 MB/s eta 0:00:01
   ----------------------------- ---------- 2.4/3.2 MB 4.1 MB/s eta 0:00:01
   ---------------------------------------- 3.2/3.2 MB 4.0 MB/s eta 0:00:00
Installing collected packages: diffusers
Successfully installed diffusers-0.32.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import notebook_login;
from transformers import pipeline, AutoImageProcessor, AutoModelForObjectDetection, AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan;
from accelerate import init_empty_weights;
import torch;
from PIL import Image, ImageDraw;
import gradio as gr;
import soundfile as sf;
import os;
from diffusers import StableDiffusionPipeline;

In [None]:
notebook_login();

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
os.environ["HF_HOME"] = "./bot";

device = "cpu";
if torch.cuda.is_available ():
    device = torch.device ("cuda");
elif torch.backends.mps.is_available():
    device = torch.device ("mps");

model_id = "meta-llama/Llama-3.2-3B-Instruct";
tokenizer = AutoTokenizer.from_pretrained (model_id);
model = AutoModelForCausalLM.from_pretrained (
    model_id,
    torch_dtype=torch.float16,
    device_map=None
).to (device);

processor = SpeechT5Processor.from_pretrained ("microsoft/speecht5_tts");
tts_model = SpeechT5ForTextToSpeech.from_pretrained ("microsoft/speecht5_tts");
vocoder = SpeechT5HifiGan.from_pretrained ("microsoft/speecht5_hifigan");
speaker_embeddings = torch.randn (1, 512);

ckpt = "yainage90/fashion-object-detection";
image_processor = AutoImageProcessor.from_pretrained (ckpt);
object_model = AutoModelForObjectDetection.from_pretrained (ckpt).to (device);

stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5";
pipe = StableDiffusionPipeline.from_pretrained (stable_diffusion_model_id, torch_dtype = torch.float32);

global_context = "";

def generate_text_and_audio (prompt):
    global global_context;

    global_context += f"User: {prompt}\n";
    formatted_prompt = f"{global_context}Assistant:";

    inputs = tokenizer (formatted_prompt, return_tensors="pt").to (device);
    generate_ids = model.generate (
        inputs.input_ids,
        temperature = 0.1,
        top_p = 0.85,
        top_k = 50,
        repetition_penalty = 1.2,
        do_sample = True,
        num_return_sequences = 1,
        max_length = 200
    )
    result_text = tokenizer.decode (generate_ids[0], skip_special_tokens = True).strip ();
    global_context += f"Assistant: {result_text}\n";

    inputs = processor (text = result_text, return_tensors = "pt");
    speech = tts_model.generate_speech (inputs["input_ids"], speaker_embeddings, vocoder = vocoder);

    audio_path = "response_audio.wav";
    sf.write (audio_path, speech.cpu ().numpy (), samplerate = 16000);

    return result_text, audio_path;

def detect_objects (image):

    image = image.convert ("RGB");

    with torch.no_grad ():
        inputs = image_processor (images = [image], return_tensors = "pt").to (device);
        outputs = object_model (**inputs);
        target_sizes = torch.tensor ([[image.size[1], image.size[0]]]).to (device);
        results = image_processor.post_process_object_detection (outputs, threshold = 0.4, target_sizes = target_sizes)[0];

    draw = ImageDraw.Draw (image);
    for score, label, box in zip (results["scores"], results["labels"], results["boxes"]):
        score = round (score.item(), 2);
        label = object_model.config.id2label[label.item ()];
        box = [round (i.item(), 2) for i in box];
        draw.rectangle (box, outline = "red", width = 3);
        draw.text ((box[0], box[1] - 10), f"{label} ({score})", fill = "red");

    return image;

def generate_image (prompt, height, width):
    image = pipe (prompt, height = int (height), width = int (width)).images[0];
    return image;

def build_interface ():
    with gr.Blocks () as demo:
        gr.Markdown ("# Text generator + TTS, object detection and image generator");

        with gr.Tab ("Text generator + TTS"):
            gr.Markdown ("Introduce your response");
            with gr.Row ():
                prompt = gr.Textbox (label = "Ask", placeholder = "Introduce your question...");
                generate_button = gr.Button ("Generate");
            with gr.Row ():
                output_text = gr.Textbox (label = "Generated response", lines = 10);
                output_audio = gr.Audio (label = "Answer audio");
            generate_button.click (
                generate_text_and_audio,
                inputs = [prompt],
                outputs = [output_text, output_audio]
            );

        with gr.Tab ("Object detection"):
            gr.Markdown ("Upload an image");
            with gr.Row ():
                image_input = gr.Image (type = "pil", label = "Upload your image");
                detect_button = gr.Button ("Object detection");
            with gr.Row ():
                image_output = gr.Image (type = "pil", label = "Result");
            detect_button.click (
                detect_objects,
                inputs = image_input,
                outputs = image_output
            );

        with gr.Tab ("Image generator"):
            gr.Markdown ("Generate an image from a prompt");
            with gr.Row ():
                image_prompt = gr.Textbox (label = "Text for the image", placeholder = "Describe the image to generate");
                image_height = gr.Number (label = "Height", value = 512);
                image_width = gr.Number (label = "Width", value = 512);
                generate_image_button = gr.Button ("Generate image");
            with gr.Row ():
                generated_image = gr.Image (label = "Generated image");
            generate_image_button.click (
                generate_image,
                inputs = [image_prompt, image_height, image_width],
                outputs = generated_image
            );

    return demo;

demo = build_interface ();
demo.launch ();

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
