* Install the required libraries

In [None]:
!pip install -q transformers accelerate torch gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

* Import the required libraries

In [2]:
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image
import gradio as gr
import re
import gc

* Initialize placeholders for models and processors so they can be loaded later

In [3]:
# Lazily populated handles: each one stays None until its loader function runs.
# Code generation: the DeepSeek-R1-Distill-Qwen-1.5B model and its matching tokenizer.
code_model, code_tokenizer = None, None
# Image description: the LLaVA v1.6 Mistral 7B model and the processor that prepares its inputs.
vision_model, vision_processor = None, None

In [4]:
def unload_models(except_model=None):
    """
    Release the loaded models, optionally keeping one of them.

    Only the two model globals are touched; the tokenizer and processor
    globals are left as they are.

    Parameters:
        except_model (str | None):
            "code"   -> the code model is kept.
            "vision" -> the vision model is kept.
            Any other value (including None) releases both models.
    """
    global code_model, vision_model

    keep_code = except_model == "code"
    keep_vision = except_model == "vision"

    if code_model is not None and not keep_code:
        code_model = None   # Drop the global reference to the code model
        gc.collect()        # Reclaim the now-unreferenced object

    if vision_model is not None and not keep_vision:
        vision_model = None  # Drop the global reference to the vision model
        gc.collect()         # Reclaim the now-unreferenced object

    # Hand cached, unused CUDA memory back to the driver
    torch.cuda.empty_cache()

In [5]:
def load_code_model():
    """
    Make sure the DeepSeek code generation model and its tokenizer are in memory.

    When no code model is loaded yet, every other model is unloaded first and
    the tokenizer and model are then fetched from the Hugging Face Hub.

    Returns:
        str: Status message saying whether the model was just loaded or was
        already present.
    """
    global code_model, code_tokenizer

    # Guard clause: a model is already resident, so there is nothing to do
    if code_model is not None:
        return "✅ Code model already loaded."

    # Make room by releasing any model other than the code model
    unload_models(except_model="code")

    # Hugging Face Hub identifier of the checkpoint
    checkpoint = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # Tokenizer that matches the code generation model
    code_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

    # Model weights: custom repository code allowed, half precision,
    # device placement delegated to device_map="auto"
    load_options = dict(
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    code_model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

    # Release any cached GPU memory left over from loading
    torch.cuda.empty_cache()
    return "✅ Code model loaded successfully."

In [6]:
def _extract_first_function(text):
    """Return the source of the first function definition in `text`, or None if there is none."""
    lines = text.splitlines()
    start = next(
        (i for i, line in enumerate(lines) if line.lstrip().startswith("def ")),
        None,
    )
    if start is None:
        # Fall back to a 'def ' that appears mid-line (e.g. "Code: def f(x): ...")
        offset = text.find("def ")
        if offset == -1:
            return None
        lines = text[offset:].splitlines()
        start = 0

    header = lines[start]
    def_indent = len(header) - len(header.lstrip())

    kept = []
    bracket_depth = 0      # Naive bracket balance, used to detect the end of a multi-line signature
    header_done = False
    for line in lines[start:]:
        stripped = line.strip()
        if not stripped:
            continue  # Skip empty lines
        if header_done:
            indent = len(line) - len(line.lstrip())
            if indent <= def_indent:
                # First dedented line after the signature: the function body is over
                # (trailing explanations, example calls, closing markdown fences, ...)
                break
        if stripped.startswith("#") and "your code here" in stripped.lower():
            continue  # Remove placeholder comments
        kept.append(line)
        if not header_done:
            opened = sum(line.count(ch) for ch in "([{")
            closed = sum(line.count(ch) for ch in ")]}")
            bracket_depth += opened - closed
            header_done = bracket_depth <= 0

    return "\n".join(kept).strip()


def generate_code(prompt, temperature=0.3, top_p=0.95, max_new_tokens=256, repetition_penalty=1.1):
    """
    Generate a Python function from a natural language prompt using the DeepSeek model.

    Only the newly generated tokens are decoded (the echoed prompt is sliced
    off), and the first function definition is returned in full: extraction
    stops at the first line that is no longer indented under the `def`, not at
    the first `return`, so functions with several return statements stay intact.

    Parameters:
        prompt (str): The task or question to turn into Python code.
        temperature (float): Sampling temperature passed to `generate`.
        top_p (float): Nucleus sampling threshold passed to `generate`.
        max_new_tokens (int): Maximum number of tokens to generate; cast to int
            because UI sliders may deliver a float.
        repetition_penalty (float): Repetition penalty passed to `generate`.

    Returns:
        str: The extracted function source, or a warning message when the model
        is not loaded, the prompt is empty, or no function was generated.
    """
    # Make sure the model and tokenizer are loaded
    if code_model is None or code_tokenizer is None:
        return "⚠️ Please load the code model first."

    # Nothing useful can be generated from an empty prompt
    if not prompt or not prompt.strip():
        return "⚠️ Please enter a prompt."

    # Create a prompt that clearly tells the model to generate only Python code
    formatted_prompt = (
        "Write only Python code for the following task.\n"
        f"Task: {prompt.strip()}\n"
        "Do not include any explanation, markdown, or text outside the function.\n"
        "Only return clean executable Python code.\n"
        "Code:\n"
    )
    # Tokenize the prompt and move it to the model's device
    inputs = code_tokenizer(formatted_prompt, return_tensors="pt").to(code_model.device)
    # Generate code using the model
    outputs = code_model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),         # How much code to generate
        temperature=temperature,                    # More or less randomness
        top_p=top_p,                                # Nucleus sampling threshold
        do_sample=True,                             # Sample instead of greedy decoding
        repetition_penalty=repetition_penalty,      # Discourage repeating the same text
        pad_token_id=code_tokenizer.eos_token_id,   # Token used for padding
        eos_token_id=code_tokenizer.eos_token_id    # Token that tells the model to stop
    )
    # The returned sequence starts with the prompt tokens; decode only what follows
    # so that text from the instructions is never mistaken for generated code
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = code_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    function_source = _extract_first_function(generated_text)
    if function_source:
        return function_source

    # If no valid function was generated
    return "⚠️ No function found in output. Try rephrasing the prompt."


In [7]:
def load_vision_model():
    """
    Make sure the image description model and its processor are in memory.

    When no vision model is loaded yet, every other model is unloaded first and
    the processor and model are then fetched from the Hugging Face Hub.

    Returns:
        str: Status message saying whether the model was just loaded or was
        already present.
    """
    global vision_model, vision_processor

    # Guard clause: a model is already resident, so skip reloading
    if vision_model is not None:
        return "✅ Vision model already loaded."

    # Make room by releasing any model other than the vision model
    unload_models(except_model="vision")

    # Hugging Face Hub identifier of the checkpoint
    checkpoint = "llava-hf/llava-v1.6-mistral-7b-hf"

    # Processor that prepares the image and text inputs
    vision_processor = AutoProcessor.from_pretrained(checkpoint)

    # Model weights in half precision, device placement delegated to device_map="auto"
    vision_model = AutoModelForVision2Seq.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # Release any cached GPU memory left over from loading
    torch.cuda.empty_cache()
    return "✅ Vision model loaded successfully."

In [8]:
def clean_trailing_cutoff(text):
    """
    Trim an incomplete sentence or cut-off word from the end of generated text.

    Text that already ends with sentence punctuation (optionally followed by a
    closing quote or bracket) is returned unchanged apart from surrounding
    whitespace. Previously such text fell through to the "drop the last word"
    branch and lost its final word.

    Parameters:
        text (str): The generated text that may end abruptly.

    Returns:
        str: The text cut back to the last complete sentence, or, when it
        contains no sentence boundary, with its last (possibly partial) word
        removed. Single-word and empty inputs are returned stripped.
    """
    stripped = text.strip()
    if not stripped:
        return stripped

    # Already ends on a finished sentence: nothing to trim
    if re.search(r"[.!?][\"')\]]*$", stripped):
        return stripped

    # Otherwise cut back to the last '.', '!' or '?' that is followed by
    # whitespace, dropping the unfinished sentence after it
    match = re.search(r"([.!?])\s+[^.!?]*$", stripped)
    if match:
        return stripped[:match.end(1)].strip()

    # No sentence boundary at all: drop the last word, which may be incomplete
    words = stripped.split()
    if len(words) > 1:
        return " ".join(words[:-1])

    # A single word is returned as-is
    return stripped


In [9]:
def describe_image(image, prompt, temperature=0.7, top_p=0.9, max_new_tokens=128):
    """
    Generate a description or answer for an image and a text prompt using the
    loaded vision-language model.

    The prompt is removed from the result by decoding only the tokens that
    follow the input sequence, rather than by searching the decoded text for
    the prompt string. All preprocessing runs inside the error handler, so
    failures are reported as a message instead of raising.

    Parameters:
        image (PIL.Image or array-like): The input image; anything that is not
            a PIL image is converted with `Image.fromarray`.
        prompt (str): Text prompt (e.g. "What is happening in this image?").
            None is treated as an empty prompt.
        temperature (float): Sampling temperature passed to `generate`.
        top_p (float): Nucleus sampling threshold passed to `generate`.
        max_new_tokens (int): Maximum number of tokens to generate; cast to int
            because UI sliders may deliver a float.

    Returns:
        str: The cleaned-up model response, or a message describing why no
        response could be produced.
    """
    # Make sure the vision model and its processor are loaded
    if vision_model is None or vision_processor is None:
        return "⚠️ Please load the image model first."
    # Ensure an image is provided
    if image is None:
        return "Please upload an image."

    try:
        # Convert non-PIL input (e.g. a NumPy array) to a PIL image
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        tokenizer = vision_processor.tokenizer
        # Look up the image placeholder token that is put in front of the prompt
        image_token = tokenizer.convert_ids_to_tokens(tokenizer.image_token_id)

        # Combine the image token and the user prompt
        prompt = (prompt or "").strip()
        full_prompt = f"{image_token} {prompt}"

        # Process the image and text together and move them to the model's device
        inputs = vision_processor(
            images=image, text=full_prompt, return_tensors="pt"
        ).to(vision_model.device)

        # Generate a response without tracking gradients
        with torch.no_grad():
            generated_ids = vision_model.generate(
                **inputs,
                do_sample=True,                      # Enable sampling (not deterministic)
                temperature=temperature,             # Add randomness
                top_p=top_p,                         # Limit sampling to top-p probability mass
                max_new_tokens=int(max_new_tokens),  # Limit output length
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )

        # The returned sequence starts with the input tokens; decode only the continuation
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(
            generated_ids[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Remove any incomplete sentence or word from the end of the output
        return clean_trailing_cutoff(generated_text)

    except Exception as e:
        # Report any failure during preprocessing or generation as text
        return f"Error during generation: {str(e)}"

In [10]:
# Ready-made prompts offered as one-click examples for the code generation prompt box
example_prompts = [
    "create a function to calculate factorial of a number",
    "Write a function that returns the square of a number",
    "make a function that adds two numbers",
    "Write Python code to check if a number is even or odd",
    "write code to find the largest number in a list",
    "create a simple calculator that can add, subtract, multiply and divide",
]

In [11]:
# Build the Gradio app: a titled Blocks container holding two tabs,
# one for image description and one for code generation.
with gr.Blocks(title="AI Assistant") as demo:

    # Use tabs to organize features into separate views
    with gr.Tabs():


        # --- IMAGE DESCRIPTION TAB ---
        with gr.Tab("🖼️ Image Description"):
            gr.Markdown("## 🖼️ Image Description Generator")
            # Status box; starts as "not loaded" and is overwritten by load_vision_model's message
            vision_status = gr.Textbox(label="Model Status", value="❌ Model not loaded")
            # Button to load the vision model
            load_vision_btn = gr.Button("🔄 Load Image Model")
            # Two-column layout: inputs on the left, output on the right
            with gr.Row():
                with gr.Column():
                    # Image upload, delivered to the callback as a PIL image (type="pil")
                    image_input = gr.Image(type="pil", label="Upload an Image")
                    # Prompt for what to ask the model about the image (pre-filled, editable)
                    image_prompt = gr.Textbox(
                        lines=2, label="Prompt for Image Description",
                        value="Describe this image in detail", interactive=True)
                    # Sliders for the generation parameters; defaults match describe_image's defaults
                    image_temperature = gr.Slider(0.1, 1.5, step=0.1, value=0.7, label="Temperature")
                    image_top_p = gr.Slider(0.1, 1.0, step=0.05, value=0.9, label="Top-p")
                    image_max_tokens = gr.Slider(10, 512, step=16, value=128, label="Max New Tokens")
                    # Button to generate the image description
                    describe_btn = gr.Button("🖋️ Generate Description")

                with gr.Column():
                    # Text output for the image description
                    image_output = gr.Textbox(label="Generated Description", lines=8)

            # Load button: run load_vision_model and show its status message
            load_vision_btn.click(fn=load_vision_model, outputs=vision_status)
            # Generate button: inputs are passed positionally in describe_image's parameter order
            describe_btn.click(fn=describe_image,
                               inputs=[image_input, image_prompt, image_temperature, image_top_p, image_max_tokens],
                               outputs=image_output)

        # --- CODE GENERATOR TAB ---
        with gr.Tab("💻 Code Generation"):
            gr.Markdown("## 🧠 DeepSeek Code Generator")
            # Status box; starts as "not loaded" and is overwritten by load_code_model's message
            code_status = gr.Textbox(label="Model Status", value="❌ Model not loaded")
            # Button to load the code generation model
            load_code_btn = gr.Button("🔄 Load Code Model")
            # Two-column layout: inputs on the left, output on the right
            with gr.Row():
                with gr.Column():
                    # Prompt input box for natural language code requests
                    prompt_input = gr.Textbox(
                        label="Enter your prompt", lines=3,
                        placeholder="Describe the code you want...")
                    # Generation parameters inside a section that starts collapsed;
                    # defaults match generate_code's defaults
                    with gr.Accordion("Advanced Parameters", open=False):
                        temperature = gr.Slider(0.1, 1.5, value=0.3, step=0.1, label="Temperature")
                        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
                        max_tokens = gr.Slider(64, 1024, value=256, step=32, label="Max New Tokens")
                        repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repetition Penalty")
                    # Button to trigger code generation
                    generate_btn = gr.Button("🚀 Generate Code")

                with gr.Column():
                    # Output area for generated Python code (editable by the user)
                    code_output = gr.Code(
                        label="Generated Python Code", language="python", interactive=True)

            # Load button: run load_code_model and show its status message in code_status
            load_code_btn.click(fn=load_code_model, outputs=code_status)
            # Generate button: inputs are passed positionally in generate_code's parameter order
            generate_btn.click(fn=generate_code,
                               inputs=[prompt_input, temperature, top_p, max_tokens, repetition_penalty],
                               outputs=code_output)
            # Clickable examples that populate the prompt input
            gr.Examples(example_prompts, inputs=prompt_input)

# Launch the app. In the recorded Colab run, debug=True kept the cell running to show
# errors and logs, and share=True produced a temporary public gradio.live URL.
demo.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d747634e87be32d0e9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d747634e87be32d0e9.gradio.live


