# Load the Model and Tokenizer

In [18]:
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch.profiler
import time
import pandas as pd  # To format the profiler output into a table



# Device string handed to the transformers pipeline in load_model().
device = "cpu"

# Registry of selectable models: the key is the label shown in the UI dropdown,
# the value is the model identifier passed to `from_pretrained`.
llama_models = {
    "Llama 3.2 1B Instruct": "meta-llama/Llama-3.2-1B-Instruct"  
}

# function to load the model
def load_model(model_name):
    """Build a text-generation pipeline for the given model identifier.

    The tokenizer is loaded first, then the causal-LM weights; both are
    wrapped in a `pipeline` bound to the module-level `device`.

    Args:
        model_name: Identifier forwarded to both `from_pretrained` calls.

    Returns:
        The constructed text-generation pipeline.
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still fetched before the model.
    return pipeline(
        'text-generation',
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        model=AutoModelForCausalLM.from_pretrained(model_name),
        device=device,
    )

# Define the function to profile and store operations
def inference_profiler(model_name, operation, time_taken):
    """Record one timing sample for an operation of a model.

    Samples accumulate in the module-level `inference_profiling_data`
    mapping as ``{model_name: {operation: [time_taken, ...]}}``; missing
    levels are created on first use.

    Args:
        model_name: Key identifying the model.
        operation: Name of the operation being timed.
        time_taken: The measurement to append.
    """
    try:
        per_operation = inference_profiling_data[model_name]
    except KeyError:
        per_operation = inference_profiling_data[model_name] = {}

    try:
        samples = per_operation[operation]
    except KeyError:
        samples = per_operation[operation] = []

    samples.append(time_taken)


# Cache of loaded text-generation pipelines, keyed by the UI model label, so a
# model is only loaded the first time it is selected (see generate_chat).
model_cache = {}
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# this placeholder is still needed.
loaded_model = "null"
# Profiling results keyed by model. generate_chat() stores the profiler's
# key_averages() object here; inference_profiler() instead stores a nested
# {operation: [timings]} dict under the same top-level keys.
inference_profiling_data = {}

def generate_chat(user_input, history, model_choice, max_new_tokens=100):
    """Generate the assistant's reply to `user_input` and profile the call.

    The pipeline for `model_choice` is loaded on first use and cached in
    `model_cache`. Generation runs under the PyTorch CPU profiler; the
    aggregated averages are stored in `inference_profiling_data` under
    `model_choice` (replacing any previous entry) and printed for debugging.

    Args:
        user_input: Text of the new user message.
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            None to start a new conversation. The list is extended in place.
        model_choice: Key into `llama_models` selecting the model.
        max_new_tokens: Upper bound on the number of tokens generated for the
            reply, independent of how long the conversation already is.

    Returns:
        The history list with the user message and assistant reply appended.

    Raises:
        KeyError: If `model_choice` is not cached and not in `llama_models`.
    """
    if model_choice not in model_cache:
        model_cache[model_choice] = load_model(llama_models[model_choice])

    generator = model_cache[model_choice]

    if history is None:
        history = []

    history.append({"role": "user", "content": user_input})

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        record_shapes=True,
        with_stack=False,
    ) as prof:
        # `max_length` would count the prompt tokens as well, so a growing
        # chat history would leave no room for the reply; `max_new_tokens`
        # budgets only the generated continuation.
        outputs = generator(
            history,
            max_new_tokens=max_new_tokens,
            pad_token_id=generator.tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        prof.step()  # Mark the end of the single profiled step

    # The pipeline returns the whole conversation; the last message is the
    # newly generated assistant turn.
    response = outputs[-1]["generated_text"][-1]["content"]

    # Aggregate once and reuse for both storage and the debug print.
    averages = prof.key_averages()
    inference_profiling_data[model_choice] = averages
    print(averages.table(sort_by="self_cpu_time_total"))  # For debugging

    history.append({"role": "assistant", "content": response})

    return history


# Analysis Functions

In [19]:
# Column order of the "Torch Profile" table.
_PROFILE_COLUMNS = [
    "Name",
    "Self CPU %",
    "Self CPU",
    "CPU total %",
    "CPU total",
    "CPU time avg",
    "# of Calls",
]

# Maximum number of operator rows shown in the profile table.
_PROFILE_TOP_N = 20


def _format_profile_time(us):
    """Format a duration given in microseconds as μs, ms or s text."""
    if us < 1e3:
        return f"{us:.3f}μs"
    elif us < 1e6:
        return f"{us/1e3:.3f}ms"
    else:
        return f"{us/1e6:.3f}s"


def generate_analysis(selection, model_choice):
    """Build the analysis table for the selected analysis type.

    Args:
        selection: "Model Architecture" or "Torch Profile". Any other value
            (including None when nothing is selected) yields an empty table.
        model_choice: Key of the model whose data should be shown.

    Returns:
        A pandas DataFrame. For "Torch Profile" it holds the top operators by
        self CPU time plus a final summary row, or an empty table with the
        profile columns when no profile has been recorded for the model.
    """
    import pandas as pd

    if selection == "Model Architecture":
        return see_structure(model_choice)  # Structured model architecture

    if selection != "Torch Profile":
        # Previously this path fell through and returned None implicitly.
        return pd.DataFrame()

    profile_data = inference_profiling_data.get(model_choice, None)
    if not profile_data:
        return pd.DataFrame(columns=_PROFILE_COLUMNS)

    # Self times do not overlap between operators, so their sum is the total
    # profiled CPU time. It is the denominator for BOTH percentage columns:
    # summing inclusive (`cpu_time_total`) values would count nested
    # operators several times and produce misleadingly small shares.
    total_self_cpu = sum(event.self_cpu_time_total for event in profile_data)

    def share(value):
        """Return `value` as a percentage of the total self CPU time."""
        pct = (value / total_self_cpu * 100) if total_self_cpu > 0 else 0
        return f"{pct:.2f}%"

    ranked = sorted(
        profile_data,
        key=lambda event: event.self_cpu_time_total,
        reverse=True,
    )

    display_data = []
    for event in ranked[:_PROFILE_TOP_N]:
        avg_cpu = event.cpu_time_total / event.count if event.count > 0 else 0
        display_data.append({
            "Name": event.key,
            "Self CPU %": share(event.self_cpu_time_total),
            "Self CPU": _format_profile_time(event.self_cpu_time_total),
            "CPU total %": share(event.cpu_time_total),
            "CPU total": _format_profile_time(event.cpu_time_total),
            "CPU time avg": _format_profile_time(avg_cpu),
            "# of Calls": event.count,
        })

    # Summary row: total self CPU time over all events, not just the top N.
    display_data.append({
        "Name": "Self CPU time total",
        "Self CPU %": "",
        "Self CPU": _format_profile_time(total_self_cpu),
        "CPU total %": "",
        "CPU total": "",
        "CPU time avg": "",
        "# of Calls": "",
    })

    return pd.DataFrame(display_data, columns=_PROFILE_COLUMNS)


def see_structure(model_choice):
    """Describe the top-level modules of a cached model as a table.

    Args:
        model_choice: Key of the pipeline in `model_cache`.

    Returns:
        A DataFrame with one row per direct child module ("Layer" is the
        child's name, "Type" its string representation), or an empty
        two-column table when the model has not been loaded yet.
    """
    # Nothing cached for this choice: hand back an empty table.
    if model_choice not in model_cache:
        return pd.DataFrame(columns=["Layer", "Type"])

    # The cache stores pipelines; the underlying model hangs off `.model`.
    network = model_cache[model_choice].model

    rows = []
    for child_name, child_module in network.named_children():
        rows.append({"Layer": child_name, "Type": str(child_module)})
    return pd.DataFrame(rows)


# Gradio Block

In [20]:
# Two-row Gradio UI: a chat panel on top and an analysis panel below it.
with gr.Blocks(css="""
.small-font table {
  font-size: 12px !important;
}
""") as demo:
    # Labels offered in the model dropdown; the first one is the default.
    model_names = list(llama_models.keys())
    default_model = model_names[0] if model_names else None

    # First Row: Chatbot Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("<h1><center>Chat with Llama Models</center></h1>")
            # Preselect the default model so the chat and the analysis panel
            # always refer to the same model. Without a value the chat fell
            # back to the first model while the analysis received None and
            # showed an empty table.
            model_choice = gr.Dropdown(
                model_names,
                value=default_model,
                label="Select Llama Model",
            )
            chatbot = gr.Chatbot(label="Chatbot Interface", type="messages")
            txt_input = gr.Textbox(show_label=False, placeholder="Type your message here...")

            def respond(user_input, chat_history, model_choice):
                """Handle one chat submission.

                Returns a pair: an empty string that clears the textbox, and
                the chat history to display.
                """
                # Ignore blank submissions instead of sending an empty user
                # turn to the model.
                if not user_input or not user_input.strip():
                    return "", chat_history
                if model_choice is None:
                    model_choice = default_model
                updated_history = generate_chat(user_input, chat_history, model_choice)
                return "", updated_history

            # Pressing Enter in the textbox and clicking the button do the same thing.
            txt_input.submit(respond, [txt_input, chatbot, model_choice], [txt_input, chatbot])
            submit_btn = gr.Button("Submit")
            submit_btn.click(respond, [txt_input, chatbot, model_choice], [txt_input, chatbot])
    
    # Second Row: Analysis Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("<h1><center>Analysis</center></h1>")
            analysis_dropdown = gr.Dropdown(
                choices=["Model Architecture", "Torch Profile"],
                label="Select Analysis Type"
            )
            # Apply the "small-font" class to reduce font size of the table.
            analysis_table = gr.Dataframe(label="Analysis Details", elem_classes="small-font")
            analysis_btn = gr.Button("Generate Analysis")
            analysis_btn.click(generate_analysis, [analysis_dropdown, model_choice], [analysis_table])




# Running Inference

In [None]:
# Start the Gradio app defined above; `inbrowser=True` asks Gradio to open the
# local URL in a browser tab once the server is up.
demo.launch(inbrowser=True)


* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




: 