<h1 align="center">Large Language Models</h1>

## Integrantes

- Josué Say

## Repositorio

- [Enlace a GitHub](https://github.com/JosueSay/HugginsFaceModels)

In [42]:
# %pip install -r requirements.txt

## Librerías


In [43]:
import os, time, psutil
from dotenv import load_dotenv
from huggingface_hub import login, whoami
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from rich.live import Live
from rich.table import Table
import pandas as pd
from transformers import TextIteratorStreamer
from threading import Thread
import matplotlib.pyplot as plt

## Constantes

In [44]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Hugging Face access token; an empty string is used when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Hugging Face Hub repository ids of the models used in this notebook.
MODEL_ID_TINY_LLAMA = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_ID_QWEN = "Qwen/Qwen2.5-0.5B-Instruct"
MODEL_ID_DOCTOR_SHOTGUN = "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct"
MODEL_ID_META_LLAMA = "meta-llama/Llama-2-7b-hf"

## Login y device

In [45]:
def loginHF(hfToken: str) -> None:
    """Log in to the Hugging Face Hub using ``hfToken``.

    A warning is printed first when the token is an empty string; ``login`` is
    still invoked with that value. A confirmation is printed once ``login``
    returns.

    Args:
        hfToken: Access token passed straight to ``huggingface_hub.login``.
    """
    tokenIsEmpty = hfToken == ""
    if tokenIsEmpty:
        print("No se encontró HF_TOKEN en el .env. Usando valor vacío...")

    login(hfToken)
    print("Login de HuggingFace completado.")

In [46]:
def getDevice() -> str:
    """Return the preferred torch backend name: "cuda", "mps" or "cpu".

    CUDA is checked first, then Apple's MPS backend (only when this torch
    build exposes ``torch.backends.mps``); "cpu" is the fallback.
    """
    if torch.cuda.is_available():
        return "cuda"

    # Older torch builds have no `mps` attribute, hence the getattr probe.
    mpsBackend = getattr(torch.backends, "mps", None)
    if mpsBackend and mpsBackend.is_available():
        return "mps"

    return "cpu"

## Métricas de sistema

In [47]:
def getSystemUsage():
    """Take a snapshot of host CPU and RAM usage through psutil.

    Returns:
        dict with three entries:
          - "cpu_pct": value returned by ``psutil.cpu_percent(interval=None)``.
          - "ram_pct": virtual-memory usage percentage, rounded to 2 decimals.
          - "ram_used_mb": ``total - available`` memory in MiB, rounded to 1 decimal.
    """
    memory = psutil.virtual_memory()
    bytesPerMb = 1024**2

    # NOTE(review): interval=None is the non-blocking form; presumably the very
    # first reading in a fresh kernel is not meaningful — confirm with psutil docs.
    cpuPct = psutil.cpu_percent(interval=None)
    ramPct = round(memory.percent, 2)
    ramUsedMb = round((memory.total - memory.available) / bytesPerMb, 1)

    return {"cpu_pct": cpuPct, "ram_pct": ramPct, "ram_used_mb": ramUsedMb}

In [48]:
def getGpuUsage():
    """Report the current CUDA device name and its torch memory counters.

    Returns:
        dict with "gpu" (device name, or "none" without CUDA),
        "vram_alloc_mb" and "vram_reserved_mb" (MiB, rounded to 1 decimal;
        both 0.0 without CUDA).
    """
    if not torch.cuda.is_available():
        return {"gpu": "none", "vram_alloc_mb": 0.0, "vram_reserved_mb": 0.0}

    deviceIndex = torch.cuda.current_device()
    # Wait for pending kernels so the counters are read after queued work finishes.
    torch.cuda.synchronize()

    bytesPerMb = 1024**2
    allocatedMb = torch.cuda.memory_allocated(deviceIndex) / bytesPerMb
    reservedMb = torch.cuda.memory_reserved(deviceIndex) / bytesPerMb

    return {
        "gpu": torch.cuda.get_device_name(deviceIndex),
        "vram_alloc_mb": round(allocatedMb, 1),
        "vram_reserved_mb": round(reservedMb, 1),
    }

## Formateadores de prompts

In [49]:
def fmtTinyLlama(msg, system="You are a helpful assistant."):
    """Build the chat payload (system turn + user turn) for TinyLlama.

    The returned message list is meant to be rendered through the
    tokenizer's chat template rather than used as a raw string.

    Args:
        msg: User message content.
        system: System prompt content.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts.
    """
    systemTurn = {"role": "system", "content": system}
    userTurn = {"role": "user", "content": msg}
    return [systemTurn, userTurn]

In [50]:
def fmtQwen(msg, system="You are Qwen, a helpful assistant."):
    """Build the chat payload (system turn + user turn) for Qwen.

    Args:
        msg: User message content.
        system: System prompt content.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts.
    """
    turns = (("system", system), ("user", msg))
    return [{"role": role, "content": content} for role, content in turns]

In [51]:
def fmtDoctorShotgun(msg, system="Follow the instruction."):
    """Render the modified Alpaca prompt described on the model card.

    The prompt is assembled from explicit line fragments so that the source
    code indentation never leaks into the text sent to the model (a
    triple-quoted literal indented inside the function would embed that
    leading whitespace on every line).

    Args:
        msg: User message placed under the "### Input:" header.
        system: Instruction placed under the "### Instruction:" header.

    Returns:
        The prompt string, ending right after "### Response:" and a newline.
    """
    return (
        "### Instruction:\n"
        f"{system}\n"
        "\n"
        "### Input:\n"
        f"{msg}\n"
        "\n"
        "### Response:\n"
    )

In [52]:
def fmtLlama2(msg, system=None):
    """Format a prompt for the pretrained (non-chat) Llama 2 checkpoint.

    The base model has no chat roles, so the user text is passed through
    unchanged. The optional ``system`` parameter exists so this formatter can
    be called with the same ``(msg, system)`` positional signature as the
    other formatters; without it, supplying a system text raised TypeError.

    Args:
        msg: User text.
        system: Optional context. When non-empty it is placed before ``msg``,
            separated by a blank line.

    Returns:
        ``msg`` alone, or ``system`` + blank line + ``msg``.
    """
    if system:
        return f"{system}\n\n{msg}"
    return msg

## Presets por modelo

In [53]:
# Per-model configuration, keyed by Hugging Face model id:
#   - "gen_kwargs": default sampling/generation parameters
#   - "prompt_fmt": function that turns user (and system) text into a prompt payload
#   - "use_chat_template": whether the payload must go through the tokenizer's chat template
MODEL_PRESETS = {
    MODEL_ID_TINY_LLAMA: {
        "gen_kwargs": {
            "max_new_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 50,
            "repetition_penalty": 1.1,
            "do_sample": True,
        },
        "prompt_fmt": fmtTinyLlama,
        "use_chat_template": True,
    },
    MODEL_ID_QWEN: {
        "gen_kwargs": {
            "max_new_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repetition_penalty": 1.05,
            "do_sample": True,
        },
        "prompt_fmt": fmtQwen,
        "use_chat_template": True,
    },
    MODEL_ID_DOCTOR_SHOTGUN: {
        "gen_kwargs": {
            "max_new_tokens": 300,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repetition_penalty": 1.05,
            "do_sample": True,
        },
        "prompt_fmt": fmtDoctorShotgun,
        "use_chat_template": False,
    },
    MODEL_ID_META_LLAMA: {
        "gen_kwargs": {
            "max_new_tokens": 200,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "repetition_penalty": 1.1,
            "do_sample": True,
        },
        "prompt_fmt": fmtLlama2,
        "use_chat_template": False,
    },
}

## Cargar modelo y tokenizer

In [54]:
def loadModel(modelId, use4bit=False, dtype=None):
    """Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    Args:
        modelId: Hugging Face Hub model id.
        use4bit: When True, load the weights 4-bit quantized through
            bitsandbytes; ``dtype`` is not applied to the weights in this mode.
        dtype: torch dtype for the non-quantized load. When None it is chosen
            from the detected device: bfloat16 on CUDA if supported, float16 on
            other CUDA devices and on MPS, float32 on CPU.

    Returns:
        ``(gen, tokenizer)`` where ``gen`` is a ``text-generation`` pipeline
        configured with ``return_full_text=False``.
    """
    device = getDevice()
    if dtype is None:
        if device == "cuda":
            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        elif device == "mps":
            dtype = torch.float16
        else:
            # Half precision on CPU is slow and, depending on the torch build,
            # not implemented for every operator; float32 is the safe default.
            # Pass `dtype` explicitly to trade that safety for memory.
            dtype = torch.float32

    tokenizer = AutoTokenizer.from_pretrained(modelId, use_fast=True)

    if use4bit:
        # Passing `load_in_4bit=True` straight to from_pretrained is deprecated;
        # the supported route is a BitsAndBytesConfig in `quantization_config`.
        from transformers import BitsAndBytesConfig

        model = AutoModelForCausalLM.from_pretrained(
            modelId,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(modelId, torch_dtype=dtype, device_map="auto")

    gen = pipeline("text-generation", model=model, tokenizer=tokenizer, return_full_text=False)
    print(f"Modelo '{modelId}' cargado en {device}.")
    return gen, tokenizer

## Construcción de prompt

In [55]:
def buildPrompt(modelId, userText, systemText=None, tokenizer=None):
    """Build the prompt for ``modelId`` from its entry in ``MODEL_PRESETS``.

    The preset's ``prompt_fmt`` is called with ``userText`` alone when
    ``systemText`` is None, otherwise with ``(userText, systemText)``. When the
    preset sets ``use_chat_template``, the formatter output is rendered to a
    string with ``tokenizer.apply_chat_template`` (generation prompt appended);
    otherwise the formatter output is returned as is.

    Args:
        modelId: Key into ``MODEL_PRESETS``.
        userText: User message.
        systemText: Optional system text.
        tokenizer: Tokenizer used only for chat-template presets.

    Returns:
        The prompt to feed to the model.
    """
    preset = MODEL_PRESETS[modelId]
    formatter = preset["prompt_fmt"]

    if systemText is None:
        payload = formatter(userText)
    else:
        payload = formatter(userText, systemText)

    if not preset.get("use_chat_template", False):
        return payload
    return tokenizer.apply_chat_template(payload, tokenize=False, add_generation_prompt=True)

## Inferencia

In [56]:
def runInferenceBatch(modelId, userText, systemText=None, overrideGenKwargs=None, use4bit=False):
    """Run one non-streaming generation and collect timing/resource metrics.

    The model is loaded, the prompt is built from the model preset, and the
    pipeline is called once with the preset generation kwargs (optionally
    overridden).

    Args:
        modelId: Key into ``MODEL_PRESETS`` / Hugging Face model id.
        userText: User message.
        systemText: Optional system text forwarded to the prompt builder.
        overrideGenKwargs: Optional dict merged over the preset ``gen_kwargs``.
        use4bit: Forwarded to ``loadModel``.

    Returns:
        ``(text, metrics)``: the generated text and a dict with token counts,
        elapsed seconds, tokens/second, the sampling parameters used, and
        CPU/RAM/VRAM readings taken before and after generation.
    """
    gen, tok = loadModel(modelId, use4bit=use4bit)
    prompt = buildPrompt(modelId, userText, systemText, tokenizer=tok)

    genKwargs = MODEL_PRESETS[modelId]["gen_kwargs"].copy()
    if overrideGenKwargs:
        genKwargs.update(overrideGenKwargs)

    sys_before = getSystemUsage()
    gpu_before = getGpuUsage()
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    t0 = time.perf_counter()
    out = gen(prompt, **genKwargs)
    t1 = time.perf_counter()
    sys_after = getSystemUsage()
    gpu_after = getGpuUsage()

    text = out[0]["generated_text"]
    input_tokens = len(tok(prompt).input_ids)
    # Re-tokenizing generated text must not add BOS/EOS: those were never
    # generated and would inflate both the count and the tok/s figure.
    output_tokens = len(tok(text, add_special_tokens=False).input_ids)
    elapsed = round(t1 - t0, 3)

    metrics = {
        "modelo": modelId,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "tiempo_seg": elapsed,
        # max() guards against division by zero when elapsed rounds to 0.
        "tok_s": round(output_tokens / max(elapsed, 1e-6), 2),
        "temperature": genKwargs.get("temperature"),
        "max_new_tokens": genKwargs.get("max_new_tokens"),
        "top_p": genKwargs.get("top_p"),
        "top_k": genKwargs.get("top_k"),
        "repetition_penalty": genKwargs.get("repetition_penalty"),
        "cpu_before": sys_before["cpu_pct"], "cpu_after": sys_after["cpu_pct"],
        "ram_before_pct": sys_before["ram_pct"], "ram_after_pct": sys_after["ram_pct"],
        "vram_before_mb": gpu_before["vram_alloc_mb"], "vram_after_mb": gpu_after["vram_alloc_mb"]
    }
    return text, metrics

## Inferencia con streaming

In [57]:
def makeLiveTable():
    """Create the empty Rich table used to display live inference metrics.

    Returns:
        A ``Table`` titled "Inferencia (en vivo)" with the seven metric
        columns already added and no rows.
    """
    headers = ("Modelo", "Tokens out", "t (s)", "tok/s", "CPU %", "RAM %", "VRAM MB")
    liveTable = Table(title="Inferencia (en vivo)", expand=True)
    for header in headers:
        liveTable.add_column(header)
    return liveTable

In [58]:
def runInferenceStreaming(modelId, userText, systemText=None, overrideGenKwargs=None, use4bit=False, csvPath=None):
    """Run one streamed generation, showing live metrics, and collect final metrics.

    Generation runs in a background thread feeding a ``TextIteratorStreamer``;
    this thread consumes the text chunks and refreshes a Rich ``Live`` table
    with the running token count, elapsed time, throughput and resource usage.

    Args:
        modelId: Key into ``MODEL_PRESETS`` / Hugging Face model id.
        userText: User message.
        systemText: Optional system text forwarded to the prompt builder.
        overrideGenKwargs: Optional dict merged over the preset ``gen_kwargs``.
            Only do_sample, max_new_tokens, temperature, top_p, top_k and
            repetition_penalty are forwarded to ``generate``.
        use4bit: Forwarded to ``loadModel``.
        csvPath: When set, the metrics row is appended to this CSV file (the
            header is written only if the file does not exist yet).

    Returns:
        ``(text, metrics)``: the generated text and the metrics dict.

    Raises:
        Exception: whatever ``generate`` raised in the worker thread is
            re-raised here after the thread has been joined.
    """
    gen, tok = loadModel(modelId, use4bit=use4bit)
    prompt = buildPrompt(modelId, userText, systemText, tokenizer=tok)

    genKwargs = MODEL_PRESETS[modelId]["gen_kwargs"].copy()
    if overrideGenKwargs:
        genKwargs.update(overrideGenKwargs)

    inputs = tok(prompt, return_tensors="pt").to(gen.model.device)

    def _countTokens(text):
        # Count tokens of generated text only: no BOS/EOS, which were never
        # generated and would inflate the count and the tok/s figure.
        return len(tok(text, add_special_tokens=False).input_ids)

    sys_before = getSystemUsage()
    gpu_before = getGpuUsage()
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    t0 = time.perf_counter()

    streamer = TextIteratorStreamer(tok, skip_special_tokens=True, skip_prompt=True)
    workerErrors = []

    def _worker():
        try:
            gen.model.generate(
                **inputs,
                streamer=streamer,
                do_sample=genKwargs.get("do_sample", True),
                max_new_tokens=genKwargs.get("max_new_tokens", 200),
                temperature=genKwargs.get("temperature", 0.7),
                top_p=genKwargs.get("top_p", 0.9),
                top_k=genKwargs.get("top_k", 40),
                repetition_penalty=genKwargs.get("repetition_penalty", 1.0),
            )
        except Exception as exc:
            # If generate() dies, the streamer never receives its stop signal
            # and the consumer loop below would block forever. Record the
            # error and close the stream so the main thread can re-raise it.
            workerErrors.append(exc)
            streamer.end()

    thread = Thread(target=_worker)
    thread.start()

    partial = ""
    with Live(makeLiveTable(), refresh_per_second=4) as live:
        for chunk in streamer:
            partial += chunk

            elapsed = max(time.perf_counter() - t0, 1e-6)
            out_tok = _countTokens(partial)
            sys_now = getSystemUsage()
            gpu_now = getGpuUsage()

            # A Rich table keeps its cells inside the columns, so resetting
            # `table.rows` does not remove previous rows. Build a fresh
            # single-row table and swap it into the live display instead.
            table = makeLiveTable()
            table.add_row(
                modelId,
                str(out_tok),
                f"{elapsed:.2f}",
                f"{out_tok/elapsed:.2f}",
                f"{sys_now['cpu_pct']:.0f}",
                f"{sys_now['ram_pct']:.1f}",
                f"{gpu_now['vram_alloc_mb']:.0f}"
            )
            live.update(table)

    thread.join()
    t1 = time.perf_counter()

    if workerErrors:
        raise workerErrors[0]

    text = partial
    input_tokens = len(inputs.input_ids[0])
    output_tokens = _countTokens(text)
    elapsed = round(t1 - t0, 3)
    sys_after = getSystemUsage()
    gpu_after = getGpuUsage()

    metrics = {
        "modelo": modelId,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "tiempo_seg": elapsed,
        # max() guards against division by zero when elapsed rounds to 0.
        "tok_s": round(output_tokens / max(elapsed, 1e-6), 2),
        "temperature": genKwargs.get("temperature"),
        "max_new_tokens": genKwargs.get("max_new_tokens"),
        "top_p": genKwargs.get("top_p"),
        "top_k": genKwargs.get("top_k"),
        "repetition_penalty": genKwargs.get("repetition_penalty"),
        "cpu_before": sys_before["cpu_pct"], "cpu_after": sys_after["cpu_pct"],
        "ram_before_pct": sys_before["ram_pct"], "ram_after_pct": sys_after["ram_pct"],
        "vram_before_mb": gpu_before["vram_alloc_mb"], "vram_after_mb": gpu_after["vram_alloc_mb"]
    }

    if csvPath:
        df = pd.DataFrame([metrics])
        # Append; emit the header only when creating the file.
        df.to_csv(csvPath, mode="a", index=False, header=not os.path.exists(csvPath))

    return text, metrics

## Exportación a CSV

In [59]:
def benchmarkModels(models, prompt, systemText=None, runs=1, use4bit=True, csvPath="metrics.csv", override=None):
    """Run the streaming inference ``runs`` times for every model and gather metrics.

    Args:
        models: Iterable of model ids.
        prompt: User text sent to every model.
        systemText: Optional system text.
        runs: Number of repetitions per model.
        use4bit: Forwarded to the inference function.
        csvPath: CSV path forwarded to the inference function.
        override: Generation kwargs overriding each model's preset.

    Returns:
        A DataFrame with one metrics row per (model, run); each row carries a
        1-based "run" column added here after the inference call returns.
    """
    collected = []
    for modelId in models:
        for runNumber in range(1, runs + 1):
            print(f"\n=== {modelId} | run {runNumber}/{runs} ===")
            _, runMetrics = runInferenceStreaming(
                modelId,
                prompt,
                systemText=systemText,
                overrideGenKwargs=override,
                use4bit=use4bit,
                csvPath=csvPath,
            )
            runMetrics["run"] = runNumber
            collected.append(runMetrics)
    return pd.DataFrame(collected)

## Lectura de csv

In [60]:
def loadResults(csvPath="metrics.csv"):
    """Read the metrics CSV into a DataFrame.

    Args:
        csvPath: Path of the CSV file.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file does not exist.
    """
    if not os.path.exists(csvPath):
        return pd.DataFrame()
    return pd.read_csv(csvPath)

In [61]:
def plotQuick(df):
    """Draw two bar charts of per-model averages from a metrics DataFrame.

    The first chart shows the mean of "tiempo_seg" per "modelo", the second
    the mean of "tok_s" per "modelo". When ``df`` is empty a message is
    printed and nothing is plotted.

    Args:
        df: Metrics DataFrame with "modelo", "tiempo_seg" and "tok_s" columns.
    """
    if df.empty:
        print("No hay datos para graficar.")
        return

    # (metric column, chart title, y-axis label) for each chart, in display order.
    charts = (
        ("tiempo_seg", "Tiempo promedio por modelo (s)", "segundos"),
        ("tok_s", "Velocidad promedio (tok/s)", "tokens/segundo"),
    )
    for column, title, yLabel in charts:
        plt.figure()
        perModelMean = df.groupby("modelo")[column].mean()
        perModelMean.plot(kind="bar")
        plt.title(title)
        plt.ylabel(yLabel)
        plt.show()

## Pipeline