In [None]:
!pip install -U transformers
!pip install -q torch
!pip install -U accelerate
!pip install -q datasets
!pip install -U datasets
!pip install -q sentencepiece
!pip install -q pynvml psutil
!pip install -U bitsandbytes

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from datasets import load_dataset
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import psutil
import time
import threading
import numpy as np
from torch import nn
import os
from torch.utils.data import DataLoader


In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


In [None]:
from huggingface_hub import login

# Read the Hugging Face token from the HF_TOKEN environment variable instead of
# embedding it in the notebook. When the variable is unset, login() receives
# None and falls back to its interactive prompt.
hf_tokenn = os.environ.get("HF_TOKEN")
login(token=hf_tokenn)

model_name = 'meta-llama/Llama-3.2-1B'

# Load the pruned checkpoint with 8-bit bitsandbytes quantization. The setting is
# passed through quantization_config because the bare load_in_8bit=True keyword
# of from_pretrained is deprecated.
# To evaluate the unquantized checkpoint instead, drop quantization_config
# (optionally passing torch_dtype=torch.float16).
model = AutoModelForCausalLM.from_pretrained(
    "ItzGenes/pruned70-llama-1b-KD-Bueno",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map={"": "cuda"},
)
tokenizer = AutoTokenizer.from_pretrained("ItzGenes/pruned70-llama-1b-KD-Bueno")
model.eval()

In [None]:
# A model loaded through bitsandbytes with a device_map is already placed on its
# device, and some transformers releases raise ValueError when .to() is called on
# such a model. Only move the model when it is not bitsandbytes-quantized.
if not (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)):
    model.to(device)

In [None]:
# Look for the first module whose type name mentions "Linear8bitLt" and report
# it; if no module matches, say so.
first_8bit = next(
    (
        (name, module)
        for name, module in model.named_modules()
        if "Linear8bitLt" in str(type(module))
    ),
    None,
)
if first_8bit is None:
    print("No se encontraron módulos en 8 bits.")
else:
    name, module = first_8bit
    print(f"El módulo '{name}' está en 8 bits: {type(module)}")

# QUANT 4Bits

In [None]:
from huggingface_hub import login

# Read the Hugging Face token from the HF_TOKEN environment variable instead of
# embedding it in the notebook. When the variable is unset, login() receives
# None and falls back to its interactive prompt.
hf_tokenn = os.environ.get("HF_TOKEN")
login(token=hf_tokenn)

# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# for the quantized layers is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model_name = 'ItzGenes/pruned50-llama-1b-KD-Bueno'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": "cuda"}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

# PRUNING

In [None]:
def count_parameters(model):
    """Return the total number of elements across all parameters of `model`."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
# Baseline parameter count of the currently loaded `model`; a later cell
# subtracts the post-pruning count from it to report the savings.
# NOTE(review): `model` is whatever the most recently executed loading cell
# produced; if that is a bitsandbytes-quantized model, this count may not match
# the logical number of weights - confirm the intended checkpoint.
original_param_count = count_parameters(model)

In [None]:
def compute_neuron_pair_importance(gate_weight, up_weight):
    """
    Score each row (neuron) of a gate/up weight pair.

    For every row of a weight matrix the score is the row maximum plus the
    absolute value of the row minimum. The gate and up scores of the same row
    are then added together.

    Args:
    - gate_weight: 2-D weight tensor of gate_proj (reduced along dim=1).
    - up_weight: 2-D weight tensor of up_proj with the same number of rows.

    Returns:
    - importance_scores: 1-D tensor holding one score per row.
    """
    gate_max_abs = torch.max(gate_weight, dim=1).values + torch.abs(torch.min(gate_weight, dim=1).values)
    up_max_abs = torch.max(up_weight, dim=1).values + torch.abs(torch.min(up_weight, dim=1).values)
    importance_scores = gate_max_abs + up_max_abs
    return importance_scores


In [None]:
def update_model(model, prune_percent):
    """
    Prune the MLP block of every decoder layer of `model` in place.

    Each layer's gate_proj, up_proj and down_proj are replaced by the smaller
    layers returned by prune_neuron_pairs, and the model config is updated
    with the new intermediate size.

    Args:
    - model: Model to prune; its layers are read from model.model.layers.
    - prune_percent: Fraction of neurons to prune in each MLP.

    Returns:
    - model: The same model object, now pruned.
    """
    kept_sizes = []

    for layer in model.model.layers:
        # Only the MLP component of the decoder layer is touched.
        mlp = layer.mlp
        pruned_gate, pruned_up, pruned_down, kept = prune_neuron_pairs(mlp, prune_percent)

        # Swap the original projections for their pruned replacements.
        mlp.gate_proj, mlp.up_proj, mlp.down_proj = pruned_gate, pruned_up, pruned_down
        kept_sizes.append(kept)

    # The config records the size reported for the first layer
    # (None when the model has no layers).
    model.config.intermediate_size = kept_sizes[0] if kept_sizes else None

    return model


In [None]:
def prune_neuron_pairs(mlp, prune_percent):
    """
    Reduce the dimensions of the **gate_proj**, **up_proj** and **down_proj**
    layers by removing the least important neurons.

    Args:
    - mlp: MLP block exposing gate_proj, up_proj and down_proj linear layers.
    - prune_percent: Fraction of neurons to prune. Values that would remove
      every neuron are clamped so that at least one neuron is kept.

    Returns:
    - new_gate_proj, new_up_proj, new_down_proj: New pruned layers (no bias).
    - k: New intermediate size.

    Raises:
    - ValueError: If prune_percent is negative, or if no neuron would be kept.
    """
    # A negative fraction would ask topk for more neurons than exist.
    if prune_percent < 0:
        raise ValueError(f"prune_percent must be non-negative, got {prune_percent}.")

    gate_w = mlp.gate_proj.weight.data
    up_w = mlp.up_proj.weight.data
    down_w = mlp.down_proj.weight.data

    # Importance is computed in float32; higher scores are kept.
    importance_scores = compute_neuron_pair_importance(gate_w.float(), up_w.float())

    # Original number of neurons in the intermediate layer.
    original_intermediate_size = gate_w.size(0)
    # Number of neurons to prune, always leaving at least one.
    num_neuron_pairs_to_prune = min(int(prune_percent * original_intermediate_size), original_intermediate_size - 1)
    # Number of neurons to keep: the new intermediate size.
    k = original_intermediate_size - num_neuron_pairs_to_prune

    if k <= 0:
        raise ValueError(f"Invalid number of neuron pairs to keep: {k}. Adjust the prune_percent.")

    # Indices of the k most important neurons, restored to ascending order so
    # the kept rows preserve their original relative order.
    _, indices_to_keep = torch.topk(importance_scores, k, largest=True, sorted=True)
    indices_to_keep = indices_to_keep.sort().values

    # Allocate the replacement layers on the device of the weights they will
    # receive, rather than on the notebook-level `device` global.
    new_gate_proj = nn.Linear(mlp.gate_proj.in_features, k, bias=False, device=gate_w.device)
    new_up_proj = nn.Linear(mlp.up_proj.in_features, k, bias=False, device=up_w.device)
    new_down_proj = nn.Linear(k, mlp.down_proj.out_features, bias=False, device=down_w.device)

    # Copy the kept rows (gate/up) and the matching columns (down). Assigning
    # .data makes the new weights take the dtype of the original weights.
    new_gate_proj.weight.data = gate_w[indices_to_keep.to(gate_w.device), :]
    new_up_proj.weight.data = up_w[indices_to_keep.to(up_w.device), :]
    new_down_proj.weight.data = down_w[:, indices_to_keep.to(down_w.device)]

    return new_gate_proj, new_up_proj, new_down_proj, k

In [None]:
# Fraction of MLP neurons to remove from every layer.
prune_percent = 0.7  # Prune 70% of neurons
# NOTE(review): update_model slices mlp.*_proj.weight directly; if `model` was
# loaded by one of the bitsandbytes quantization cells, confirm that pruning is
# meant to run on an unquantized checkpoint instead.
model = update_model(model, prune_percent)

In [None]:
# Compare the current parameter count with the baseline stored in
# original_param_count and report the absolute and relative reduction.
pruned_param_count = count_parameters(model)
reduction_in_params = original_param_count - pruned_param_count
percentage_savings = (reduction_in_params / original_param_count) * 100

print(
    f"Pruned model parameters: {pruned_param_count}",
    f"Reduction in parameters: {reduction_in_params}",
    f"Percentage of weight savings: {percentage_savings:.2f}%",
    sep="\n",
)
torch.cuda.empty_cache()

In [None]:
# Print the module structure of the current `model`.
print(model)

In [None]:
new_model_name = 'pruned70-llama-1b-32'
output_dir = './'+new_model_name
# exist_ok=True replaces the separate existence check and does not fail when
# the directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Write the model weights/config and the tokenizer files into the same folder.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

In [None]:
# Upload the model and tokenizer to the same Hub repository. Both calls request
# a private repository so the visibility does not depend on which call happens
# to create the repo.
model.push_to_hub(new_model_name, private=True)
tokenizer.push_to_hub(new_model_name, private=True)

# TESTING


In [None]:
from itertools import islice

# Stream the English C4 validation split. "allenai/c4" is the dataset's Hub
# repository id; the bare "c4" id is the legacy script-based name.
dataset = load_dataset("allenai/c4", "en", split="validation", streaming=True)

# Materialize the first 3000 streamed examples. The previous loop broke only
# after appending the example with index 3000, so it collected 3001 examples.
small_subset = list(islice(dataset, 3000))

print(f"Usando {len(small_subset)} ejemplos en streaming.")

In [None]:

# Sliding-window settings read as globals by text_nll and accuracy_for_text.
# stride: step, in tokens, between the starts of consecutive windows.
stride = 512
# max_length: maximum number of tokens in one window passed to the model.
# Both functions also truncate each text to 5000 tokens, so with this value the
# first window already spans the whole (truncated) sequence.
max_length=5000

In [None]:
import math

def text_nll(text: str, max_tokens: int = 5000):
    """
    Sum the negative log-likelihood of `text` under the global `model`.

    The text is tokenized (truncated to `max_tokens` tokens) and scored with a
    sliding window of `max_length` tokens advanced by `stride` tokens (both read
    from the notebook globals). In each window, tokens already covered by the
    previous window are masked with -100 so they are scored only once.

    Args:
    - text: Raw text to score.
    - max_tokens: Truncation length passed to the tokenizer.

    Returns:
    - nll_sum: NLL summed over all scored tokens.
    - n_tokens: Number of tokens that contributed to nll_sum.
    """
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_tokens)
    input_ids = encodings["input_ids"].to(device)
    seq_len = input_ids.size(1)

    nll_sum = 0.0
    n_tokens = 0
    prev_end_loc = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Number of tokens in this window not covered by the previous one.
        trg_len = end_loc - prev_end_loc

        chunk_ids = input_ids[:, begin_loc:end_loc]
        target_ids = chunk_ids.clone()
        target_ids[:, :-trg_len] = -100

        # The causal-LM loss shifts the labels by one position internally, so the
        # label at position 0 never produces a loss term. Count exactly the
        # labels that survive that shift; the returned loss is their mean.
        num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

        # With no surviving label (e.g. a one-token text) the mean loss is
        # undefined (NaN), so such a window is skipped instead of evaluated.
        if num_loss_tokens > 0:
            with torch.no_grad():
                neg_log_likelihood = model(chunk_ids, labels=target_ids).loss

            # Mean loss times token count gives this window's summed NLL.
            nll_sum += neg_log_likelihood.item() * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    return nll_sum, n_tokens

total_nll = 0.0
total_tokens = 0

for ex in tqdm(small_subset, desc="Evaluando subset"):
    text = ex["text"]
    nll, toks = text_nll(text)
    torch.cuda.empty_cache()
    if math.isnan(nll):
        # Report the offending text and leave it out of the totals: adding a
        # single NaN would turn the accumulated NLL (and the perplexity) into NaN.
        print(f"NLL es NaN — texto: {repr(text[:200])}")
        continue
    total_nll += nll
    total_tokens += toks


if total_tokens == 0:
    raise ValueError("No se evaluaron tokens válidos. Revisa el subset o la lógica del procesamiento.")
print(total_nll)
# Corpus-level perplexity: exp of the token-weighted mean NLL.
avg_nll = total_nll / total_tokens
perplexity = torch.exp(torch.tensor(avg_nll))

print(f"\n=== RESULTADOS ===")
print(f"Tokens totales evaluados: {total_tokens}")
print(f"Pérdida media por token (NLL): {avg_nll:.4f}")
print(f"Perplexity global: {perplexity:.2f}")

In [None]:

def accuracy_for_text(text: str):
    """
    Count correct greedy next-token predictions of the global `model` on `text`.

    The text is tokenized (truncated to 5000 tokens) and processed with a
    sliding window of `max_length` tokens advanced by `stride` tokens (notebook
    globals). Tokens already covered by a previous window are masked with -100
    so each target is scored once.

    Args:
    - text: Raw text to evaluate.

    Returns:
    - n_tokens: Number of target tokens that were scored.
    - n_correct: Number of those targets matched by the argmax prediction.
    """
    with torch.no_grad():
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=5000)
        input_ids = enc.input_ids.to(device)
        seq_len = input_ids.size(1)

        n_tokens = 0
        n_correct = 0
        prev_end = 0

        for start in range(0, seq_len, stride):
            end = min(start + max_length, seq_len)
            trg_len = end - prev_end

            chunk = input_ids[:, start:end]
            target = chunk.clone()
            target[:, :-trg_len] = -100

            # Only the logits are needed, so no labels are passed to the model.
            logits = model(chunk).logits

            # The logits at position i predict the token at position i + 1:
            # drop the last prediction and the first target so that each
            # prediction is compared with the token that follows it.
            preds = torch.argmax(logits[:, :-1, :], dim=-1)
            shifted_target = target[:, 1:]
            mask = shifted_target != -100
            correct = (preds == shifted_target) & mask

            n_correct += correct.sum().item()
            n_tokens += mask.sum().item()

            prev_end = end
            if end == seq_len:
                break

    return n_tokens, n_correct

total_tokens = 0
total_correct = 0

for ex in tqdm(small_subset, desc="Calculando accuracy"):
    text = ex["text"]
    toks, correct = accuracy_for_text(text)
    total_tokens += toks
    total_correct += correct
    torch.cuda.empty_cache()


# Same guard as the perplexity cell: fail with a clear message instead of a
# ZeroDivisionError when nothing was scored.
if total_tokens == 0:
    raise ValueError("No se evaluaron tokens válidos. Revisa el subset o la lógica del procesamiento.")
accuracy = total_correct / total_tokens
print(f"\n=== RESULTADO ===")
print(f"Tokens evaluados: {total_tokens}")
print(f"Accuracy por token: {accuracy:.4%}")

In [None]:
#model.to(device)

@torch.no_grad()
def topk_accuracy(text, k=5, max_length=2048, stride=512):
    """
    Count how often the true next token is among the model's top-k predictions.

    The text is tokenized and processed with a sliding window of `max_length`
    tokens advanced by `stride` tokens. Each target token is scored exactly
    once: in windows after the first, only the targets lying beyond the end of
    the previous window are counted.

    Args:
    - text: Raw text to evaluate.
    - k: Number of highest-scoring candidates considered a hit.
    - max_length: Window length in tokens.
    - stride: Step, in tokens, between the starts of consecutive windows.

    Returns:
    - correct: Number of targets found among the top-k predictions.
    - total: Number of targets scored.
    """
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=None)
    input_ids = enc.input_ids.to(device)
    seq_len = input_ids.size(1)

    correct = 0
    total = 0
    prev_end = 0

    for start in range(0, seq_len - 1, stride):
        end = min(start + max_length, seq_len)
        input_chunk = input_ids[:, start:end]
        # Next-token setup: the inputs drop the last token, the labels the first.
        labels = input_chunk[:, 1:]
        inputs = input_chunk[:, :-1]

        # Targets at absolute positions below prev_end were already scored by
        # the previous window; only the trailing n_new targets are new here.
        n_new = end - max(prev_end, start + 1)
        if n_new > 0:
            logits = model(inputs).logits[:, -n_new:, :]
            topk = torch.topk(logits, k=k, dim=-1).indices
            match = (topk == labels[:, -n_new:].unsqueeze(-1)).any(dim=-1)
            correct += match.sum().item()
            total += match.numel()

        prev_end = end
        if end == seq_len:
            break

    return correct, total

total_correct = 0
total_count = 0

for ex in tqdm(small_subset, desc="Evaluando Top-5 accuracy"):
    text = ex["text"]
    correct, total = topk_accuracy(text, k=5)
    total_correct += correct
    total_count += total

# Fail with a clear message instead of a ZeroDivisionError when nothing was scored.
if total_count == 0:
    raise ValueError("No se evaluaron tokens válidos. Revisa el subset o la lógica del procesamiento.")
top5_acc = total_correct / total_count
print(f"\nTop-5 Accuracy: {top5_acc:.4%} ({total_correct}/{total_count})")

Uso de Memoria

In [None]:
from transformers.utils import logging
# Restrict transformers logging to error-level messages.
logging.set_verbosity_error()
#model.to(device)
# Number of new tokens generated per prompt; read as a global by memory_for_prompt.
max_new_tokens = 50
# Reuse the EOS token as the padding token; memory_for_prompt calls the
# tokenizer with padding=True.
# NOTE(review): presumably the tokenizer defines no pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

@torch.no_grad()
def memory_for_prompt(text: str) -> float:
    """
    Return the peak CUDA memory allocated while generating from `text`.

    The prompt is tokenized (truncated to 2048 tokens), the CUDA peak-memory
    counter for `device` is reset, and the global `model` greedily generates
    `max_new_tokens` tokens. The value returned is the peak number of allocated
    bytes reported afterwards, divided by 1e9.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
        padding=True
    ).to(device)

    # Reset right before generating so the peak covers the generate call only
    # (allocations that are still alive, such as the model, remain counted).
    torch.cuda.reset_peak_memory_stats(device)

    model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    return torch.cuda.max_memory_allocated(device) / 1e9

# Peak memory (GB) measured for each example of the subset.
memory_peaks = []
for ex in tqdm(small_subset, desc="Midiendo memoria por ejemplo"):
    memory_peaks.append(memory_for_prompt(ex["text"]))
    torch.cuda.empty_cache()

# Summary statistics over all examples.
peak_total = max(memory_peaks)
mean_peak, std_peak = float(np.mean(memory_peaks)), float(np.std(memory_peaks))

# Report the results.
print(f"\n=== RESULTADO MEMORIA ===")
print(f"Pico total observado:     {peak_total:.3f} GB")
print(f"Media por ejemplo:        {mean_peak:.3f} GB ± {std_peak:.3f} GB (σ)")
print(memory_peaks)

Medir latencia + TTFT y TPOT (Time To First Token y Time Per Output Token)

In [None]:
import time
from transformers.utils import logging

# Number of new tokens generated in the full-latency run; read as a global by
# timing_for_prompt.
max_new_tokens = 50
# Restrict transformers logging to error-level messages.
logging.set_verbosity_error()

def timing_for_prompt(text: str):
    """
    Measure generation latency of the global `model` for one prompt.

    Two greedy generations are timed with time.perf_counter: one limited to a
    single new token (time to first token) and one with `max_new_tokens` new
    tokens (total latency).

    Args:
    - text: Prompt text; it is truncated to 2048 tokens.

    Returns:
    - ttft: Seconds taken by the one-token generation.
    - total_latency: Seconds taken by the full generation.
    - tpot: (total_latency - ttft) / (generated tokens - 1), or 0.0 when at
      most one token was generated.
    """
    def _sync():
        # Wait for pending GPU work so the timestamps include it. On a CPU
        # device there is nothing to wait for, and calling
        # torch.cuda.synchronize with a CPU device would fail.
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048)
    input_ids = enc.input_ids.to(device)

    # Time to first token: a separate run that generates exactly one token.
    _sync()
    start_ttft = time.perf_counter()
    with torch.no_grad():
        _ = model.generate(
            input_ids,
            max_new_tokens=1,
            do_sample=False,
        )
    _sync()
    ttft = time.perf_counter() - start_ttft

    # Total latency: a fresh run that generates up to max_new_tokens tokens.
    _sync()
    start_total = time.perf_counter()
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    _sync()
    total_latency = time.perf_counter() - start_total

    # Time per output token, excluding the first token.
    n_generated = output.size(-1) - input_ids.size(-1)
    if n_generated > 1:
        tpot = (total_latency - ttft) / (n_generated - 1)
    else:
        tpot = 0.0

    return ttft, total_latency, tpot

# Per-example measurements, one list per metric.
ttfts, totals, tpots = [], [], []

# Time every example of the subset.
for ex in tqdm(small_subset, desc="Midiendo latencias por ejemplo"):
    ttft, total, tpot = timing_for_prompt(ex["text"])
    for bucket, value in ((ttfts, ttft), (totals, total), (tpots, tpot)):
        bucket.append(value)
    torch.cuda.empty_cache()

metrics = dict(zip(
    ("TTFT (s)", "Latencia total (s)", "TPOT (s/token)"),
    (ttfts, totals, tpots),
))

# Print the maximum, mean and standard deviation of each metric.
for name, vals in metrics.items():
    peak, mean, std = max(vals), float(np.mean(vals)), float(np.std(vals))
    print(f"\n=== {name} ===")
    print(f"Peak : {peak:.3f} s")
    print(f"Media: {mean:.3f} s ± {std:.3f} s (σ)")

In [None]:

# Dump the raw per-example total latencies (seconds) collected by the latency cell.
print(totals)



In [None]:
from bitsandbytes.nn import Linear4bit

# Report whether any module of the current `model` is a bitsandbytes Linear4bit layer.
print("¿Usa 4-bit?", any(isinstance(m, Linear4bit) for m in model.modules()))