In [None]:
!pip install -q transformers
!pip install -q torch
!pip install -q datasets
!pip install -q sentencepiece  # Required for LLaMA tokenizer

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn
from torch.utils.data import DataLoader
import os
from tqdm import tqdm

In [None]:
# Pick the compute device: CUDA when PyTorch can see a usable GPU, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
from huggingface_hub import login

# Read the Hugging Face access token from the environment instead of writing
# it into the notebook: notebooks get shared and committed, which would leak
# the secret. When HF_TOKEN is not set, `login(None)` falls back to the
# interactive login prompt.
hf_tokenn = os.environ.get("HF_TOKEN")
login(hf_tokenn)

# Teacher model for the distillation below, loaded in half precision.
model_name = 'meta-llama/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: the pad token is assigned in a later cell
# (tokenizer.pad_token = tokenizer.eos_token), not here.

In [None]:
# NOTE(review): this name is assigned but not passed to the loader below, which
# uses the Hub repository id instead -- confirm whether it is still needed.
student_model_name = "./pruned70-llama-1b-32"

# Student model: loaded from the "ItzGenes/pruned70-llama-1b-32" repository with
# the library's default dtype (no torch_dtype is given, unlike the teacher).
student_model =  AutoModelForCausalLM.from_pretrained("ItzGenes/pruned70-llama-1b-32")

In [None]:
!pip install -U datasets


In [None]:
from datasets import load_dataset
from datasets import Dataset
import shutil
import os
from itertools import islice
# Number of C4 documents used for distillation; raise it for a longer run.
NUM_TRAIN_SAMPLES = 3000

# Stream the English C4 split so that only the examples actually consumed are
# downloaded. The dataset lives under the "allenai/c4" repository; the bare
# legacy "c4" identifier is deprecated.
c4_stream = load_dataset("allenai/c4", "en", split="train", streaming=True)

# Materialize the first NUM_TRAIN_SAMPLES streamed records in memory.
c4_samples = list(islice(c4_stream, NUM_TRAIN_SAMPLES))

# Wrap the records in a regular (map-style) Dataset so that `.map`, `len()`
# and shuffling in a DataLoader are available downstream.
dataset = Dataset.from_list(c4_samples)

In [None]:
# Reuse the end-of-sequence token as the padding token so that the tokenizer
# can pad (the tokenize functions below request padding="max_length").
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field and build causal-LM labels.

    Every sequence is padded/truncated to exactly 128 tokens.

    Args:
        examples: Mapping with a "text" entry holding either a list of strings
            (the batched form used by ``dataset.map(..., batched=True)``) or a
            single string.

    Returns:
        dict with "input_ids", "attention_mask" and "labels". Labels equal the
        input ids, except that padded positions (attention mask 0) are set to
        -100, the index ignored by PyTorch's cross-entropy loss. Because the
        pad token is the EOS token, copying the ids verbatim would make a
        language-modeling loss train on padding.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    def _mask_padding(ids, mask):
        # Keep real tokens, replace padded positions by the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if isinstance(examples["text"], str):
        # Single example: the tokenizer returns flat lists.
        labels = _mask_padding(input_ids, attention_mask)
    else:
        # Batch of examples: one list per example.
        labels = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# Run the tokenizer over the whole dataset, batch by batch, with a progress bar.
print("Tokenizing dataset...")
tokenized_datasets = dataset.map(
    tokenize_function,
    desc="Processing examples",
    batched=True,
    batch_size=32,
    # Drop the original columns so only the tokenizer outputs remain.
    remove_columns=dataset.column_names,
    # Recompute every time instead of reusing a cached result.
    load_from_cache_file=False,
)

In [None]:

from transformers import DataCollatorWithPadding

# Collator that assembles a list of tokenized examples into one tensor batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffled mini-batches of 4 examples each.
dataloader = DataLoader(
    dataset=tokenized_datasets,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=4,
)

In [None]:
# Re-select the compute device (same rule as the earlier device cell) and move
# both networks onto it so that they can consume the same input batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
student_model.to(device)

In [None]:
# Put the teacher in evaluation mode; it is only used for inference
# (its forward pass runs under torch.no_grad() in the training cell).
model.eval()

In [None]:

from torch.optim import AdamW
# Only the student's parameters are optimized.
optimizer = AdamW(student_model.parameters(), lr=1e-5)


# NOTE: the training cell further down reassigns these values
# (it sets num_epochs = 5), so the ones here are overridden when it runs.
num_epochs = 10
# Softmax temperature: both teacher and student logits are divided by it.
temperature = 2.0
# NOTE(review): `alpha` is not referenced by the training loop visible in this
# notebook -- presumably a distillation/hard-label mixing weight; confirm.
alpha = 1

# Number of mini-batches whose gradients are accumulated per optimizer step.
accumulation_steps = 8

In [None]:
# Release the cached blocks held by PyTorch's CUDA allocator, then report the
# memory currently allocated, converted from bytes to MB (1e6 bytes).
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated() / 1e6, "MB")

In [None]:
# For KD, the tokenization / dataloader / optimizer cells above do not actually
# need to be run: their code is integrated (repeated) in this cell. They are
# duplicated above because of errors encountered earlier and for assorted
# experiments.
# NOTE(review): this cell still relies on `dataset`, `tokenizer`, `model`,
# `student_model` and `device` being defined by earlier cells.
def tokenize_function(examples):
    """Tokenize a batch of "text" entries into tensors and build causal-LM labels.

    Every sequence is padded/truncated to exactly 128 tokens and returned as
    PyTorch tensors (``return_tensors="pt"``).

    Args:
        examples: Mapping with a "text" entry, as passed by
            ``dataset.map(..., batched=True)``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels". Labels are a
        copy of the input ids in which padded positions (attention mask 0) are
        set to -100, the index ignored by PyTorch's cross-entropy loss. Because
        the pad token is the EOS token, copying the ids verbatim would make a
        language-modeling loss train on padding.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Clone so that masking the labels does not modify the input ids.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# Run the tokenizer over the whole dataset, batch by batch, with a progress bar.
print("Tokenizing dataset...")
tokenized_datasets = dataset.map(
    tokenize_function,
    desc="Processing examples",
    batched=True,
    batch_size=32,
    # Drop the original columns so only the tokenizer outputs remain.
    remove_columns=dataset.column_names,
    # Recompute every time instead of reusing a cached result.
    load_from_cache_file=False,
)

from transformers import DataCollatorWithPadding


# Collator that assembles a list of tokenized examples into one tensor batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffled mini-batches of 4 examples each.
dataloader = DataLoader(
    dataset=tokenized_datasets,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=4,
)


# Teacher in evaluation mode: it is only used for inference in the loop below.
model.eval()


from torch.optim import AdamW
import torch.nn.functional as F

# Only the student's parameters are optimized.
optimizer = AdamW(student_model.parameters(), lr=1e-5)

# Number of full passes over the dataloader.
num_epochs = 5
# Softmax temperature: both teacher and student logits are divided by it.
temperature = 2.0
# NOTE(review): `alpha` is not referenced by the training loop in this cell --
# presumably a distillation/hard-label mixing weight; confirm.
alpha = 1

# Number of mini-batches whose gradients are accumulated per optimizer step.
accumulation_steps = 8

# Knowledge-distillation loop: the frozen teacher (`model`) produces
# temperature-softened token distributions and the student is trained to match
# them by minimizing the per-token KL divergence.

# Start from clean gradients even if this cell is re-run after an interruption.
optimizer.zero_grad()

for epoch in range(num_epochs):

    student_model.train()

    total_loss = 0.0

    for batch_idx, batch in enumerate(dataloader):

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # The teacher is not trained, so no autograd graph is needed for it.
        with torch.no_grad():
            teacher_outputs = model(
                input_ids,
                attention_mask=attention_mask
            )
            # Upcast before the softmax: the teacher is loaded in float16
            # earlier in this notebook, and a half-precision softmax over the
            # vocabulary loses accuracy and mismatches the student's dtype.
            teacher_logits = teacher_outputs.logits.float() / temperature
            teacher_probs = F.softmax(teacher_logits, dim=-1)

        student_outputs = student_model(
            input_ids,
            attention_mask=attention_mask
        )
        student_logits = student_outputs.logits.float()
        student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)

        # KL(teacher || student) for every token position: element-wise terms
        # summed over the vocabulary axis.
        token_kl = F.kl_div(student_log_probs, teacher_probs, reduction='none').sum(dim=-1)

        # Average over real tokens only. Sequences are padded to a fixed
        # length, so without the mask the loss would largely measure how well
        # the student imitates the teacher on padding.
        valid_tokens = attention_mask.to(token_kl.dtype)
        loss = (token_kl * valid_tokens).sum() / valid_tokens.sum().clamp(min=1)

        # Scale only the gradient contribution so that the accumulated
        # gradient is the mean over the accumulation window; `loss` itself
        # stays unscaled for logging.
        (loss / accumulation_steps).backward()

        # Step every `accumulation_steps` batches, and on the final batch so
        # that a trailing partial window is not dropped.
        if ((batch_idx + 1) % accumulation_steps == 0) or (batch_idx + 1 == len(dataloader)):
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()

        if (batch_idx + 1) % 100 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}, Loss: {loss.item():.4f}")

    # Guard against an empty dataloader.
    avg_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Save the distilled model locally and upload it to the Hugging Face Hub (done in the cells below).

In [None]:
# Save the trained student and the tokenizer side by side in one local
# directory, so that the directory can be loaded with `from_pretrained`.
new_model_name = 'pruned70-llama-1b-KD-Bueno'
output_dir = './'+new_model_name
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

student_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

In [None]:
# Upload the student model's weights and config to the Hub repository named
# `new_model_name`, requesting a private repository.
student_model.push_to_hub(new_model_name, private=True)

In [None]:
# Upload the tokenizer files to the same Hub repository as the model.
# private=True matches the model upload, so the repository is not created as
# public if this cell happens to run before (or without) the model upload.
tokenizer.push_to_hub(new_model_name, private=True)