In [1]:
# %pip install transformers datasets torch scikit-learn
# %pip install transformers
# %pip install sentencepiece
# %pip install safetensors
# %pip install datasets
# pip install evaluate


### Get Test-Train Data

In [2]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv("sst2_train.csv")
test_df = pd.read_csv("sst2_test.csv")
val_df = pd.read_csv("sst2_dev.csv")

# Display samples from the training dataset
print(train_df.head())

                                            sentence  label
0  The Rock is destined to be the 21st Century 's...      3
1  The gorgeously elaborate continuation of `` Th...      4
2  Singer\/composer Bryan Adams contributes a sle...      3
3  You 'd think by now America would have had eno...      2
4               Yet the act is still charming here .      3


In [None]:
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize datasets
def tokenize_data(df):
    return tokenizer(
        df["sentence"].tolist(),
        padding="max_length",  # Pad to max sequence length
        truncation=True,       # Truncate sequences longer than max length
        max_length=128,        # Set max length to 128
        return_tensors="pt"    # Return PyTorch tensors
    )

# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token
# # Add a custom pad token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})


train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)
test_encodings = tokenize_data(test_df)


In [4]:
import torch

class SST2Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create PyTorch datasets
train_dataset = SST2Dataset(train_encodings, train_df["label"].tolist())
val_dataset = SST2Dataset(val_encodings, val_df["label"].tolist())
test_dataset = SST2Dataset(test_encodings, test_df["label"].tolist())


### Accessing Llama3.2-1B Model 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification

# Load the tokenizer and model
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,  # Adjust for your classification task
    torch_dtype=torch.float16,  # Mixed precision
    device_map="auto"           # Distributes the model across available GPUs
    # device_map="cpu"           # Distributes the model across available GPUs

)

model.resize_token_embeddings(len(tokenizer))  # Ensure the model handles new tokens

# Add padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-1B', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|reserved_special_token_2|>", rst

In [6]:
# # Prepare input text
# input_text = "The key to life is"
# inputs = tokenizer(input_text, return_tensors="pt", padding=True).to("cuda")  # Ensure tensors are on the correct device

# # Generate outputs
# outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)
# decoded_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(decoded_outputs)

### No. of Parameters~

In [7]:
# Calculate the total number of parameters
total_params = sum(p.numel() for p in model.parameters())

# Print results
print(f"Total parameters in {model_name}: {total_params / 1e6:.2f}M ({total_params} parameters)")

# Expected parameter count (from the paper)
reported_params = 1e9  # 1 billion parameters

# Compare the calculated count with the reported count
if total_params == reported_params:
    print("The calculated parameters match the reported parameters in the paper.")
else:
    print("The calculated parameters DO NOT match the reported parameters in the paper.")

Total parameters in meta-llama/Llama-3.2-1B: 1235.82M (1235824640 parameters)
The calculated parameters DO NOT match the reported parameters in the paper.


### Moddel Finetunning For Classification: SST-2

In [None]:
from torch.utils.data import DataLoader
from transformers import get_scheduler


In [None]:
batch_size = 8

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [10]:

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Scheduler
num_training_steps = len(train_dataloader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [11]:
from tqdm import tqdm
import evaluate

# Metric
metric = evaluate.load("accuracy")

# Training loop
epochs = 3
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training phase
    model.train()
    train_loss = 0

    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        print("in train, ")
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Evaluation phase
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            print("in validate, ")

            # Forward pass
            outputs = model(**batch)
            val_loss += outputs.loss.item()

            # Compute metrics
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = metric.compute()["accuracy"]

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/3


  0%|          | 0/508 [00:00<?, ?it/s]

in train, 


  0%|          | 0/508 [00:00<?, ?it/s]


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

In [None]:
model.save_pretrained("./sst2_fine_tuned_model")
tokenizer.save_pretrained("./sst2_fine_tuned_tokenizer")


In [None]:
from transformers import pipeline

# Load the fine-tuned model
sentiment_pipeline = pipeline("text-classification", model="./sst2_fine_tuned_model", tokenizer=tokenizer)

# Test on new sentences
test_sentences = ["I absolutely loved the movie!", "The food was horrible."]
predictions = sentiment_pipeline(test_sentences)

print(predictions)
