In [None]:
# !pip install -U "huggingface_hub[cli]"

In [None]:
# !huggingface-cli login

In [None]:
# !pip install transformers datasets torch scikit-learn
# !pip install transformers
# !pip install sentencepiece
# !pip install safetensors
# !pip install datasets
# !pip install evaluate


### Get Test-Train Data

In [None]:
import pandas as pd

# Read the three CSV splits from the working directory, in train / test / dev order.
# NOTE(review): the files are named "sst2", but the sample printed below shows
# label values such as 2, 3 and 4 -- presumably fine-grained (5-way) labels; confirm.
train_df, test_df, val_df = map(
    pd.read_csv,
    ("sst2_train.csv", "sst2_test.csv", "sst2_dev.csv"),
)

# Peek at the first rows of the training split.
print(train_df.head())

                                            sentence  label
0  The Rock is destined to be the 21st Century 's...      3
1  The gorgeously elaborate continuation of `` Th...      4
2  Singer\/composer Bryan Adams contributes a sle...      3
3  You 'd think by now America would have had eno...      2
4               Yet the act is still charming here .      3


In [None]:
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token. This must be in place
# before any batch is padded, i.e. before tokenize_data is first called.
# (Alternative not used here: registering a dedicated '[PAD]' special token.)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_data(df):
    """Tokenize the ``sentence`` column of ``df`` with the module-level tokenizer.

    Every sentence is padded to, and truncated at, 128 tokens, and the result
    is returned as PyTorch tensors.

    Args:
        df: DataFrame with a ``sentence`` column of strings.

    Returns:
        The tokenizer's output for the whole column.
    """
    sentences = df["sentence"].tolist()
    tokenizer_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(sentences, **tokenizer_kwargs)


# Encode each split once, up front (train, then validation, then test).
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split_df) for split_df in (train_df, val_df, test_df)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch

class SST2Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-computed encodings with labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per sample (it must provide ``.items()``).
        labels: Sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of fields plus a ``labels`` tensor."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = column[idx]
        # The label is stored under the key "labels" as a tensor.
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its "label" column as a dataset.
train_dataset = SST2Dataset(
    encodings=train_encodings,
    labels=train_df["label"].tolist(),
)
val_dataset = SST2Dataset(
    encodings=val_encodings,
    labels=val_df["label"].tolist(),
)
test_dataset = SST2Dataset(
    encodings=test_encodings,
    labels=test_df["label"].tolist(),
)


### Accessing Llama3.2-1B Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification

# Load the sequence-classification model (the tokenizer comes from the earlier cell).
model_name = "meta-llama/Llama-3.2-1B"

# Choose the weight dtype. Loading the weights as float16 is NOT mixed precision:
# the optimizer then updates half-precision weights directly, which overflows
# easily and is consistent with the loss collapsing to 0.0 / nan in the logged run.
# bfloat16 keeps float32's exponent range at the same memory cost as float16;
# float32 is the fallback when bfloat16 is unavailable (it needs roughly twice
# the memory for weights, gradients and optimizer state).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
model_dtype = torch.bfloat16 if use_bf16 else torch.float32

# device_map is intentionally not passed: the model is placed on a single
# device explicitly at the end of this cell.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,  # five-way classification head
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True,
)

# The tokenizer's pad token aliases EOS and no tokens were added, so this is
# expected to leave the embedding size unchanged; it is kept as a guard in
# case a dedicated pad token is registered later.
model.resize_token_embeddings(len(tokenizer))

# Put the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
   

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# tokenizer

In [None]:
# # Prepare input text
# input_text = "The key to life is"
# inputs = tokenizer(input_text, return_tensors="pt", padding=True).to("cuda")  # Ensure tensors are on the correct device

# # Generate outputs
# outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)
# decoded_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(decoded_outputs)

### No. of Parameters

In [None]:
# Count every parameter of the loaded model. This includes the freshly
# initialised classification head (score.weight) reported when loading.
total_params = sum(p.numel() for p in model.parameters())

# Print results
print(f"Total parameters in {model_name}: {total_params / 1e6:.2f}M ({total_params} parameters)")

# "1B" is a rounded name, so an exact equality check against 1e9 can never
# succeed. Compare against the figure on the Llama 3.2 model card (1.23B)
# and allow a small relative tolerance for rounding and the extra head.
reported_params = 1.23e9
relative_tolerance = 0.01  # accept up to 1% difference

relative_gap = abs(total_params - reported_params) / reported_params
if relative_gap <= relative_tolerance:
    print("The calculated parameters match the reported parameters in the paper.")
else:
    print("The calculated parameters DO NOT match the reported parameters in the paper.")

Total parameters in meta-llama/Llama-3.2-1B: 1235.82M (1235824640 parameters)
The calculated parameters DO NOT match the reported parameters in the paper.


### Model Fine-tuning for Classification: SST-2

In [None]:
from torch.utils.data import DataLoader
from transformers import get_scheduler


In [None]:
# Mini-batch size shared by both loaders.
batch_size = 16

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:

# AdamW over all model parameters.
# NOTE: the Trainer cell further down is not given this optimizer or scheduler,
# so they only matter for the manual training loop (currently commented out).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Linear schedule without warm-up, spanning 3 epochs of training batches.
num_training_steps = 3 * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Copy the tokenizer's padding token id into the model config. Per the
# Transformers documentation, the sequence-classification head relies on
# pad_token_id to locate the last non-padding token of each sequence when a
# batch contains more than one example.
model.config.pad_token_id = tokenizer.pad_token_id


In [None]:
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: The object the Trainer passes to ``compute_metrics``; its
            ``predictions`` hold the logits and ``label_ids`` the gold labels.

    Returns:
        dict with a single ``"accuracy"`` entry (fraction of correct argmax
        predictions).
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_classes = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_classes == eval_pred.label_ids).mean())}


# Evaluate and checkpoint once per epoch. The learning rate is 2e-5, matching
# the optimizer cell above; the previous 2e-4 is ten times larger and, for
# full fine-tuning of every weight, a likely contributor to the diverged run
# (training loss 0.0, evaluation loss nan).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# The Trainer builds its own optimizer and scheduler from training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # report accuracy alongside the loss
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjenil-patel910[0m ([33mjenil-patel910-indian-institute-of-technology-gandhinagar[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.7339,
2,0.0,
3,0.0,


TrainOutput(global_step=1524, training_loss=0.5688745693897638, metrics={'train_runtime': 1631.1166, 'train_samples_per_second': 14.929, 'train_steps_per_second': 0.934, 'total_flos': 1.8199549691559936e+16, 'train_loss': 0.5688745693897638, 'epoch': 3.0})

In [None]:
# Run the Trainer's evaluation pass (on the eval_dataset it was constructed
# with) and print the returned metrics dictionary.
# NOTE(review): the recorded output below shows eval_loss = nan, so the
# recorded run had already diverged by this point.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluation results: {'eval_loss': nan, 'eval_runtime': 10.6763, 'eval_samples_per_second': 97.786, 'eval_steps_per_second': 6.182, 'epoch': 3.0}


In [None]:
# from tqdm import tqdm
# import evaluate

# # Metric
# metric = evaluate.load("accuracy")

# # Training loop
# epochs = 3
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# # device = torch.device("cpu")

# for epoch in range(epochs):
#     print(f"Epoch {epoch + 1}/{epochs}")

#     # Training phase
#     model.train()
#     train_loss = 0

#     for batch in tqdm(train_dataloader):
#         batch = {k: v.to(device) for k, v in batch.items()}
#         # print("in train, ")
#         # Forward pass
#         outputs = model(**batch)
#         loss = outputs.loss
#         train_loss += loss.item()

#         # Backward pass
#         loss.backward()
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()

#     avg_train_loss = train_loss / len(train_dataloader)
#     print(f"Training Loss: {avg_train_loss:.4f}")

#     # Evaluation phase
#     model.eval()
#     val_loss = 0

#     with torch.no_grad():
#         for batch in val_dataloader:
#             batch = {k: v.to(device) for k, v in batch.items()}
#             # print("in validate, ")

#             # Forward pass
#             outputs = model(**batch)
#             val_loss += outputs.loss.item()

#             # Compute metrics
#             logits = outputs.logits
#             predictions = torch.argmax(logits, dim=-1)
#             metric.add_batch(predictions=predictions, references=batch["labels"])

#     avg_val_loss = val_loss / len(val_dataloader)
#     val_accuracy = metric.compute()["accuracy"]

#     print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


In [None]:
# Persist the fine-tuned model and the tokenizer to two separate directories
# under the current working directory. The value displayed by this cell is
# the tuple of file paths returned by the tokenizer's save call.
model.save_pretrained("./sst2_fine_tuned_model")
tokenizer.save_pretrained("./sst2_fine_tuned_tokenizer")


('./sst2_fine_tuned_tokenizer/tokenizer_config.json',
 './sst2_fine_tuned_tokenizer/special_tokens_map.json',
 './sst2_fine_tuned_tokenizer/tokenizer.json')

In [None]:
# from transformers import pipeline

# # Load the fine-tuned model
# sentiment_pipeline = pipeline("text-classification", model="./sst2_fine_tuned_model", tokenizer=tokenizer)

# # Test on new sentences
# test_sentences = ["I absolutely loved the movie!", "The food was horrible."]
# predictions = sentiment_pipeline(test_sentences)

# print(predictions)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_0', 'score': nan}, {'label': 'LABEL_0', 'score': nan}]


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so artifacts can be copied there (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Archive the Trainer output directory (the checkpoints under results/) into data.zip.
!zip -r data.zip results/

  adding: results/ (stored 0%)
  adding: results/checkpoint-508/ (stored 0%)
  adding: results/checkpoint-508/optimizer.pt (deflated 100%)
  adding: results/checkpoint-508/training_args.bin (deflated 51%)
  adding: results/checkpoint-508/model.safetensors (deflated 100%)
  adding: results/checkpoint-508/trainer_state.json (deflated 56%)
  adding: results/checkpoint-508/rng_state.pth (deflated 25%)
  adding: results/checkpoint-508/scheduler.pt (deflated 55%)
  adding: results/checkpoint-508/config.json (deflated 55%)
  adding: results/checkpoint-1524/ (stored 0%)
  adding: results/checkpoint-1524/optimizer.pt (deflated 100%)
  adding: results/checkpoint-1524/training_args.bin (deflated 51%)
  adding: results/checkpoint-1524/model.safetensors (deflated 100%)
  adding: results/checkpoint-1524/trainer_state.json (deflated 66%)
  adding: results/checkpoint-1524/rng_state.pth (deflated 25%)
  adding: results/checkpoint-1524/scheduler.pt (deflated 56%)
  adding: results/checkpoint-1524/config

In [None]:
!zip -r data2.zip sample_data/ /sst2_fine_tuned_model /sst2_fine_tuned_tokenizer


  adding: sample_data/ (stored 0%)
  adding: sample_data/anscombe.json (deflated 83%)
  adding: sample_data/README.md (deflated 39%)
  adding: sample_data/california_housing_train.csv (deflated 79%)
  adding: sample_data/mnist_test.csv (deflated 88%)
  adding: sample_data/mnist_train_small.csv


zip error: Interrupted (aborting)


In [None]:
# Archive the Weights & Biases run directory (/content/wandb) into wand.zip.
!zip -r wand.zip /content/wandb


  adding: content/wandb/ (stored 0%)
  adding: content/wandb/latest-run/ (stored 0%)
  adding: content/wandb/latest-run/tmp/ (stored 0%)
  adding: content/wandb/latest-run/tmp/code/ (stored 0%)
  adding: content/wandb/latest-run/run-3nesjqpa.wandb (deflated 78%)
  adding: content/wandb/latest-run/files/ (stored 0%)
  adding: content/wandb/latest-run/files/output.log (deflated 76%)
  adding: content/wandb/latest-run/files/requirements.txt (deflated 55%)
  adding: content/wandb/latest-run/files/wandb-metadata.json (deflated 44%)
  adding: content/wandb/latest-run/logs/ (stored 0%)
  adding: content/wandb/latest-run/logs/debug-internal.log (deflated 80%)
  adding: content/wandb/latest-run/logs/debug.log (deflated 75%)
  adding: content/wandb/latest-run/logs/debug-core.log (deflated 58%)
  adding: content/wandb/debug-internal.log (deflated 80%)
  adding: content/wandb/debug.log (deflated 75%)
  adding: content/wandb/run-20241119_131446-5ltc0fhj/ (stored 0%)
  adding: content/wandb/run-2024

In [None]:
from google.colab import drive
# Same mount call as the earlier Drive cell.
# NOTE(review): presumably only needed again if the runtime was restarted in between -- confirm.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved model directory into the NLP_A3 folder on the mounted Drive.
!cp -r /content/sst2_fine_tuned_model/ /content/drive/MyDrive/NLP_A3/


In [None]:
# Copy the saved tokenizer directory into the NLP_A3 folder on the mounted Drive.
!cp -r /content/sst2_fine_tuned_tokenizer/ /content/drive/MyDrive/NLP_A3/


In [None]:
# Copy the Weights & Biases run logs into the NLP_A3 folder on the mounted Drive.
!cp -r /content/wandb/ /content/drive/MyDrive/NLP_A3/


In [None]:
# Copy the Trainer output directory (checkpoints) into the NLP_A3 folder on the mounted Drive.
!cp -r /content/results/ /content/drive/MyDrive/NLP_A3/


In [None]:
# Ask PyTorch to release GPU memory that its caching allocator holds but is
# not currently using. Memory still referenced by live tensors (e.g. the
# model) is not freed by this call.
torch.cuda.empty_cache()