In [1]:
# !pip install transformers torch
# !pip install pandas pyarrow datasets


In [2]:
# !huggingface-cli login

In [3]:
import pandas as pd

# Parquet shards for the two splits, expected in the current working directory.
TRAIN_PARQUET = "train-00000-of-00001.parquet"
VAL_PARQUET = "validation-00000-of-00001.parquet"

# Read each split into its own DataFrame.
train_df = pd.read_parquet(TRAIN_PARQUET)
val_df = pd.read_parquet(VAL_PARQUET)

# Check the structure
# print(train_df.head())

In [4]:
# print(train_df[:3])
# print(len(val_df))

In [5]:
from datasets import Dataset

# Number of leading training rows to keep.
# Subset sizes tried earlier: len(val_df)*4, 57625, 36040.
nRows = 29729

# Wrap the (truncated) training frame and the full validation frame
# as Hugging Face Dataset objects.
train_subset_df = train_df.iloc[:nRows]
train_dataset = Dataset.from_pandas(train_subset_df)
val_dataset = Dataset.from_pandas(val_df)

# Show the first training record as a sanity check.
print(train_dataset[0])


{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'answer_start': [269], 'text': ['in the late 1990s']}}


In [6]:
# print(train_dataset[0]['id'])


In [7]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint whose tokenizer is loaded here.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Reuse the end-of-sequence token for padding
# (presumably the checkpoint defines no pad token of its own -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Answer text(s) of the first training example (row first, then column).
train_dataset[0]["answers"]["text"]

['in the late 1990s']

In [None]:
def _char_span_to_token_span(offsets, sequence_ids, start_char, end_char):
    """Convert a character span of the context into inclusive token indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one encoded example.
        sequence_ids: Per-token sequence id for the same example. Only tokens
            whose id is ``1`` (the second sequence, i.e. the context) are
            searched; question tokens, special tokens and padding are ignored.
        start_char: Character index in the context where the answer starts.
        end_char: Character index in the context one past the answer's end.

    Returns:
        ``(start_token, end_token)``, or ``(0, 0)`` when the answer is not
        fully inside the encoded context window (e.g. cut off by truncation).
    """
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return 0, 0

    # Character range of the context that survived truncation.
    window_start = offsets[context_tokens[0]][0]
    window_end = offsets[context_tokens[-1]][1]
    if start_char < window_start or end_char > window_end:
        return 0, 0

    # Match by containment rather than exact equality so tokens whose span
    # includes a leading space (or that straddle the boundary) are still found.
    start_token = next(
        (idx for idx in context_tokens if offsets[idx][1] > start_char), None
    )
    end_token = next(
        (idx for idx in reversed(context_tokens) if offsets[idx][0] < end_char), None
    )
    if start_token is None or end_token is None or end_token < start_token:
        return 0, 0
    return start_token, end_token


# Define the preprocess function to tokenize the data
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer token spans.

    Args:
        examples: A batch (dict of lists) with ``"question"``, ``"context"``
            and ``"answers"`` entries; each answer is a dict with
            ``"answer_start"`` and ``"text"`` lists. Only the first answer of
            each example is used.

    Returns:
        The tokenizer output with ``"start_positions"`` and
        ``"end_positions"`` added and ``"offset_mapping"`` removed.
        Examples without an answer, or whose answer does not fit inside the
        encoded context, get position ``0`` for both start and end.
    """
    # Tokenize the question and context as a pair, padded/truncated to 384 tokens.
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        answer = examples["answers"][i]
        if len(answer["text"]) == 0:  # Handle no answer case
            start_token, end_token = 0, 0
        else:
            # Character span of the (first) answer inside the context.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Offsets restart at 0 for each sequence of the pair, so the search
            # must be limited to context tokens; otherwise a question token
            # with the same offset could be picked.
            start_token, end_token = _char_span_to_token_span(
                offsets, inputs.sequence_ids(i), start_char, end_char
            )

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs.pop("offset_mapping")  # Remove offset mapping after use to save memory

    return inputs


# Tokenize both splits in batches with the same preprocessing settings.
tokenize_kwargs = {"function": preprocess_function, "batched": True}
train_dataset = train_dataset.map(**tokenize_kwargs)
val_dataset = val_dataset.map(**tokenize_kwargs)

# Check the tokenized output (for the first example)
# print(train_dataset[0])


Map:   0%|          | 0/29729 [00:00<?, ? examples/s]

### Model Fine-Tuning for Q&A: SQuAD

In [None]:
# torch.cuda.empty_cache()
# trainer.torch.cuda.empty_cache()


In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the pretrained checkpoint with a question-answering head on top.
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    # NOTE: this loads the weights themselves in float16 (half precision).
    # It is not automatic mixed precision; no fp16/bf16 flag is set on the
    # trainer, so training runs directly on half-precision weights.
    torch_dtype=torch.float16,
    device_map="auto",  # Let accelerate decide where the weights are placed
    low_cpu_mem_usage=True,
)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# `device_map="auto"` has already placed the model, so it is deliberately not
# moved again with `model.to(device)`: that call is redundant on a single
# device and conflicts with the dispatch when the weights are spread over
# several devices or offloaded. `device` is kept for any code that needs it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Training configuration: evaluate and checkpoint once per epoch and restore
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving results
    eval_strategy="epoch",            # Evaluate after each epoch
    save_strategy="epoch",            # Save model after each epoch
    # 3e-3 is orders of magnitude too high for fine-tuning every weight of a
    # pretrained transformer and makes the loss diverge; 2e-5 is the value
    # used in the Hugging Face question-answering fine-tuning guide.
    learning_rate=2e-5,               # Learning rate
    per_device_train_batch_size=8,    # Training batch size
    per_device_eval_batch_size=8,     # Evaluation batch size
    num_train_epochs=3,               # Number of epochs
    weight_decay=0.01,                # Weight decay
    logging_dir="./logs",             # Directory for logs
    logging_steps=1000,               # Log every 1000 steps
    save_total_limit=1,               # Limit on the number of checkpoints kept on disk
    load_best_model_at_end=True       # Load the best model after training
)

# The tokenizer is passed as the processing class so the trainer can use it
# together with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)

trainer.train()

In [None]:
# Run evaluation on the trainer's eval_dataset (val_dataset) and print the metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


In [None]:
# Persist the fine-tuned model and the tokenizer.
# Note: they go to two separate directories, so loading the model directory
# alone later will not find the tokenizer files there.
model.save_pretrained("./squad_fine_tuned_model")
tokenizer.save_pretrained("./squad_fine_tuned_tokenizer")


In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the saved model directory,
# reusing the tokenizer that is already in memory.
qa_pipeline = pipeline(
    task="question-answering",
    model="./squad_fine_tuned_model",
    tokenizer=tokenizer,
)

# Try the pipeline on a small hand-written example.
question = "Where is the Eiffel Tower located?"
context = "The Eiffel Tower is located in Paris and is one of the most famous landmarks in the world."
sample = {"context": context, "question": question}

result = qa_pipeline(sample)
print(result)


In [None]:
# Mount Google Drive at /content/drive (Colab only) so the saved artifacts
# can be copied there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create the destination folder first: if NLP_A3 is missing, the copy would
# not end up at NLP_A3/squad_fine_tuned_model as intended.
!mkdir -p /content/drive/MyDrive/NLP_A3/
!cp -r /content/squad_fine_tuned_model/ /content/drive/MyDrive/NLP_A3/


In [None]:
# Create the destination folder first: if NLP_A3 is missing, the copy would
# not end up at NLP_A3/squad_fine_tuned_tokenizer as intended.
!mkdir -p /content/drive/MyDrive/NLP_A3/
!cp -r /content/squad_fine_tuned_tokenizer/ /content/drive/MyDrive/NLP_A3/
