In [1]:
%%capture
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb


In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `mariamattiaa` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re

In [4]:
base_model = "meta-llama/Llama-3.2-1B-Instruct"
new_model = "/content/llama3.2-1B-finetuned"
dataset = "/content/cleaned_dataset2.csv"

In [5]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [7]:
# Importing the dataset
from datasets import Dataset
import pandas as pd

df = pd.read_csv(dataset, encoding='latin-1')
df = Dataset.from_pandas(df)

instruction = """You are an expert in translating Gardiner codes into their English meanings.
    Answer questions about the meaning of any Gardiner code concisely and accurately.
    """

# Function to format each row as a chat template
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f"What does '{row['gardiner_code']}' mean?"},
        {"role": "assistant", "content": row["english_translation"]}
    ]

    # Format the row for chat template
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Map the dataset to the desired format
dataset = df.map(
    format_chat_template,
    num_proc=4,
)


Map (num_proc=4):   0%|          | 0/762 [00:00<?, ? examples/s]

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer)) # Update model embeddings

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [9]:
dataset['text'][390]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 18 Dec 2024\n\nYou are an expert in translating Gardiner codes into their English meanings.\n    Answer questions about the meaning of any Gardiner code concisely and accurately.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat does 'N19' mean?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nhorizon, Horakhty.<|eot_id|>"

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [25]:
# Training arguments
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    num_train_epochs=15,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=1e-4,
    warmup_steps=20,
    weight_decay=0.01,
    group_by_length=False,
    fp16=False,
    bf16=False,
    save_total_limit=2,
    report_to="none",
)




In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,

)


  trainer = SFTTrainer(


Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [27]:
trainer.train()

Step,Training Loss
47,0.1338
94,0.1281
141,0.1087




TrainOutput(global_step=141, training_loss=0.1235424778985639, metrics={'train_runtime': 186.0878, 'train_samples_per_second': 12.285, 'train_steps_per_second': 0.758, 'total_flos': 1103784929869824.0, 'train_loss': 0.1235424778985639, 'epoch': 2.9842931937172774})

In [28]:
# Define the system instruction and the user query
instruction = "You are an expert in translating Gardiner codes into their English meanings. Answer questions about Gardiner codes concisely and accurately."

# Create a message template for the test input
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "What does 'A12' mean?"}
]

# Generate the prompt using your tokenizer
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate the response using the fine-tuned model
outputs = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract and print the assistant's response
response = text.split("assistant")[-1].strip()
print(f"Model Response: {response}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model Response: army, soldier


In [29]:
# Define the system instruction and the user query
instruction = "You are an expert in translating Gardiner codes into their English meanings. Answer questions about Gardiner codes concisely and accurately."

# Create a message template for the test input
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "What does 'A1' mean?"}
]

# Generate the prompt using your tokenizer
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate the response using the fine-tuned model
outputs = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract and print the assistant's response
response = text.split("assistant")[-1].strip()
print(f"Model Response: {response}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model Response: man, names


In [30]:
# Define the system instruction and the user query
instruction = "You are an expert in translating Gardiner codes into their English meanings. Answer questions about Gardiner codes concisely and accurately."

# Create a message template for the test input
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "What does 'B7' mean?"}
]

# Generate the prompt using your tokenizer
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate the response using the fine-tuned model
outputs = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract and print the assistant's response
response = text.split("assistant")[-1].strip()
print(f"Model Response: {response}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model Response: queens names


In [18]:
!pip install evaluate
!pip install rouge-score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d5c029638f9854417ddc7587ae21d76184bf280b3d2706a80d54a56f5ffdadb8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [31]:
predictions = []
references = []

for row in dataset:
    gardiner_code = row['gardiner_code']
    reference_translation = row['english_translation']

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f"What does '{gardiner_code}' mean?"}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = text.split("assistant")[-1].strip()

    predictions.append(response)
    references.append(reference_translation)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [32]:
from nltk.translate.bleu_score import corpus_bleu

tokenized_references = [[ref.split()] for ref in references]
tokenized_predictions = [pred.split() for pred in predictions]

bleu_score = corpus_bleu(tokenized_references, tokenized_predictions)
print(f"BLEU Score: {bleu_score * 100:.2f}")


BLEU Score: 40.87


In [33]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]

rouge_l_scores = [score['rougeL'].fmeasure for score in rouge_scores]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
print(f"Average ROUGE-L Score: {average_rouge_l:.2f}")


Average ROUGE-L Score: 0.94


In [34]:
def compute_f1(pred_tokens, ref_tokens):
    common = set(pred_tokens) & set(ref_tokens)
    precision = len(common) / len(pred_tokens) if len(pred_tokens) > 0 else 0
    recall = len(common) / len(ref_tokens) if len(ref_tokens) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

# Calculate F1 for each sentence
sentence_f1_scores = [
    compute_f1(pred.split(), ref.split()) for pred, ref in zip(predictions, references)
]

# Average F1 score
average_f1 = sum(sentence_f1_scores) / len(sentence_f1_scores)
print(f"Sentence-Level F1 Score: {average_f1:.4f}")


Sentence-Level F1 Score: 0.9356
