In [1]:
# Install necessary libraries
get_ipython().system('pip install -U transformers datasets accelerate peft trl bitsandbytes langchain chromadb sentence-transformers flash-attn')


Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting langchain
  Downloading langchain-0.3.12-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.2.post1.tar.gz (3.1

In [2]:
pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [3]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import (
    LoraConfig,
    get_peft_model
)
from trl import SFTTrainer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document

In [4]:
# Log in to Hugging Face
get_ipython().system('huggingface-cli login')



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `mariamattiaa` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re

In [5]:
# Define paths and model configurations
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "/content/llama3.2-finetuned"
dataset_path = "/content/cleaned_dataset.csv"
persist_directory = "db"

In [6]:
# Check GPU capability and set dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [7]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [8]:
# Load the base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [9]:
# Prepare and clean the dataset
df = pd.read_csv(dataset_path, encoding='latin-1')
df.drop_duplicates(subset=['gardiner_code'], inplace=True)
df.dropna(subset=['gardiner_code', 'english_translation'], inplace=True)

instruction = """You are an expert in translating Gardiner codes into their English meanings.\nAnswer questions about the meaning of any Gardiner code concisely and accurately.\nIf multiple Gardiner codes are provided, combine their meanings into a full, coherent sentence."""

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f"What does '{row['gardiner_code']}' mean?"},
        {"role": "assistant", "content": row["english_translation"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Apply the formatting to the dataset without num_proc
df = df.apply(format_chat_template, axis=1)

dataset = Dataset.from_pandas(df)

In [10]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)


In [11]:
pip install --upgrade transformers




In [12]:
# Training arguments
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    num_train_epochs=15,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=20,
    weight_decay=0.01,
    group_by_length=True,
    save_total_limit=2,
    report_to="none",
    fp16=True if torch_dtype == torch.float16 else False,
    bf16=True if torch_dtype == torch.bfloat16 else False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()


  trainer = SFTTrainer(


Map:   0%|          | 0/762 [00:00<?, ? examples/s]

Step,Training Loss
191,0.9964
382,0.5065
573,0.395
764,0.2486
955,0.2319
1146,0.2164
1337,0.202
1528,0.1887
1719,0.1758
1910,0.1642




TrainOutput(global_step=2850, training_loss=0.2697569535907946, metrics={'train_runtime': 3650.3546, 'train_samples_per_second': 3.131, 'train_steps_per_second': 0.781, 'total_flos': 1.861920485092147e+16, 'train_loss': 0.2697569535907946, 'epoch': 14.923884514435695})

In [13]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)




('/content/llama3.2-finetuned/tokenizer_config.json',
 '/content/llama3.2-finetuned/special_tokens_map.json',
 '/content/llama3.2-finetuned/tokenizer.json')

In [14]:
# LangChain RAG Integration
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
from langchain.llms import HuggingFacePipeline

In [16]:
# Convert dataset rows to Document objects
documents = [
    Document(
        page_content=f"Gardiner code {row['gardiner_code']} represents '{row['english_translation']}'.",
        metadata={"gardiner_code": row['gardiner_code']}
    )
    for _, row in df.iterrows()
]

db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_directory,
)

retriever = db.as_retriever(search_kwargs={"k": 3})

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    temperature=0.5,  # Lower temperature for more deterministic output
)

llm = HuggingFacePipeline(pipeline=llm_pipeline)

retrieval_qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

prompt_template = PromptTemplate(
    template="""You are a professional Gardiner code expert with the task of translating Gardiner codes into meaningful and accurate English sentences.\n\nYour Responsibilities:\n1. For a single Gardiner code, provide its meaning clearly and concisely. Do not include explanations or references to other Gardiner codes unless explicitly requested.\n2. For multiple Gardiner codes, provide the meaning of each code separately and then combine their meanings into a coherent and professional sentence that accurately reflects their individual meanings.\n3. If you do not know the meaning of a code, state clearly: 'I do not know.' Do not fabricate answers.\n\nGuidelines:\n- Use only the relevant pieces of context provided.\n- Ensure your answers are precise and directly address the question.\n- For single-code queries, include only the requested code's translation without mentioning unrelated codes.\n- For multi-code queries, ensure each code's meaning is included explicitly in the response.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:""",
    input_variables=["context", "question"],
)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

In [17]:
# Single Gardiner code query
query = "What does Gardiner code 'A5' mean?"
response = retrieval_qa.run(query)
print(f"Generated Response for Single Code: {response}")

  response = retrieval_qa.run(query)


Generated Response for Single Code: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Gardiner code A5 represents 'hide'.

Gardiner code X5 represents 'provisions'.

Gardiner code A25 represents 'strike'.

Question: What does Gardiner code 'A5' mean?
Helpful Answer: hide, hide, cover, expose. Y Helpful Answer: cover, expose, hide, hang. (If this is a helpful answer, follow it with "up".) -- 1970, 82. -- 2014, 23. -- 2017, 14. -- 2019, 27. -- 2020, 20. -- 2021, 26. -- 2022, 24. -- 2023, 25. --


In [18]:
# Multiple Gardiner codes query
query = "What do Gardiner codes 'A5', 'A10', and 'A3' mean?"
response = retrieval_qa.run(query)
print(f"Generated Response for Multiple Codes: {response}")

Generated Response for Multiple Codes: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Gardiner code A23 represents 'sovereign'.

Gardiner code A13 represents 'enemy'.

Gardiner code A3 represents 'sit.'.

Question: What do Gardiner codes 'A5', 'A10', and 'A3' mean?
Helpful Answer: A3 means sit. A23 means sovereign. A13 means enemy.

[You can use this helpful answer as long as it applies to the first, second, or (in this case) third question. If a question doesn't use this helpful answer, don't combine it with this answer.]

Don't combine this answer with any other answer.

Don't use this answer to mean (in a 1834 sense).

Don't use this term as


In [20]:
pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [23]:
# Import necessary libraries for evaluation
from evaluate import load  # Corrected library import for metrics
from math import exp
import numpy as np
import torch

# Function to calculate Perplexity
def calculate_perplexity(model, tokenizer, dataset):
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    for example in dataset:
        inputs = tokenizer(example["text"], return_tensors="pt", padding=True, truncation=True)
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        total_loss += outputs.loss.item()
    avg_loss = total_loss / len(dataset)
    perplexity = exp(avg_loss)
    return perplexity


# Evaluation phase
print("Evaluating model performance...")

# Use the entire dataset for evaluation
evaluation_dataset = dataset  # Use your entire dataset directly

# Perplexity Evaluation
perplexity = calculate_perplexity(model, tokenizer, evaluation_dataset)
print(f"Model Perplexity: {perplexity}")



Evaluating model performance...
Model Perplexity: 1.144742176729515
