In [1]:
#!/usr/bin/env python
# coding: utf-8

# Install necessary libraries
!pip install -U transformers datasets accelerate peft trl bitsandbytes


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m667.1 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.13.0-py3-none-any

In [2]:
# Log in to Hugging Face
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `mariamattiaa` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re

In [3]:
# Define paths and model configurations
base_model = "meta-llama/Llama-3.2-3B-Instruct"


In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [5]:
# Check if GPU is available
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Ensure GPU is available for optimal performance.")

CUDA Available: True
Device Name: Tesla T4


In [6]:
# QLoRA Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)


In [7]:
# Load the base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
# Function to generate prompt with Gardiner codes and meanings
def generate_prompt_with_data(dataframe):
    base_prompt = (
        "You are an expert in translating Gardiner codes into meaningful English sentences. "
        "Your task involves the following steps: \n"
        "1. For a single Gardiner code, provide its meaning clearly and concisely.\n"
        "2. For multiple Gardiner codes, explain each code individually and combine their meanings into a coherent sentence.\n"
        "3. If you encounter an unknown Gardiner code, respond with 'I do not know'.\n"
        "Guidelines: \n"
        "- Ensure accuracy and professionalism.\n"
        "- Use only the provided Gardiner codes and meanings.\n"
        "Available Gardiner codes and their meanings: \n"
    )
    codes_meanings = "\n".join(
        f"{row['gardiner_code']}: {row['english_translation']}"
        for _, row in dataframe.iterrows()
    )
    return base_prompt + codes_meanings

In [9]:
# Load data and generate the full prompt
data_path = '/content/cleaned_dataset2 (1).csv'
data = pd.read_csv(data_path, encoding='latin1')
explanation_prompt = generate_prompt_with_data(data)

In [10]:

# Initialize the text-generation pipeline
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    temperature=0.5  # Lower temperature for more deterministic output
)

Device set to use cuda:0


In [11]:
# Single Gardiner code query
single_query = f"{explanation_prompt}\nWhat does Gardiner code 'A5' mean?"
single_response = llm_pipeline(single_query)
print(f"Response for Single Code: {single_response[0]['generated_text']}")


Response for Single Code: You are an expert in translating Gardiner codes into meaningful English sentences. Your task involves the following steps: 
1. For a single Gardiner code, provide its meaning clearly and concisely.
2. For multiple Gardiner codes, explain each code individually and combine their meanings into a coherent sentence.
3. If you encounter an unknown Gardiner code, respond with 'I do not know'.
Guidelines: 
- Ensure accuracy and professionalism.
- Use only the provided Gardiner codes and meanings.
Available Gardiner codes and their meanings: 
A1: man, names
A2: eat, drink, speak, think.
A3: sit.
A4: adoration, hide
A5: hide
A6: purity, cleanliness
A7: weary, weak
A8: jubilation
A9: load, carry, work
A10: sail
A11: friend
A12: army, soldier
A13: enemy
A14: die, enemy
A14a: die, enemy
A15: fall, overthrow
A16: bow
A17: young, child
A17a: sit, young
A18: child-king
A19: old, eldest, great one, chief
A20: old, eldest, great one, chief
A21: Official, noble
A22: sta

In [12]:
# Multiple Gardiner codes query
multiple_query = f"{explanation_prompt}\nWhat do Gardiner codes 'A5', 'A10', and 'A3' mean?"
multiple_response = llm_pipeline(multiple_query)
print(f"Response for Multiple Codes: {multiple_response[0]['generated_text']}")

Response for Multiple Codes: You are an expert in translating Gardiner codes into meaningful English sentences. Your task involves the following steps: 
1. For a single Gardiner code, provide its meaning clearly and concisely.
2. For multiple Gardiner codes, explain each code individually and combine their meanings into a coherent sentence.
3. If you encounter an unknown Gardiner code, respond with 'I do not know'.
Guidelines: 
- Ensure accuracy and professionalism.
- Use only the provided Gardiner codes and meanings.
Available Gardiner codes and their meanings: 
A1: man, names
A2: eat, drink, speak, think.
A3: sit.
A4: adoration, hide
A5: hide
A6: purity, cleanliness
A7: weary, weak
A8: jubilation
A9: load, carry, work
A10: sail
A11: friend
A12: army, soldier
A13: enemy
A14: die, enemy
A14a: die, enemy
A15: fall, overthrow
A16: bow
A17: young, child
A17a: sit, young
A18: child-king
A19: old, eldest, great one, chief
A20: old, eldest, great one, chief
A21: Official, noble
A22: 

In [15]:
# Test: Generate a sentence from multiple Gardiner codes
def test_generate_sentence_with_model():
    # Step 1: Prepare a query for multiple Gardiner codes
    gardiner_codes_query = "What do Gardiner codes 'A1', 'A2', 'G1', and 'F12' mean?"
    print("\nTesting Sentence Generation from Model Translations:")
    print(f"Query: {gardiner_codes_query}")

    # Step 2: Generate translations using the model
    translations_response = llm_pipeline(f"{explanation_prompt}\n{gardiner_codes_query}")
    translations = translations_response[0]['generated_text']
    print(f"Generated Translations: {translations}")

    # Step 3: Generate a full sentence based on the translations
    full_sentence = generate_sentence_from_translations(translations)
    print(f"Generated Sentence: {full_sentence}")

# Run the test
test_generate_sentence_with_model()




Testing Sentence Generation from Model Translations:
Query: What do Gardiner codes 'A1', 'A2', 'G1', and 'F12' mean?
Generated Translations: You are an expert in translating Gardiner codes into meaningful English sentences. Your task involves the following steps: 
1. For a single Gardiner code, provide its meaning clearly and concisely.
2. For multiple Gardiner codes, explain each code individually and combine their meanings into a coherent sentence.
3. If you encounter an unknown Gardiner code, respond with 'I do not know'.
Guidelines: 
- Ensure accuracy and professionalism.
- Use only the provided Gardiner codes and meanings.
Available Gardiner codes and their meanings: 
A1: man, names
A2: eat, drink, speak, think.
A3: sit.
A4: adoration, hide
A5: hide
A6: purity, cleanliness
A7: weary, weak
A8: jubilation
A9: load, carry, work
A10: sail
A11: friend
A12: army, soldier
A13: enemy
A14: die, enemy
A14a: die, enemy
A15: fall, overthrow
A16: bow
A17: young, child
A17a: sit, young
A1

In [16]:
# Evaluation Metrics
def evaluate_model(predictions, references):
    accuracy = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, average='weighted')
    recall = recall_score(references, predictions, average='weighted')
    f1 = f1_score(references, predictions, average='weighted')

    print(f"Evaluation Metrics:\n")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
