In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from utils import calculate_length, preprocess_article

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset("csv", data_files="/home/getachew_abebe/LLM_Fine-Tunning_Amharic/data/Amharic.csv")

In [3]:
data['train'] = data['train'].map(calculate_length, batched=False)
data['train'] = data['train'].map(preprocess_article, batched=False)

In [4]:
# Model
base_model = "NousResearch/Llama-2-7b-hf"
new_model = "llama-2-7b-Amharic"
# Dataset
dataset = data
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

In [5]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.94s/it]


: 

In [13]:
print(list(dataset.keys()))

['train']


In [24]:
# Set training arguments
training_arguments = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=10,
        gradient_accumulation_steps=1,
        evaluation_strategy="steps",
        eval_steps=1000,
        logging_steps=1,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=10,
        # report_to="wandb",
        max_steps=2, # Remove this line for a real fine-tuning
        logging_strategy="steps",  # <--- Add this line
        save_strategy="steps",  # <--- Add this line

)
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="article",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()
# # Access the training and evaluation losses
# train_loss = trainer.state.log_history[-1]['train_loss']  # Last training loss
# eval_loss = trainer.state.log_history[-1]['eval_loss']   # Last evaluation loss

# print(f"Training Loss: {train_loss}")
# print(f"Evaluation Loss: {eval_loss}")
# Save trained model
trainer.model.save_pretrained(new_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


In [18]:
dataset['article']

AttributeError: 'DatasetDict' object has no attribute 'describe'

In [None]:
category = list(set(data['train']['category']))

checkpoint = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the dataset
category_to_id = {cat: idx for idx, cat in enumerate(category)}

def tokenize_function(example):
    inputs = tokenizer(example['article'], padding=True, truncation=True, max_length=512)
    inputs["labels"] = category_to_id[example["category"]]  # Assuming category is already integer-encoded
    return inputs



In [9]:
tokenized_datasets = raw_datasets.map(tokenize_function)
# Use a data collator to apply dynamic batches

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

tokenized_datasets.set_format("torch")

print(tokenized_datasets)

Map: 100%|██████████| 49532/49532 [03:15<00:00, 253.63 examples/s]
Map: 100%|██████████| 12383/12383 [00:48<00:00, 254.55 examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'category', 'word_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 49532
    })
    test: Dataset({
        features: ['article', 'category', 'word_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12383
    })
})





In [12]:
tokenized_datasets['train'][1]

{'article': 'ሶማልያ የሚገኘውና ሞቅዲሾ ስታዲየም ውስጥ ሰፍሮ የነበረው የተመድ ሚሽን አሚሶም ከስፖርት ሜዳው ለቆ መውጣቱ ተገለፀአፍሪካ ህብረት ሚሽን በመጨረሻ የወጣቱን ድምፅ ሰምቶ ብሄራዊው ስታዲየም እንደገና ለስፖርቱ እንቅስቃሴ እንዲውል ሜዳውን በመልቀቁ እጅግ ተደስተናል ሲሉ የወጣቶችና ስፖርት ሚኒስትሯ ከሀዲጆ ሞሀመድ በርክክቡ ስነ ስርአት ላይ ተናግረዋልየሶማልያው ፕሬዚደንት ሞሀመድ አብዱላሂ ፎርማጆ በበኩላቸው መንግስት የጦር ቀጣና ሆኖ የቆየውንና በእጅጉ የተጎዳውን ስታዲየም ለሀገሪቱ የስፖርት እንቅስቃሴ ለማዋል ጥረት እንደሚያደርግ አስታውቀዋልይህ እአአ በ1970ዎቹ በቻይናውያን የተገነባው ታዲየም ከወታደራዊ ልምምድ ሌላ ምንም አይነት የአትሌቲክስ እንቅስቃሴ እንዳላስተናገደ ይታወቃል',
 'category': 'International News',
 'word_count': tensor(443),
 'input_ids': tensor([    1, 29871,   228,   139,   185,   228,   139,   158,   228,   139,
           144,   228,   142,   174, 29871,   228,   142,   171,   228,   139,
           157,   228,   143,   139,   228,   141,   155,   228,   142,   144,
           228,   141,   150, 29871,   228,   139,   161,   228,   140,   136,
           228,   142,   181,   228,   139,   193, 29871,   228,   139,   184,
           228,   140,   182,   228,   142,   181,   228,   142,   171,   228,
  

In [13]:
# Load the model
from transformers import AutoModelForSequenceClassification

# roberta-base

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(category),
    id2label = {i: lbl for i, lbl in enumerate(category)},
    label2id = {lbl: i for i, lbl in enumerate(category)},
    device_map="cuda"
)

Downloading shards: 100%|██████████| 2/2 [00:19<00:00,  9.90s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.44s/it]
Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

batch_size = 64
epochs = 5

training_args = TrainingArguments(
    output_dir=checkpoint+"-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    seed=42,
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate
import numpy as np

def compute_metrics(eval_preds):
  metric1 = evaluate.load("accuracy")
  metric2 = evaluate.load("precision")
  metric3 = evaluate.load("recall")
  metric4 = evaluate.load("f1")

  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  accuracy = metric1.compute(predictions=predictions, references=labels)["accuracy"]
  precision = metric2.compute(predictions=predictions, references=labels, average='weighted')["precision"]
  recall = metric3.compute(predictions=predictions, references=labels, average='weighted')["recall"]
  f1 = metric4.compute(predictions=predictions, references=labels, average='weighted')["f1"]

  return {
      "accuracy": accuracy,
      "precision": precision,
      "recall": recall,
      "f1": f1
  }

compute_metrics(([[1,0], [0,1]], [0,1]))

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [28]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load metrics and evaluate the model

from torch.utils.data import DataLoader
eval_dataset = tokenized_datasets["test"].remove_columns([
    'article', 'category', 'word_count'
    ]).with_format("torch")



eval_dataloader = DataLoader(
    eval_dataset,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

y_test, y_pred = [], []
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to('cuda') for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    y_pred.extend(predictions.cpu().numpy())
    y_test.extend(batch["labels"].cpu().numpy())


In [29]:
import evaluate
import numpy as np

def compute_metrics(y_pred, y_test):
  metric1 = evaluate.load("accuracy")
  metric2 = evaluate.load("precision")
  metric3 = evaluate.load("recall")
  metric4 = evaluate.load("f1")

  #logits, labels = y_preds
  #predictions = np.argmax(logits, axis=-1)

  accuracy = metric1.compute(predictions=y_pred, references=y_test)["accuracy"]
  precision = metric2.compute(predictions=y_pred, references=y_test, average='weighted')["precision"]
  recall = metric3.compute(predictions=y_pred, references=y_test, average='weighted')["recall"]
  f1 = metric4.compute(predictions=y_pred, references=y_test, average='weighted')["f1"]

  return {
      "accuracy": accuracy,
      "precision": precision,
      "recall": recall,
      "f1": f1
  }


In [30]:
compute_metrics(y_pred, y_test)

{'accuracy': 0.875232173140596,
 'precision': 0.8749618364453737,
 'recall': 0.875232173140596,
 'f1': 0.8748733706909614}

In [31]:
compute_metrics(y_pred, y_test)

{'accuracy': 0.875232173140596,
 'precision': 0.8749618364453737,
 'recall': 0.875232173140596,
 'f1': 0.8748733706909614}

In [32]:
metric = evaluate.load("f1")
metric.compute(predictions=y_pred, references=y_test, average='weighted')

{'f1': 0.8748733706909614}