In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LlamaTokenizer,
    LlamaForCausalLM, 
    GenerationConfig,
    Trainer,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import unicodedata
import string
from datasets import load_dataset
import re
import os
import io
from nltk import BigramCollocationFinder
import nltk.collocations


  from .autonotebook import tqdm as notebook_tqdm


# Loading the model

In [2]:
checkpoint = "iocuydi/llama-2-amharic-3784m"
commit_hash = "04fcac974701f1dab0b8e39af9d3ecfce07b3773"
# The commit hash is needed, because the model repo was rearranged after this commit (files -> finetuned/files),
# and I couldn't load the model from the new structure

tokenizer = LlamaTokenizer.from_pretrained(checkpoint, revision =commit_hash)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [3]:
# Model and tokenizer loading
model_name = "NousResearch/Llama-2-7b-hf"
llama_model = LlamaForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto"
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.16s/it]


In [4]:
import torch

torch.cuda.is_available()

True

In [5]:
llama_model.resize_token_embeddings(len(tokenizer)) # needed because the fine-tuned model extended the tokenizer

Embedding(51008, 4096)

In [6]:
# this is the model we want:
model = PeftModel.from_pretrained(llama_model, "iocuydi/llama-2-amharic-3784m",revision =commit_hash)

# Setting up the Parameters

In [7]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load the data

In [8]:
from datasets import load_dataset

# List of dataset names
dataset_names = [
    "simonbutt/amharic_truthful_qa",
    "simonbutt/amharic_gsm8k",
    "EthioNLP/Amharic_LLAMA_MT",
    "EthioNLP/Amharic_Instruction_dataset",
    "Tvsybkzkmapab/Amharic_ad_generation",
    "BiniyamAjaw/amharic_dataset_v2",
    "Henok/amharic-qa"
]

# Dictionary to store loaded datasets
loaded_datasets = {}

# Loop through dataset names and load datasets
for dataset_name in dataset_names:
    try:
        dataset = load_dataset(dataset_name)
        loaded_datasets[dataset_name] = dataset
        print(f"Loaded {dataset_name} successfully.")
    except Exception as e:
        print(f"Failed to load {dataset_name}: {e}")

# Example of accessing a specific split of a dataset
if "rasyosef/amharic-news-category-classification" in loaded_datasets:
    dataset = loaded_datasets["rasyosef/amharic-news-category-classification"]
    if "train" in dataset:
        train_data = dataset["train"]
        print(f"Train data size: {len(train_data)}")
    if "test" in dataset:
        test_data = dataset["test"]
        print(f"Test data size: {len(test_data)}")
    if "validation" in dataset:
        val_data = dataset["validation"]
        print(f"Validation data size: {len(val_data)}")


Loaded simonbutt/amharic_truthful_qa successfully.
Loaded simonbutt/amharic_gsm8k successfully.
Loaded EthioNLP/Amharic_LLAMA_MT successfully.
Loaded EthioNLP/Amharic_Instruction_dataset successfully.
Loaded Tvsybkzkmapab/Amharic_ad_generation successfully.
Loaded BiniyamAjaw/amharic_dataset_v2 successfully.
Loaded Henok/amharic-qa successfully.


# Preprocessing the Data

In [9]:
import re
import os
import io
import nltk
from datasets import load_dataset
from nltk.collocations import BigramCollocationFinder

class AmharicPreprocessor:
    def __init__(self, expansion_file_dir='/home/nyamusi_ontita/Amharic_LLM_Finetuning-1/llm-finetuning/short_forms.txt', bigram_dir='bigrams.txt'):
        self.expansion_file_dir = expansion_file_dir
        self.bigram_dir = bigram_dir
        self.short_form_dict = self.get_short_forms()

    def get_short_forms(self):
        exp = {}
        try:
            with open(self.expansion_file_dir, encoding='utf8') as text:
                for line in text:
                    line = line.strip()
                    if not line:
                        continue
                    expanded = line.split("-")
                    exp[expanded[0].strip()] = expanded[1].replace(" ", '_').strip()
        except FileNotFoundError:
            print(f"File not found: {self.expansion_file_dir}")
        return exp

    def expand_short_form(self, input_short_word):
        return self.short_form_dict.get(input_short_word, input_short_word)

    def normalize_char_level_mismatch(self, input_token):
        replacements = [
            ('[ሃኅኃሐሓኻ]', 'ሀ'), ('[ሑኁዅ]', 'ሁ'), ('[ኂሒኺ]', 'ሂ'), ('[ኌሔዄ]', 'ሄ'), ('[ሕኅ]', 'ህ'),
            ('[ኆሖኾ]', 'ሆ'), ('[ሠ]', 'ሰ'), ('[ሡ]', 'ሱ'), ('[ሢ]', 'ሲ'), ('[ሣ]', 'ሳ'), ('[ሤ]', 'ሴ'),
            ('[ሥ]', 'ስ'), ('[ሦ]', 'ሶ'), ('[ዓኣዐ]', 'አ'), ('[ዑ]', 'ኡ'), ('[ዒ]', 'ኢ'), ('[ዔ]', 'ኤ'),
            ('[ዕ]', 'እ'), ('[ዖ]', 'ኦ'), ('[ጸ]', 'ፀ'), ('[ጹ]', 'ፁ'), ('[ጺ]', 'ፂ'), ('[ጻ]', 'ፃ'),
            ('[ጼ]', 'ፄ'), ('[ጽ]', 'ፅ'), ('[ጾ]', 'ፆ'), ('(ሉ[ዋአ])', 'ሏ'), ('(ሙ[ዋአ])', 'ሟ'),
            ('(ቱ[ዋአ])', 'ቷ'), ('(ሩ[ዋአ])', 'ሯ'), ('(ሱ[ዋአ])', 'ሷ'), ('(ሹ[ዋአ])', 'ሿ'),
            ('(ቁ[ዋአ])', 'ቋ'), ('(ቡ[ዋአ])', 'ቧ'), ('(ቹ[ዋአ])', 'ቿ'), ('(ሁ[ዋአ])', 'ኋ'),
            ('(ኑ[ዋአ])', 'ኗ'), ('(ኙ[ዋአ])', 'ኟ'), ('(ኩ[ዋአ])', 'ኳ'), ('(ዙ[ዋአ])', 'ዟ'),
            ('(ጉ[ዋአ])', 'ጓ'), ('(ደ[ዋአ])', 'ዷ'), ('(ጡ[ዋአ])', 'ጧ'), ('(ጩ[ዋአ])', 'ጯ'),
            ('(ጹ[ዋአ])', 'ጿ'), ('(ፉ[ዋአ])', 'ፏ'), ('[ቊ]', 'ቁ'), ('[ኵ]', 'ኩ')
        ]
        for pattern, replacement in replacements:
            input_token = re.sub(pattern, replacement, str(input_token))
        return input_token

    def remove_punc_and_special_chars(self, text):
        return re.sub(r'[\!\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}\;\“\”\›\’\‘\"\'\:\,\.\‹\/\<\>\?\\\\|\`\´\~\-\=\+\፡\።\፤\;\፦\፥\፧\፨\፠\፣]', '', text)

    def remove_ascii_and_numbers(self, text_input):
        rm_num_and_ascii = re.sub('[A-Za-z0-9]', '', text_input)
        return re.sub('[\u1369-\u137C]+', '', rm_num_and_ascii)


    def arabic2geez(self, arabicNumber):
        ETHIOPIC_ONE = 0x1369
        ETHIOPIC_TEN = 0x1372
        ETHIOPIC_HUNDRED = 0x137B
        ETHIOPIC_TEN_THOUSAND = 0x137C

        arabicNumber = str(arabicNumber)
        n = len(arabicNumber) - 1
        if n % 2 == 0:
            arabicNumber = "0" + arabicNumber
            n += 1

        arabicBigrams = [arabicNumber[i:i+2] for i in range(0, n, 2)]
        reversedArabic = arabicBigrams[::-1]
        geez = []

        for index, pair in enumerate(reversedArabic):
            curr_geez = ''
            artens = pair[0]
            arones = pair[1]
            amtens = ''
            amones = ''
            if artens != '0':
                amtens = str(chr((int(artens) + (ETHIOPIC_TEN - 1))))
            else:
                if arones == '0':
                    continue
            if arones != '0':
                amones = str(chr((int(arones) + (ETHIOPIC_ONE - 1))))
            if index > 0:
                if index % 2 != 0:
                    curr_geez = amtens + amones + str(chr(ETHIOPIC_HUNDRED))
                else:
                    curr_geez = amtens + amones + str(chr(ETHIOPIC_TEN_THOUSAND))
            else:
                curr_geez = amtens + amones
            geez.append(curr_geez)

        geez = ''.join(geez[::-1])
        if geez.startswith('፩፻') or geez.startswith('፩፼'):
            geez = geez[1:]

        if len(arabicNumber) >= 7:
            end_zeros = ''.join(re.findall('([0]+)$', arabicNumber)[0:])
            i = int(len(end_zeros) / 3)
            if len(end_zeros) >= (3 * i):
                if i >= 3:
                    i -= 1
                for thoushand in range(i - 1):
                    geez += '፼'

        return geez

    def get_expanded_number(self, number):
        if '.' not in str(number):
            return self.arabic2geez(number)
        else:
            num, decimal = str(number).split('.')
            if decimal.startswith('0'):
                decimal = decimal[1:]
                dot = ' ነጥብ ዜሮ '
            else:
                dot = ' ነጥብ '
            return self.arabic2geez(num) + dot + self.arabic2geez(decimal)

    def tokenize(self, corpus):
        sentences = re.compile('[!?።\፡\፡]+').split(corpus)
        tokens = []
        for sentence in sentences:
            tokens.extend(sentence.split())
        return tokens

    def collocation_finder(self, tokens):
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(3)
        frequent_bigrams = finder.nbest(bigram_measures.chi_sq, 5)

        with io.open(self.bigram_dir, "w", encoding="utf8") as PhraseWriter:
            for bigram in frequent_bigrams:
                PhraseWriter.write(bigram[0] + ' ' + bigram[1] + "\n")

    def normalize_multi_words(self, tokenized_sentence, corpus):
        bigram = set()
        sent_with_bigrams = []
        index = 0

        if not os.path.exists(self.bigram_dir):
            self.collocation_finder(self.tokenize(corpus))

        try:
            with open(self.bigram_dir, encoding='utf8') as phrase_file:
                for line in phrase_file:
                    bigram.add(tuple(line.strip().split()))
        except FileNotFoundError:
            print(f"File not found: {self.bigram_dir}")

        while index < len(tokenized_sentence):
            if index + 1 < len(tokenized_sentence) and (tokenized_sentence[index], tokenized_sentence[index + 1]) in bigram:
                sent_with_bigrams.append(
                    (tokenized_sentence[index] + "_" + tokenized_sentence[index + 1]))
                index += 1
            else:
                sent_with_bigrams.append(tokenized_sentence[index])
            index += 1
        return sent_with_bigrams

    def preprocess_text(self, text):
        text = self.expand_short_form(text)
        text = self.normalize_char_level_mismatch(text)
        text = self.remove_punc_and_special_chars(text)
        text = self.remove_ascii_and_numbers(text)
        return text


def load_huggingface_dataset(dataset_name):
    return load_dataset(dataset_name)

def preprocess_dataset(dataset, text_field, preprocessor):
    def preprocess_example(example):
        if text_field in example:
            example[text_field] = preprocessor.preprocess_text(example[text_field])
        return example

    return dataset.map(preprocess_example)

In [10]:
# Instantiate your preprocessor
preprocessor = AmharicPreprocessor()

# Define the field containing text data in your dataset
text_field = "text"  # Adjust this according to your dataset structure

# Preprocess the dataset
for dataset_name in loaded_datasets:
    dataset = loaded_datasets[dataset_name]
    if isinstance(dataset, dict):
        # If the loaded dataset is a dictionary containing splits (e.g., train, test, validation)
        for split_name in dataset:
            dataset[split_name] = preprocess_dataset(dataset[split_name], text_field, preprocessor)
    else:
        # If the loaded dataset is just a single dataset
        loaded_datasets[dataset_name] = preprocess_dataset(dataset, text_field, preprocessor)


# Accessing the train data and validation

In [11]:
# Create a new dictionary to store only the "train" splits
train_datasets = {}

# Iterate through loaded_datasets
for dataset_name, dataset in loaded_datasets.items():
    # Check if the dataset is a dictionary containing splits
    if isinstance(dataset, dict):
        # Check if the "train" split exists in the dataset
        if "train" in dataset:
            # Add the "train" split to the train_datasets dictionary
            train_datasets[dataset_name] = dataset["train"]
            # Remove the "train" split from the original dataset
            del dataset["train"]

# Print the names of datasets with "train" splits
print("Datasets with 'train' splits:")
for dataset_name in train_datasets:
    print(dataset_name)


Datasets with 'train' splits:
simonbutt/amharic_gsm8k
EthioNLP/Amharic_LLAMA_MT
EthioNLP/Amharic_Instruction_dataset
Tvsybkzkmapab/Amharic_ad_generation
BiniyamAjaw/amharic_dataset_v2
Henok/amharic-qa


In [12]:
# Create a new dictionary to store only the "validation" splits
validation_datasets = {}

# Iterate through loaded_datasets
for dataset_name, dataset in loaded_datasets.items():
    # Check if the dataset is a dictionary containing splits
    if isinstance(dataset, dict):
        # Check if the "validation" split exists in the dataset
        if "validation" in dataset:
            # Add the "validation" split to the validation_datasets dictionary
            validation_datasets[dataset_name] = dataset["validation"]
            # Remove the "validation" split from the original dataset
            del dataset["validation"]

# Print the names of datasets with "validation" splits
print("Datasets with 'validation' splits:")
for dataset_name in validation_datasets:
    print(dataset_name)



Datasets with 'validation' splits:
simonbutt/amharic_truthful_qa
EthioNLP/Amharic_LLAMA_MT
EthioNLP/Amharic_Instruction_dataset
Tvsybkzkmapab/Amharic_ad_generation
Henok/amharic-qa


# Initializing the trainer

In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.001,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    save_steps=0,
    logging_steps=25
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_datasets,  # Pass the preprocessed training dataset here
    eval_dataset=validation_datasets,  # Pass the preprocessed validation dataset here
    
)

In [14]:
# Fine-tune the model
trainer.train()

KeyError: 0