<a href="https://colab.research.google.com/github/LaiTechTinker/Aisheduler/blob/main/translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the dataset file stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from datasets import load_dataset
# Location, on the mounted Drive, of the JSON conversation file that is loaded in the next cell.
file_path="/content/drive/My Drive/language_classifier/call_center_dataset_pure_yoruba.json"
# Alternative data source (currently disabled): a Hub-hosted dataset that needs a prior
# login, e.g. `huggingface-cli login`.
# ds = load_dataset("michsethowusu/english-yoruba_sentence-pairs_mt560")

In [3]:
# Read the local JSON file through the generic "json" loader so it becomes a
# Hugging Face dataset object (the printed result below shows a single `train` split).
dataset=load_dataset("json", data_files=file_path)
# Print the splits, feature names and row counts as a quick sanity check.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 25000
    })
})


In [4]:
# Flatten each multi-turn conversation into one speaker-labelled string.
def flatten_conversation(example):
    """Collapse a record's ``messages`` list into a single newline-joined transcript.

    Args:
        example: Mapping with a ``"messages"`` entry; each message is a mapping
            that provides ``"role"`` and ``"content"``.

    Returns:
        ``{"text": transcript}`` where every turn occupies one line. Turns whose
        role is ``"user"`` get the ``Oníbàárà:`` prefix; every other role gets
        the ``Aṣojú:`` prefix.
    """
    transcript_lines = [
        f"Oníbàárà: {turn['content']}"
        if turn["role"] == "user"
        else f"Aṣojú: {turn['content']}"
        for turn in example["messages"]
    ]
    return {"text": "\n".join(transcript_lines)}


In [5]:
# Apply `flatten_conversation` to every record, producing the "text" field it returns.
new_dataset=dataset.map(flatten_conversation)

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch
!pip install -U transformers # Add this line to update transformers





In [7]:
# Install/upgrade bitsandbytes — presumably the backend needed by the 4-bit
# BitsAndBytesConfig used in the following cells (confirm against the runtime image).
!pip install -U bitsandbytes




In [8]:
# Quantization settings passed as `quantization_config` when the base model is loaded.
# NOTE(review): the compute dtype here is bfloat16 while the TrainingArguments cell sets
# fp16=True — confirm this mix is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # request 4-bit loading of the model weights
    bnb_4bit_use_double_quant=True,  # turn on the double-quantization option
    bnb_4bit_quant_type="nf4",  # select the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # dtype requested for computation

)

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token stored as a Colab secret.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

# Checkpoint to fine-tune. An earlier revision of this cell pointed at the openly
# accessible "HuggingFaceH4/zephyr-7b-beta" instead.
model_name = "NCAIR1/N-ATLaS"

# Options for loading the base model. `torch_dtype` is deliberately not passed:
# the quantization config already carries the compute dtype.
model_load_options = {
    "quantization_config": bnb_config,  # apply the 4-bit quantization settings
    "device_map": "auto",
    "token": HF_TOKEN,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# The token is passed explicitly to the tokenizer loader as well.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
    trust_remote_code=False,
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Add up the element count of every parameter tensor exposed by the model.
# NOTE(review): with 4-bit quantized weights this figure may reflect packed storage
# rather than the logical parameter count — confirm before quoting it.
total_parameters = 0
for parameter in model.parameters():
    total_parameters += parameter.numel()
print(f"total number of parameters is {total_parameters}")

In [None]:
# print(ds)

In [None]:
# def tokenize(batch):
#     texts = [
#         f"Translate English to Yoruba:\nEnglish: {eng}\nYoruba: {yor}"
#         for eng, yor in zip(batch['eng'], batch['yor'])
#     ]

#     tokens = tokenizer(
#         texts,
#         padding="max_length",
#         truncation=True,
#         max_length=256,
#         return_tensors="pt"
#     )

#     tokens["labels"] = tokens["input_ids"].clone()
#     return tokens


In [None]:
# Every example is truncated/padded to exactly this many tokens.
MAX_SEQ_LENGTH = 256

# Padding to a fixed length requires a pad token; fall back to EOS when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_with_labels(batch):
    """Tokenize a batch of flattened conversations and build causal-LM labels.

    Args:
        batch: Batched mapping with a ``"text"`` list, as produced by
            ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with the tokenizer outputs plus ``"labels"``: a copy of
        ``input_ids`` in which every padding position (attention mask 0) is
        replaced by -100 so that padding does not contribute to the loss.
    """
    # Tokenize once; the same ids serve as both inputs and labels.
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        return_attention_mask=True,
    )
    # Mask via the attention mask rather than by comparing with pad_token_id, so
    # genuine EOS tokens stay supervised even when pad and EOS share an id.
    labels = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return {**encoded, "labels": labels}


tokenized_data = new_dataset.map(
    tokenize_with_labels,
    batched=True,
    remove_columns=["text", "messages"],
)

In [None]:
# Show the structure of the tokenized dataset to confirm the expected columns are present.
print(tokenized_data)

In [None]:
# Spot-check the token ids of the first training example.
print(tokenized_data['train'][0]["input_ids"])

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run, collected in one place before being
# handed to TrainingArguments.
training_options = {
    "output_dir": './yoruba-finetuned',
    # 1 sequence per step x 16 accumulation steps = 16 sequences per optimizer update (per device).
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-3,
    "num_train_epochs": 3,
    "fp16": True,
    "logging_steps": 20,
    "save_strategy": 'epoch',
    "report_to": 'none',
    # Keep all dataset columns and name the label column explicitly.
    "remove_unused_columns": False,
    "label_names": ["labels"],
}

training_args = TrainingArguments(**training_options)

In [None]:
from peft import prepare_model_for_kbit_training

# LoRA adapter settings: low-rank update matrices on the attention query/value projections.
lora_config = LoraConfig(
    r=8,  # rank of the update matrices
    lora_alpha=32,  # scaling factor for LoRA
    target_modules=['q_proj', 'v_proj'],  # modules that receive adapters
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.CAUSAL_LM,  # causal language modeling
)

# A model loaded in 4-bit should go through PEFT's k-bit preparation step before
# adapters are attached (see the PEFT documentation for what it adjusts; by
# default it also enables gradient checkpointing).
model = prepare_model_for_kbit_training(model)

# NOTE: `model` is rebound to the adapter-wrapped model, so re-running this cell
# would pass the already-wrapped model back in; reload the base model first.
model = get_peft_model(model, lora_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],  # only the 'train' split exists
    processing_class=tokenizer,
)

In [None]:
# Start fine-tuning with the model, arguments and dataset configured on `trainer`.
trainer.train()