In [None]:
!pip install -U kagglehub
!pip install -q 'transformers==4.47.1'
!pip install  accelerate datasets peft trl bitsandbytes --quiet

In [None]:
%%capture
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

**Darija** combines elements from Arabic, Berber, French, and Spanish, making it distinct from Modern Standard Arabic (MSA). As the primary language for informal communication, it is widely used in daily life, media, and social interactions. This project aims to fine-tune **Gemma 2** specifically for tasks involving **Moroccan Darija**, addressing its unique linguistic characteristics and regional variations.

# **Loading Model**

In [1]:
from unsloth import FastLanguageModel
import torch

# Loading options, kept as notebook-level names (`max_seq_length` is reused
# when the trainer is built).
modelName = "/kaggle/input/gemma-2/transformers/gemma-2-9b-it/2/"
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

# Collect the keyword arguments first so the call itself stays short.
load_options = {
    "model_name": modelName,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
# NOTE: `model` is rebound to the wrapped model, so this cell is meant to run
# once per freshly loaded base model.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=attention_projections + mlp_projections,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2025.1.5 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


# **Loading Dataset**

In [6]:
# Instruction prompts, keyed by the task type they are looked up with.
#
# Several prompts span more than one line; their continuation lines begin with
# four spaces. That whitespace is part of the prompt text and is written out
# explicitly here ("\n    ") so it stays visible and unchanged.
#
# NOTE(review): 'name_darija_to_arab' and 'darija_to_arab' carry identical text
#   ("Convert names ...") - confirm the second one is meant to mention names.
# NOTE(review): 'conj_past' lists its pronouns differently from 'conj_present'
#   (no "ana"; "hia"/"ntoma" vs "hiya"/"ntouma") - confirm this is intended.
# NOTE(review): 'darija_arabic_to_darija' says "/n/n"; possibly a newline pair
#   was meant - verify against the dataset before changing it.
_KEEP_ORDER = "\n    Maintain the structure and order of the words as in the input."

instruction = {
    "determination": (
        "Transform the given input text in Darija from its indefinite forms "
        "to their corresponding definite forms in Darija."
        + _KEEP_ORDER
    ),
    "conj_past": (
        "Conjugate the given verb in Darija into its past tense for all "
        "pronouns (nta,nti,howa,hia,7na,ntoma,homa)."
    ),
    "conj_present": (
        "Conjugate the given verb in Darija into its present tense for all "
        "pronouns (ana, nta, nti, howa, hiya, 7na, ntouma, homa)."
    ),
    "imperative": (
        "Generate the imperative conjugations of the given verb in Darija "
        "for specified pronouns (nta, nti, ntouma)."
    ),
    "pluralization": "Pluralize the given nouns in Darija." + _KEEP_ORDER,
    "nominalization": (
        "Perform nominalization on the given verbs in Darija. "
        "Convert the verbs into their corresponding noun forms."
        + _KEEP_ORDER
    ),
    "name_darija_to_arab": "Convert names from Darija to Arabic." + _KEEP_ORDER,
    "darija_to_arab": "Convert names from Darija to Arabic." + _KEEP_ORDER,
    "darija_arabic_to_arabic": (
        "Translate words from Darija Arabic to Arabic." + _KEEP_ORDER
    ),
    "darija_arabic_to_darija": (
        "Translate words from Darija Arabic to Darija."
        "\n    The input consists of words in Darija Arabic, separated by /n/n for each word."
        "\n    Provide multiple possible ways a word can be written in Darija, if applicable."
        + _KEEP_ORDER
    ),
    "alpaca": "Perform question answering and provide the output in Arabic",
    "textgen": "Complete the text by generating a continuation for the given input.",
}

In [4]:
import kagglehub

# modified darija-eng-arabic-linguistic_dataset
# The value returned by the download call is kept in `path`.
task_dataset_handle = "aminemontasir/moroccan-arabic-darija-task-dataset"
path = kagglehub.dataset_download(task_dataset_handle)

In [5]:
from datasets import load_dataset

# Load the Arabic Alpaca-GPT4 dataset and display its splits.
alpaca_repo = 'FreedomIntelligence/alpaca-gpt4-arabic'
ds = load_dataset(alpaca_repo)
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 49969
    })
})

In [6]:
def preprocess(batch):
    """Turn one conversation record into instruction / input / output fields.

    Despite the parameter name, this receives a single example (the map call
    below is not batched): the first conversation turn becomes the input, the
    second the output, and the fixed 'alpaca' instruction is attached.
    The record is modified in place and returned.
    """
    first_turn, second_turn = batch['conversations'][0], batch['conversations'][1]
    batch['instruction'] = instruction['alpaca']
    batch['input'] = first_turn['value']
    batch['output'] = second_turn['value']
    return batch


# Map every training record, drop the source columns, then keep a shuffled
# 5000-row subset (47 is passed as the first positional argument to shuffle).
alpaca_mapped = ds['train'].map(preprocess, remove_columns=['conversations', 'id'])
ds1 = alpaca_mapped.shuffle(47).select(range(5000))
ds1

Map:   0%|          | 0/49969 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5000
})

In [7]:
import os

# Read the task CSV from the directory returned by `kagglehub.dataset_download`
# (stored in `path` by the download cell) instead of a hard-coded /kaggle/input
# location, so the cell keeps working wherever kagglehub placed the files.
ds = load_dataset("csv", data_files=os.path.join(path, "darija_tasks.csv"))
ds

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets', 'types'],
        num_rows: 24267
    })
})

In [8]:
def preprocess(batch):
    """Attach the task-specific instruction and rename the text columns.

    Receives a single example (the map call below is not batched). The value
    of 'types' selects the prompt from `instruction`; 'inputs' and 'targets'
    are copied to 'input' and 'output'. The record is modified in place and
    returned.

    NOTE: this definition replaces the earlier `preprocess` of the same name.
    """
    batch['instruction'] = instruction[batch['types']]
    for new_name, old_name in (('input', 'inputs'), ('output', 'targets')):
        batch[new_name] = batch[old_name]
    return batch


# Map every row and drop the original column names.
source_columns = ['inputs', 'targets', 'types']
ds2 = ds['train'].map(preprocess, remove_columns=source_columns)
ds2

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 24267
})

In [9]:
# Load the Darija Wikipedia text dataset and display its splits.
wikipedia_repo = 'AbderrahmanSkiredj1/moroccan_darija_wikipedia_dataset'
ds = load_dataset(wikipedia_repo)
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4862
    })
})

In [10]:
def transform_text(batch):
    """Split each text into a 10-word prompt and the remaining continuation.

    Works on a batch (a dict of lists): every entry of batch['text'] is split
    on whitespace; the first ten words become the input and the rest the
    output, paired with the 'textgen' instruction.

    NOTE: texts with ten words or fewer end up with an empty output string.
    """
    word_lists = [text.split() for text in batch['text']]
    return {
        'instruction': [instruction['textgen'] for _ in word_lists],
        'input': [" ".join(words[:10]) for words in word_lists],
        'output': [" ".join(words[10:]) for words in word_lists],
    }


ds3 = ds['train'].map(transform_text, batched=True, remove_columns=['text'])
ds3

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 4862
})

In [11]:
# Print the three prepared datasets side by side to check that they expose the
# same columns before they are concatenated.
print(ds1,ds2,ds3)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5000
}) Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 24267
}) Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 4862
})


In [12]:
from datasets import concatenate_datasets

# Stack the three sources into one training set and shuffle it with a fixed seed.
dataset_parts = [ds1, ds2, ds3]
dataset = concatenate_datasets(dataset_parts).shuffle(seed=42)
dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 34129
})

In [13]:
# Training prompt template. It has three `{}` slots that are filled
# positionally with the instruction, the input and the expected response.
# Everything between the triple quotes is used verbatim (line breaks and the
# trailing space after "model" included), so edit it only deliberately.
# NOTE(review): the turn markers are followed by a space rather than a line
# break - confirm this is the chat layout the model is meant to learn.
gemma_prompt = """<start_of_turn>user Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

<end_of_turn>
<start_of_turn>model 
### Response:
{}<end_of_turn>"""
# End-of-sequence token taken from the tokenizer; it is appended to every training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of rows into full training texts.

    `examples` is a batch (dict of lists) with 'instruction', 'input' and
    'output' columns. Each row is substituted into `gemma_prompt` and
    `EOS_TOKEN` is appended. Returns a dict with a single 'text' list.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        gemma_prompt.format(task_text, source_text, target_text) + EOS_TOKEN
        for task_text, source_text, target_text in rows
    ]
    return {"text": rendered}


# Add the rendered 'text' column to the combined dataset.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/34129 [00:00<?, ? examples/s]

# **Fine-tuning**

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Precision: bf16 when the helper reports support, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for a short demonstration run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=30,  # for good result more than 4000
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # Use this for WandB etc
)

# Supervised fine-tuning on the rendered 'text' column of the combined dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/34129 [00:00<?, ? examples/s]

In [15]:
# Run the training loop configured on `trainer`; whatever `train()` returns is
# kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 34,129 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss
1,4.1154
2,3.845
3,3.5439
4,3.3339
5,2.9624
6,3.0645
7,2.5633
8,2.9599
9,2.7704
10,2.2229


In [16]:
# Local saving
# The model and the tokenizer are written to the same directory; the value of
# the last call is what the cell displays.
save_dir = "saved_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/tokenizer.model',
 'saved_model/added_tokens.json',
 'saved_model/tokenizer.json')

# **Testing**

In [1]:
import kagglehub

# Download latest version
adapter_handle = "aminemontasir/gemma_2-9b_darija/transformers/default"
path = kagglehub.model_download(adapter_handle)

# Show where the files were placed.
print("Path to model files:", path)


Path to model files: /kaggle/input/gemma_2-9b_darija/transformers/default/1


In [2]:
!pip install transformers peft



In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Tokenizer and base model are read from the same local checkpoint directory.
base_model_dir = "/kaggle/input/gemma-2/transformers/gemma-2-9b-it/2/"

tokenizer = AutoTokenizer.from_pretrained(base_model_dir)
model = AutoModelForCausalLM.from_pretrained(
    base_model_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
from peft import PeftModel

# Use the directory returned by `kagglehub.model_download` (held in `path`)
# instead of repeating it as a hard-coded /kaggle/input/... string, so the
# adapter location always matches what was actually downloaded.
# Replace with the path to your own LoRA weights if you trained a different adapter.
lora_weights_path = path
model = PeftModel.from_pretrained(model, lora_weights_path)


In [7]:
# Inference prompt: the same layout as the training template but with only two
# `{}` slots (instruction, input). The text stops after the "### Response:"
# line so that generation continues from there.
# NOTE: this re-binds `gemma_prompt`; re-running the training formatting cell
# afterwards would use this two-slot template instead of the three-slot one.
gemma_prompt = """<start_of_turn>user Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

<end_of_turn>
<start_of_turn>model 
### Response:
"""
# Build the prompt for one request using the 'alpaca' instruction.
input_text = gemma_prompt.format(instruction['alpaca'],"شرح ليا كيفاش نتجاوز سرعة الضو")


In [8]:
# Tokenize the prompt and move the resulting tensors to the GPU.
input_ids = tokenizer(input_text, return_tensors="pt").to('cuda')

# Generate up to 364 new tokens, then decode the first (only) sequence.
generation_options = {"max_new_tokens": 364, "use_cache": True}
outputs = model.generate(**input_ids, **generation_options)
decoded_text = tokenizer.decode(outputs[0])
print(decoded_text)

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


<bos><start_of_turn>user Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Perform question answering and provide the output in Arabic

### Input:
شرح ليا كيفاش نتجاوز سرعة الضو

<end_of_turn>
<start_of_turn>model 
### Response:
وفقًا للفيزياء الحديثة، من المستحيل تجاوز سرعة الضوء. في عام 1905، نشر ألبرت أينشتاين نظرية النسبية الخاصة، التي تنص على أن سرعة الضوء في الفراغ هي ثابت عالمي، وتبلغ حوالي 299،792،458 مترًا في الثانية.

تتضمن نظرية النسبية الخاصة أيضًا مفهومًا يسمى "الكتلة النسبية"، الذي يشير إلى أن كتلة الجسم تزداد مع زيادة سرعته. عندما يقترب الجسم من سرعة الضوء، تزداد كتلته بشكل لا نهائي، مما يجعل من المستحيل الوصول إلى سرعة الضوء.

بالإضافة إلى ذلك، تنص نظرية النسبية العامة، التي نشرها أينشتاين في عام 1915، على أن سرعة الضوء هي الحد الأقصى لسرعة أي شيء في الكون.

لذلك، على الرغم من أننا نستطيع الوصول إلى نسب عالية من السرعة، إلا أننا لن نتمكن أبدًا من 