# Step 1: Environment Setup


## 1.1 Install Required Libraries


In [None]:
! pip install transformers peft accelerate datasets bitsandbytes

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

## 1.2 Verify GPU Availability


In [None]:
import torch

# Report which CUDA device (if any) this runtime exposes before any model is loaded.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available. Please enable GPU in your environment."
)


GPU is available: NVIDIA A100-SXM4-40GB


# Step 2: Load the SILMA Model


## 2.1 Load the Model


In [None]:
import os
from getpass import getpass

from google.colab import drive
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding the token in the notebook.
# The token is taken from the HF_TOKEN environment variable; when that is not set it
# is requested interactively, so it never ends up in the saved cell source.
# NOTE(review): an access token used to be hard-coded in this cell. Treat it as leaked
# and revoke it at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Store the base weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # Use NormalFloat4 quantization
    # Compute in the same dtype the model is loaded in (see torch_dtype below).
    # Left unset, the compute dtype defaults to float32, which mismatches the
    # float16 activations and puts bitsandbytes on its slow path.
    bnb_4bit_compute_dtype=torch.float16,
)

# Model ID from Hugging Face
model_id = "silma-ai/SILMA-9B-Instruct-v1.0"

# Load the base model with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,  # Apply 4-bit quantization
    device_map="auto",  # Automatically map model to devices
    torch_dtype=torch.float16,  # dtype for the modules that are not quantized
)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)  # Enable custom code if required


print("4-bit quantized model and tokenizer loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

4-bit quantized model and tokenizer loaded successfully!


# Step 3: Prepare the Dataset


## 3.1 Load the Dataset


### Mount Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset and checkpoint paths used in
# this notebook (under "My Drive/Colab Notebooks/Verso_GP") resolve.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


### Load Data

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Tokenizer used below to measure prompt and poem lengths in tokens
tokenizer = AutoTokenizer.from_pretrained(
    "silma-ai/SILMA-9B-Instruct-v1.0",
    trust_remote_code=True,  # Enable custom code if required
)

# Read the poem records from Drive into a pandas DataFrame
file_path = "/content/drive/My Drive/Colab Notebooks/Verso_GP/data_filtered.json"
df = pd.read_json(path_or_buf=file_path)

# Function to calculate token lengths
def calculate_lengths(row, tok=None):
    """Return the combined token count of one record's prompt and poem.

    The prompt is built with the same wording and the same ``line_count`` value
    that ``preprocess_function`` uses for training, so the length measured here
    is the length of what is actually trained on.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``poem_name``, ``poem_meter``, ``line_count`` and ``poem_content``.
        tok: Optional tokenizer-like callable returning a mapping with an
            ``"input_ids"`` entry. Defaults to the notebook-level ``tokenizer``.

    Returns:
        int: number of prompt token ids plus number of poem token ids.
    """
    tok = tokenizer if tok is None else tok
    # line_count is used unchanged (only cast to int). The earlier ``// 2`` halved
    # it, so the measured prompt differed from the prompt used for training.
    input_prompt = f"اكتب قصيدة بعنوان '{row['poem_name']}' على بحر '{row['poem_meter']}' وعدد أبيات {int(row['line_count'])}."
    input_length = len(tok(input_prompt, truncation=False)["input_ids"])
    output_length = len(tok(row["poem_content"], truncation=False)["input_ids"])
    return input_length + output_length

# Calculate total token lengths for all entries
df["total_length"] = df.apply(calculate_lengths, axis=1)

# Keep entries of at most 512 tokens and drop the helper column in one chain.
# .loc[...] followed by a non-inplace .drop() returns a new DataFrame, so nothing is
# modified in place on a slice of df (the cause of the SettingWithCopyWarning).
filtered_df = df.loc[df["total_length"] <= 512].drop(columns=["total_length"])

# Save the filtered dataset
filtered_file_path = "/content/drive/My Drive/Colab Notebooks/Verso_GP/data_filtered_512.json"
filtered_df.to_json(filtered_file_path, orient="records", force_ascii=False)

print(f"Filtered dataset saved to {filtered_file_path}.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/46.9k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=["total_length"], inplace=True)  # Drop the helper column


Filtered dataset saved to /content/drive/My Drive/Colab Notebooks/Verso_GP/data_filtered_512.json.


## 3.2 Prepare the Dataset


In [None]:
from datasets import load_dataset

# Read the filtered JSON records into a Hugging Face dataset
dataset = load_dataset(
    "json",
    data_files="/content/drive/My Drive/Colab Notebooks/Verso_GP/data_filtered_512.json",
)

# Show the split layout first, then the first raw record of the "train" split
print(dataset, dataset["train"][0], sep="\n")

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['poem_name', 'poem_meter', 'poem_content', 'line_count'],
        num_rows: 880
    })
})
{'poem_name': 'مقامات أهل الدين عند بني الدنيا', 'poem_meter': 'طويل', 'poem_content': 'مقاماتُ أهلِ الدين عند بني الدُّنيا\nمقامات معنيين بالرّتبة العليا\nرأوا ربّهم قبل الَّذين يرونهم\nفمالوا مع لاإِثبات واطّرحوا النّفيا\nولم يطلبوا إلا رضا الله وحدَه\nولو أسخطوا هذا الأنام بلا ثنيا\nومن كان بالتّحقيق للحق قائلا\nتُقُبِّل منه ما يقول ولو أعيا\nفكم مالك باغ أصاخ لواعظ\nبحق فلم يَسطِع على وعظه بغيا\nنواصي ملوك الأرض في يد مالك\nيصرّف في مخلوقه الامر والنّهيا\nفلا تُرض مَخلُوقاً باسخاطِ خالقٍ\nفربّك أولى من يُخَافُ ويُستَحيَا\nومن ىثر المولى على كلٍّ حالة\nرأى آمليه في الممات وفي المحيا\nفيا من يريد الله حقاً بقوله\nتلبث فقد غلّبت دنيا على دنيا\nإذا ما جعلت الزيّ برّك والتّقى\nفإنّك يوم الحشر أكرمهم زيّا', 'line_count': 10}


In [None]:
from datasets import DatasetDict

def preprocess_function(examples, max_length=512, tok=None):
    """Turn a batch of poem records into causal-LM training features.

    Each example becomes ONE sequence, ``prompt + poem + EOS``, so the model sees
    the prompt as context and learns to continue it with the poem. Tokenizing the
    prompt and the poem into separate padded sequences (prompt as ``input_ids``,
    poem as ``labels``) leaves the two unrelated position by position, which is
    not a valid causal language-modeling target.

    Args:
        examples: Batch mapping with list-valued columns ``poem_name``,
            ``poem_meter``, ``line_count`` and ``poem_content``.
        max_length: Fixed length every sequence is truncated or right-padded to.
        tok: Optional tokenizer-like object: callable on a list of strings
            (returning a mapping with ``"input_ids"``) and exposing
            ``pad_token_id`` and ``eos_token_id``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list of
        ``max_length``-long lists. ``labels`` is -100 on prompt and padding
        positions so that only poem tokens (and EOS) contribute to the loss.
    """
    tok = tokenizer if tok is None else tok

    # Construct input prompts for all examples in the batch
    input_prompts = [
        f"اكتب قصيدة بعنوان '{name}' على بحر '{meter}' وعدد أبيات {lines}."
        for name, meter, lines in zip(examples["poem_name"], examples["poem_meter"], examples["line_count"])
    ]
    output_poems = examples["poem_content"]

    # Tokenize without padding. The prompt keeps whatever special tokens the
    # tokenizer adds; the poem gets none because it continues the same sequence.
    prompt_ids_batch = tok(input_prompts, add_special_tokens=True)["input_ids"]
    poem_ids_batch = tok(output_poems, add_special_tokens=False)["input_ids"]

    eos_id = tok.eos_token_id
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else eos_id

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, poem_ids in zip(prompt_ids_batch, poem_ids_batch):
        prompt_ids = list(prompt_ids)
        # EOS marks the end of the poem so generation learns where to stop
        completion_ids = list(poem_ids) + [eos_id]

        input_ids = (prompt_ids + completion_ids)[:max_length]
        # Mask the prompt: loss is computed on the poem continuation only
        labels = ([-100] * len(prompt_ids) + completion_ids)[:max_length]

        pad_len = max_length - len(input_ids)
        features["input_ids"].append(input_ids + [pad_id] * pad_len)
        features["attention_mask"].append([1] * len(input_ids) + [0] * pad_len)
        features["labels"].append(labels + [-100] * pad_len)

    return features

# Run the preprocessing over every split; batched=True passes whole column batches
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# When no validation split is present, hold out 10% of "train" (fixed seed)
if "validation" not in tokenized_dataset.keys():
    split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
    # train_test_split names the held-out part "test"; expose it as "validation"
    tokenized_dataset = DatasetDict(
        {"train": split_dataset["train"], "validation": split_dataset["test"]}
    )
    print("Dataset split into training and validation sets.")

# Show the resulting split layout followed by the first tokenized training record
print(tokenized_dataset, tokenized_dataset["train"][0], sep="\n")


Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Dataset split into training and validation sets.
DatasetDict({
    train: Dataset({
        features: ['poem_name', 'poem_meter', 'poem_content', 'line_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 792
    })
    validation: Dataset({
        features: ['poem_name', 'poem_meter', 'poem_content', 'line_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 88
    })
})
{'poem_name': 'قدمت قدوم اليسر في أثر العسر', 'poem_meter': 'طويل', 'poem_content': 'قدمتَ قدومُ اليسر في أَثرِ العُسرِ\nوجئت كما جاءَ الغنى بدل الفقر\nفاهلاً به من قادمِ كانَ قربُه\nكروح أتى المكروب من حيث لا يَدري\nقربتَ فعمر الليلّ نزرُّ وإِن تغبْ\nفيا بُعدَ ما بين الغرُوب إِلى الفجر\nحكت أَلف شهرٍ ليلةً منك في النوى\nعلى انها عند اللقا ليلَة القدرِ\nوعدتَ فعادت في صدورِ قلوبِها\nفاهلاً وسهلاً بالفؤاد إِلى الصدرَ\nفحمدٌ وشكر إِن ربَّك لم يكنَ\nيكافي بغير الحمد لله والشكرَ', 'line_count': 6, 'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
import math

# Scan both splits and report any sample whose input_ids or labels contain NaN
for split_key, split_label in (("train", "training"), ("validation", "validation")):
    for idx, sample in enumerate(tokenized_dataset[split_key]):
        nan_found = any(map(math.isnan, sample["input_ids"])) or any(map(math.isnan, sample["labels"]))
        if nan_found:
            print(f"NaN detected in {split_label} sample at index {idx}: {sample}")


# Step 4: Configure LoRA


## 4.1 Configure LoRA



In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized base model for training BEFORE attaching adapters. This is
# the PEFT-documented step for k-bit (4-/8-bit) models; without it the LoRA layers
# are attached to a model that has not been set up for stable k-bit training.
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=16,  # Low-rank adaptation dimension
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to the attention query/value projections
    lora_dropout=0.1,  # Regularization
    bias="none",  # Do not fine-tune biases
    task_type="CAUSAL_LM"  # Language modeling task
)

# Apply LoRA layers to the 4-bit quantized model
model = get_peft_model(model, lora_config)

# Disable the generation KV cache while training
model.config.use_cache = False

print("LoRA applied to the 4-bit quantized model successfully!")


LoRA applied to the 4-bit quantized model successfully!


# Step 5: Fine-Tuning the Model


## 5.1 Define Training Arguments



In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoints and trainer state are written under this Drive folder
    output_dir="/content/drive/My Drive/Colab Notebooks/Verso_GP/model/lora-4bit-silma-poetry",
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step per device
    max_grad_norm=1.0,  # Gradient clipping threshold
    fp16=True,  # fp16 mixed-precision training
    # --- evaluation / checkpointing / logging, all every 50 steps ---
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,  # Cap on how many checkpoints are kept on disk
    load_best_model_at_end=True,
    logging_steps=50,
    report_to="none",  # Disable external logging
)

print("Training arguments updated for 1k dataset.")


Training arguments updated for 1k dataset.


## 5.2 Initialize the Trainer



## Small Training Example

In [None]:
from transformers import Trainer

# Smoke test: run the training loop on a handful of records to catch wiring errors.
# NOTE(review): this reuses the full-run `training_args` and the same `model` object,
# so the adapter weights are already updated when the full run starts — confirm that
# is intended.
small_train_dataset = tokenized_dataset["train"].select(range(10))  # first 10 training rows
small_eval_dataset = tokenized_dataset["validation"].select(range(5))  # first 5 validation rows

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=89.00733947753906, metrics={'train_runtime': 5.6288, 'train_samples_per_second': 8.883, 'train_steps_per_second': 0.888, 'total_flos': 511988591493120.0, 'train_loss': 89.00733947753906, 'epoch': 3.4})

## Training on the Full Dataset


In [None]:
import torch

# Ask PyTorch to release its unused cached GPU memory before the full training run
# (the small training example above ran on the same GPU).
torch.cuda.empty_cache()

In [None]:
import os
from transformers import Trainer

# Checkpoint directory: taken from the TrainingArguments instead of repeating the
# path literal, so the resume logic always looks where the Trainer actually saves.
CHECKPOINT_DIR = training_args.output_dir

# Create the directory if it doesn't exist
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Automatically detect the latest checkpoint
def get_latest_checkpoint(output_dir):
    """Return the path of the checkpoint with the highest step number.

    Only sub-directories named ``checkpoint-<step>`` are considered, and "latest"
    means the largest ``<step>``. File-system timestamps are deliberately not
    used: they change when a folder is copied or synced and then no longer
    reflect training progress.

    Args:
        output_dir: Directory the trainer writes its checkpoints into.

    Returns:
        The full path of the newest checkpoint directory, or ``None`` when
        ``output_dir`` is not a directory or holds no checkpoint.
    """
    import re  # local import: needed only by this helper

    if not os.path.isdir(output_dir):
        print(f"Checkpoint directory does not exist: {output_dir}")
        return None

    step_pattern = re.compile(r"^checkpoint-(\d+)$")
    numbered_checkpoints = []
    for entry in os.listdir(output_dir):
        match = step_pattern.match(entry)
        entry_path = os.path.join(output_dir, entry)
        # Skip stray files and folders that merely start with "checkpoint"
        if match and os.path.isdir(entry_path):
            numbered_checkpoints.append((int(match.group(1)), entry_path))

    if numbered_checkpoints:
        latest_checkpoint = max(numbered_checkpoints, key=lambda pair: pair[0])[1]
        print(f"Resuming from checkpoint: {latest_checkpoint}")
        return latest_checkpoint
    print("No checkpoint found. Starting from scratch.")
    return None

# Resume point for this run; None means no checkpoint was found
latest_checkpoint = get_latest_checkpoint(CHECKPOINT_DIR)

# Trainer over the complete train/validation splits
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,  # Tokenizer for processing
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train, continuing from the detected checkpoint when there is one
trainer.train(resume_from_checkpoint=latest_checkpoint)

Resuming from checkpoint: /content/drive/My Drive/Colab Notebooks/Verso_GP/model/lora-4bit-silma-poetry/checkpoint-200


  trainer = Trainer(
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss


TrainOutput(global_step=245, training_loss=1.0239289575693558, metrics={'train_runtime': 202.768, 'train_samples_per_second': 19.53, 'train_steps_per_second': 1.208, 'total_flos': 1.0034976393265152e+17, 'train_loss': 1.0239289575693558, 'epoch': 4.98989898989899})