# Data Preparation

In [None]:
import pandas as pd

## Testing data
- Original Competition Data

In [None]:
# Raw-file URL of the competition spreadsheet hosted on GitHub.
data_url = "https://raw.githubusercontent.com/MuhammadHelmyOmar/ArabicPIIRedaction/main/data/data_final.xlsx"

# Read the Excel workbook straight from the URL into a DataFrame.
test = pd.read_excel(data_url)

# Preview the first rows.
test.head()

## Training data

In [None]:
from google.colab import drive
import random

### Loading Data

In [None]:
drive.mount('/content/drive')

In [None]:
# All Data
# Main training CSV, read from the Google Drive mount under /content/drive.

data_path = "/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/ALL_DATA.csv"

train = pd.read_csv(data_path)

In [None]:
print(len(train))
train.head()

In [None]:
print(train['dialect'].unique())
print(train['dialect'].value_counts())

In [None]:
# Rows whose dialect is the string '0' are relabelled as 'eg'.
train.loc[train['dialect']=='0', 'dialect'] = 'eg'
# Re-print the per-dialect counts after the relabelling.
print(train['dialect'].value_counts())



---



In [None]:
# Data augmented with Arabic names and locations
# Loaded from the same Google Drive data folder as the main training CSV.

names_loc_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/names_locations_augmented_data.csv")

# Row count, then a preview of the first rows.
print(len(names_loc_data))
names_loc_data.head()

In [None]:
# Append the augmented rows to the main training frame; ignore_index=True
# renumbers the combined rows from 0.
train = pd.concat([train, names_loc_data], ignore_index=True)
print(len(train))
# Per-dialect counts of the combined frame.
train['dialect'].value_counts()

In [None]:
# Shuffle the data
# A fixed random_state makes the row order (and everything derived from it)
# identical across Restart & Run All; 101 is the seed already used by the
# LoRA and trainer configuration later in this notebook.

train = train.sample(frac=1, random_state=101).reset_index(drop=True)

### Cleaning and preprocessing

In [None]:
import random
import re
from IPython.display import display, HTML
import re
import ast

In [None]:
# Pick a random row to inspect. random.randint is inclusive on both ends, so
# randint(0, len(train)) can return len(train), which is one past the last row
# and makes the lookups in the following cells fail. randrange(len(train))
# only yields valid positions 0 .. len(train) - 1.
idx = random.randrange(len(train))
idx

In [None]:
print(train.tokens[idx])
print(type(train.tokens[idx]))
print(train.tags[idx])
print(type(train.tags[idx]))
print(len(train.tokens[idx]))
print(len(train.tags[idx]))

In [None]:
# Parse the sampled row's tokens value with ast.literal_eval and inspect the result.
tokens_list = ast.literal_eval(train.tokens[idx])
print(tokens_list)
print(type(tokens_list), len(tokens_list))

# Same check for the sampled row's tags value.
tags_list = ast.literal_eval(train.tags[idx])
print(tags_list)
print(type(tags_list), len(tags_list))

In [None]:
# Convert tokens and tags columns to lists
# Every cell of both columns is parsed with ast.literal_eval, replacing the columns in place.
# NOTE: this cell is not idempotent — running it a second time feeds already
# parsed values back into ast.literal_eval.

train['tokens'] = train["tokens"].apply(ast.literal_eval)
train['tags'] = train['tags'].apply(ast.literal_eval)

In [None]:
print(train.tokens[idx])
print(type(train.tokens[idx]))
print(train.tags[idx])
print(type(train.tags[idx]))
print(len(train.tokens[idx]))
print(len(train.tags[idx]))

In [None]:
# Checking if there is mismatching lengths between tokens and tags
# Each row's own token list is compared with its own tag list. Measuring the
# whole columns (len(train.tokens) vs len(train.tags)) is always equal, so that
# form reports zero mismatches regardless of the data.

length_comparison_result = train['tokens'].map(len) == train['tags'].map(len)
print(f"\nNumber of rows with mismatching lengths: {sum(~length_comparison_result)}")

In [None]:
# NOTE: the masking pass is intentionally not run in this cell. `tags_to_mask`
# and `masker` are only defined in the cells below, so calling them here raises
# NameError on a fresh kernel (Restart & Run All). The identical pass
# (discrepancy_data -> train['masked_source'] -> discrepancy_df) is executed in
# the cell that follows the `masker` definition.

In [None]:
# Entity types that masker() replaces with '[MASK]'. masker() tests membership
# using each tag with its first two characters stripped (tag[2:]).
# NOTE(review): 'CREDITCARDINUMBER' is listed in addition to 'CREDITCARDNUMBER'
# and looks like a misspelling — confirm whether that label really occurs in the data.
tags_to_mask = ['PHONEIMEI',
 'VEHICLEVRM',
 'LITECOINADDRESS',
 'CREDITCARDNUMBER',
 'DATE',
 'NEARBYGPSCOORDINATE',
 'BITCOINADDRESS',
 'GENDER',
 'PERSONNAME',
 'JOBTITLE',
 'TIME',
 'CURRENCY',
 'BIC',
 'MASKEDNUMBER',
 'STREET',
 'MAC',
 'DOB',
 'SECONDARYADDRESS',
 'CREDITCARDISSUER',
 'ZIPCODE',
 'USERAGENT',
 'CURRENCYSYMBOL',
 'JOBTYPE',
 'BUILDINGNUMBER',
 'AGE',
 'MIDDLENAME',
 'CREDITCARDINUMBER',
 'ACCOUNTNUMBER',
 'PIN',
 'FIRSTNAME',
 'ORDINALDIRECTION',
 'PASSWORD',
 'PHONENUMBER',
 'IPV4',
 'CREDITCARDCVV',
 'USERNAME',
 'HEIGHT',
 'CURRENCYCODE',
 'ACCOUNTNAME',
 'IBAN',
 'AMOUNT',
 'PREFIX',
 'VEHICLEVIN',
 'SEX',
 'EMAIL',
 'ETHEREUMADDRESS',
 'IPV6',
 'SSN',
 'URL',
 'LASTNAME',
 'CURRENCYNAME',
 'IP']

In [None]:
def masker(row, tags_to_mask, discrepancy_list):
    """
    Rebuild a row's sentence with every token of a maskable entity type replaced by '[MASK]'.

    Tokens are located in the original sentence one after another, left to right,
    so the text found between them is copied over unchanged.

    Args:
    row: Row (or mapping) exposing 'tokens', 'tags' and 'clean_source'.
    tags_to_mask: Entity types to mask; each tag is compared with its first two characters removed.
    discrepancy_list: List that receives one dict per token that could not be located.

    Returns:
    The rebuilt sentence. A token that cannot be located at or after the current
    position triggers a printed warning, is recorded in discrepancy_list and is
    emitted unmasked.
    """
    token_seq = row['tokens']
    tag_seq = row['tags']
    source_text = row['clean_source']

    pieces = []   # fragments of the output sentence, joined once at the end
    cursor = 0    # index in source_text just past the last token that was located

    for position, token in enumerate(token_seq):
        tag = tag_seq[position]

        # Search only from the cursor onwards so repeated tokens map to successive occurrences.
        found_at = source_text.find(token, cursor)

        if found_at == -1:
            # The token does not occur at or after the cursor: report and record it.
            print(f"Warning: Token '{token}' not found in clean_source at or after position {cursor}\n")
            discrepancy_list.append({
                'clean_source': source_text,
                'tokens': token_seq,
                'tags': tag_seq,
                'discrepancy_token': token,
                'discrepancy_position': cursor
            })
            # Emit the token as-is and leave the cursor where it was.
            pieces.append(token)
            continue

        # Copy the untouched text that sits between the previous token and this one.
        pieces.append(source_text[cursor:found_at])
        pieces.append('[MASK]' if tag[2:] in tags_to_mask else token)
        cursor = found_at + len(token)

    # Whatever follows the last located token is kept verbatim.
    pieces.append(source_text[cursor:])
    return ''.join(pieces)

In [None]:
# Initialize a list to store discrepancy information
discrepancy_data = []

# Creating a new masked sentence
train['masked_source'] = train.apply(lambda row: masker(row, tags_to_mask, discrepancy_data), axis=1)

# Convert the list of discrepancy data into a DataFrame.
# The columns are given explicitly so the frame still exposes `clean_source`
# (read by the following cells) when no discrepancy was recorded; a DataFrame
# built from an empty list alone has no columns and those lookups would fail.
discrepancy_df = pd.DataFrame(
    discrepancy_data,
    columns=['clean_source', 'tokens', 'tags', 'discrepancy_token', 'discrepancy_position'],
)

In [None]:
print(f"{len(discrepancy_df)} discrepancies are found")

In [None]:
print(f"{len(discrepancy_df.clean_source.unique())} sentences are detected as discrepancies.")

In [None]:
# Drop discrepancies
# Every row whose clean_source appears in discrepancy_df is removed, i.e. the
# filter matches on sentence text rather than on row index.

print(f"Original number of rows: {len(train)}")

train = train[~train['clean_source'].isin(discrepancy_df['clean_source'])]

print(f"Number of rows after dropping discrepancies: {len(train)}")

In [None]:
train[['clean_source','masked_source']]

In [None]:
# Keep only the sentence pair and rename it to source/target, the column names
# the fine-tuning section reads.
train_final = train[['clean_source', 'masked_source']].copy()
train_final.rename(columns={'clean_source': 'source', 'masked_source': 'target'}, inplace = True)

# train_final.head()

# NOTE(review): the next cell reads masked_train_data.csv, which is only written
# by the commented-out call below — on a fresh run that file must already exist.
# # Save the train data after the updates
# train_final.to_csv('/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv', index=False)

In [None]:
train_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv")

In [None]:
def merge_consecutive_masks(text):
    """Collapse each run of two or more '[MASK]' markers into a single '[MASK]'.

    Markers within a run may be separated by whitespace. The whitespace that
    follows the last marker of the run (possibly none) is kept after the merged marker.
    """
    repeated_masks = re.compile(r'(\[MASK\](\s*)){2,}')
    return repeated_masks.sub(r'[MASK]\2', text)

def display_ar_eng(text):
    """Render mixed Arabic/English text right-to-left in the notebook output.

    Args:
        text: The text to show. It is converted to str and HTML-escaped, so
            characters such as '<', '>' and '&' (e.g. inside URLs) are shown
            literally instead of being interpreted as markup.
    """
    import html  # function-scope import keeps this cell self-contained

    # The style attribute is delimited by double quotes, so the font name inside
    # it must use single quotes: a nested double quote ends the attribute early
    # and the font-family declaration is lost.
    style = "font-size:18px; line-height:1.8; font-family: 'Arial', sans-serif;"
    display(HTML(f'<div dir="rtl" style="{style}">{html.escape(str(text))}</div>'))

In [None]:
# Collapse runs of adjacent [MASK] markers in every target sentence.
train_data['target'] = train_data['target'].apply(merge_consecutive_masks)

# Show the first source/target pair to eyeball the result.
display_ar_eng(train_data['source'][0])
display_ar_eng(train_data['target'][0])

In [None]:
# Save the train data after the updates
# train_data.to_csv('/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv', index=False)

# Fine-tuning

In [2]:
import torch

In [3]:
# Check versions
!python --version
!nvidia-smi
print("Torch Version:", torch.__version__)

Python 3.11.11
Thu Jun 19 14:18:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             13W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                  

In [4]:
print(f"Using {torch.cuda.device_count()} GPUs")  # Should print 2 on T4 x2

Using 2 GPUs


## Loading Data

In [5]:
import pandas as pd
# from google.colab import drive

In [6]:
# drive.mount('/content/drive')

In [7]:
# train_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv")

In [8]:
# Location of the prepared training CSV inside the Kaggle input directory
input_path = "/kaggle/input/arabic-pii/masked_train_data.csv"

# Read the masked training data
train_data = pd.read_csv(input_path)

# Display the first few rows
train_data.head()


Unnamed: 0,source,target
0,يرجى تذكر إن رحلتنا التعليمية في الهواء الطلق ...,يرجى تذكر إن رحلتنا التعليمية في الهواء الطلق ...
1,عزيزي جادون، إن بحثك حول دول المراقبة وانتهاكا...,عزيزي [MASK]، إن بحثك حول دول المراقبة وانتهاك...
2,رح يتم تحديث شبكة الـ WiFi الخاصة بمختبر الأبح...,رح يتم تحديث شبكة الـ WiFi الخاصة بمختبر الأبح...
3,يقدم العلاج المهني لآلام الكتف في ولاية بنسلفا...,يقدم العلاج المهني لآلام الكتف في ولاية بنسلفا...
4,في 29067 جزر راث، شيد مؤخرا مركز جديد للتربية ...,في [MASK]، شيد مؤخرا مركز جديد للتربية الجنسية...


## Formatting data

In [9]:
from datasets import Dataset

In [10]:
# Instruction template for the training text: `{input}` and `{output}` are the
# placeholders filled through prompt.format(input=..., output=...).
prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Mask the PII words in the given source sentence as "[MASK]".

### Input:
{input}

### Response:
{output}"""

In [11]:
# Build one training string per row by filling the template with the row's
# source sentence and its masked target.
# NOTE(review): no end-of-sequence token is appended after the response here —
# confirm whether the tokenizer/trainer adds one; otherwise the training text
# never marks where a response ends.
train_data['text'] = train_data.apply(
    lambda row: prompt.format(input=row['source'], output=row['target']),
    axis=1
)

# Preview the inputs next to the generated text column.
display(train_data[['source', 'target', 'text']].head())

Unnamed: 0,source,target,text
0,يرجى تذكر إن رحلتنا التعليمية في الهواء الطلق ...,يرجى تذكر إن رحلتنا التعليمية في الهواء الطلق ...,Below is an instruction that describes a task....
1,عزيزي جادون، إن بحثك حول دول المراقبة وانتهاكا...,عزيزي [MASK]، إن بحثك حول دول المراقبة وانتهاك...,Below is an instruction that describes a task....
2,رح يتم تحديث شبكة الـ WiFi الخاصة بمختبر الأبح...,رح يتم تحديث شبكة الـ WiFi الخاصة بمختبر الأبح...,Below is an instruction that describes a task....
3,يقدم العلاج المهني لآلام الكتف في ولاية بنسلفا...,يقدم العلاج المهني لآلام الكتف في ولاية بنسلفا...,Below is an instruction that describes a task....
4,في 29067 جزر راث، شيد مؤخرا مركز جديد للتربية ...,في [MASK]، شيد مؤخرا مركز جديد للتربية الجنسية...,Below is an instruction that describes a task....


## Loading model

In [12]:
# ✅ 1. Upgrade pip, then install Unsloth with the `cu124-torch260` extra from its GitHub repository
# NOTE(review): the install targets the repository itself with -U and names no tag
# or commit, so the resulting Unsloth version is not pinned.
!pip install --upgrade pip
!pip install -qU "unsloth[cu124-torch260] @ git+https://github.com/unslothai/unsloth.git"

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m124.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [

In [13]:
from unsloth import FastLanguageModel, FastModel
# import os
from kaggle_secrets import UserSecretsClient
import wandb

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-19 14:20:20.503630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750342820.861506      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750342820.961634      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [14]:
model_name = "Qwen3-8B-Base"

In [15]:
# Fetch the Weights & Biases API key from Kaggle secrets (secret label "wandb")
# instead of writing it into the notebook.
user_secrets = UserSecretsClient()
wb_token = user_secrets.get_secret("wandb")

# Log in and start a run; the project name embeds the model being fine-tuned.
wandb.login(key=wb_token)
run = wandb.init(
    project=f'Fine-tuning {model_name} for Arabic PII Redaction', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmuhammadhelmy[0m ([33mchezlong[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
# Load the "unsloth/<model_name>" checkpoint and its tokenizer through Unsloth.
# NOTE(review): dtype is fixed to float16 here, while the trainer arguments pick
# fp16/bf16 from torch.cuda.is_bf16_supported() — confirm the two agree on
# bf16-capable GPUs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/" + model_name,
    max_seq_length = 2048,
    load_in_4bit = True, # True: QLoRA Optimization. False: LoRA Optimization
    dtype = torch.float16,
    full_finetuning = False # False: will use LoRA
)

==((====))==  Unsloth 2025.6.2: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.75G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [17]:
# LoRA Optimization
# Wrap the loaded model with LoRA adapters; the flags below enable the language,
# attention and MLP modules and leave the vision layers out.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    
    use_4bit_quants = True, # True: QLoRA
    # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0.05,
    bias = "none",
    random_state = 101,  # same seed value as the trainer's `seed`
    use_gradient_checkpointing = "unsloth" # reduce memory usage by an extra 30% and support extremely long context finetunes
)

Unsloth: Making `model.base_model.model.model` require gradients


In [18]:
# ✅ Enable PyTorch gradient checkpointing (good practice for optimizing the memory usage of deep learning models during training)
# model.gradient_checkpointing_enable()

In [19]:
# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(train_data)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field of `examples`, truncating to at most 2048 tokens.

    Called through dataset.map(..., batched=True) below. No padding is applied
    here (the padding option is left commented out).
    """
    return tokenizer(
        examples["text"],
        # padding = "max_length",       # Prevent dimension mismatch
        truncation = True,
        max_length = 2048,  # same value as max_seq_length used when loading the model
        # return_tensors = "pt"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch training
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/376267 [00:00<?, ? examples/s]

## Training the model

In [20]:
from trl import SFTTrainer
from transformers import TrainingArguments

In [21]:
# Configure supervised fine-tuning on the tokenized dataset.
trainer = SFTTrainer(
    model = model,
    train_dataset = tokenized_dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    args = TrainingArguments(
        output_dir = "outputs",                # where to save checkpoints
        
        save_strategy = "steps",               # save every N steps
        save_steps = 50,                       # adjust as needed
        save_total_limit = 4,                  # keep at most 4 checkpoints
        logging_first_step = True,
        
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 16,
        warmup_steps = 20,
        max_steps = 300, # adjust this based on your needs
        learning_rate = 2e-4,
        lr_scheduler_type = "cosine",
        
        # Exactly one of fp16 / bf16 is enabled, depending on GPU bf16 support.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        dataloader_num_workers = 2,
        logging_steps = 5,
        seed = 101,
    ),
)

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

In [22]:
torch.cuda.empty_cache() # Clear GPU Memory Before Training

In [23]:
# os.environ["WANDB_MODE"] = "disabled"
# Run fine-tuning; the returned value is kept so the next cell can display it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 376,267 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32
 "-____-"     Trainable parameters = 21,823,488/8,000,000,000 (0.27% trained)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0111
5,2.0654
10,1.9173
15,1.7155
20,1.4718
25,1.3439
30,1.2746
35,1.2805
40,1.2563
45,1.2438


In [24]:
trainer_stats

TrainOutput(global_step=300, training_loss=1.1567873199780783, metrics={'train_runtime': 9979.7403, 'train_samples_per_second': 0.962, 'train_steps_per_second': 0.03, 'total_flos': 7.74953879026729e+16, 'train_loss': 1.1567873199780783})

In [25]:
# Gemma
# trainer.model.save_pretrained("outputs/gemma_3_4b_pt_model_qlora")
# tokenizer.save_pretrained("outputs/gemma_3_4b_pt_model_qlora")

# Qwen
# Save the trainer's model and the tokenizer into the same
# outputs/<model_name>_model directory.
trainer.model.save_pretrained(f"outputs/{model_name}_model")
tokenizer.save_pretrained(f"outputs/{model_name}_model")

('outputs/Qwen3-8B-Base_model/tokenizer_config.json',
 'outputs/Qwen3-8B-Base_model/special_tokens_map.json',
 'outputs/Qwen3-8B-Base_model/vocab.json',
 'outputs/Qwen3-8B-Base_model/merges.txt',
 'outputs/Qwen3-8B-Base_model/added_tokens.json',
 'outputs/Qwen3-8B-Base_model/tokenizer.json')

# Next

# Gemi