# Fine-tuning

In [None]:
import torch

In [None]:
# Record the runtime environment (Python interpreter, NVIDIA driver/GPUs, PyTorch build)
# so that results can be tied to a specific setup.
!python --version
!nvidia-smi
print("Torch Version:", torch.__version__)

In [None]:
# Report how many CUDA devices PyTorch can see.
print(f"Using {torch.cuda.device_count()} GPUs")  # Should print 2 on T4 x2

## Loading Data

In [None]:
import pandas as pd
# from google.colab import drive

In [None]:
# drive.mount('/content/drive')

In [None]:
# train_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ArabicPIIRedaction/data/masked_train_data.csv")

In [None]:
# Path of the training CSV inside the attached Kaggle dataset
input_path = "/kaggle/input/arabic-pii/masked_train_data.csv"

# Load the training pairs; later cells read its `source` and `target` columns
train_data = pd.read_csv(input_path)

# Display the first few rows
train_data.head()


In [None]:
# Inspect the first training pair: the `source` text and its `target` counterpart
print(train_data.source[0])
print(train_data.target[0])

## Formatting data

In [None]:
from datasets import Dataset

In [None]:
# Training prompt with Instruction / Input / Response sections.
# `{input}` and `{output}` are filled per row with str.format in the next cell.
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Mask the PII words in the given source sentence as "[MASK]".

### Input:
{input}

### Response:
{output}"""

In [None]:
# Render one complete training prompt per row: the row's `source` fills the
# {input} slot of the template and its `target` fills the {output} slot.
train_data['prompt'] = [
    prompt_template.format(input=source_text, output=target_text)
    for source_text, target_text in zip(train_data['source'], train_data['target'])
]

display(train_data.head())

In [None]:
# Sanity-check the fully rendered prompt of the first row
print(train_data.prompt[0])

## Loading model

In [None]:
# Upgrade pip, then install Unsloth (extra: cu124-torch260) from the GitHub repository.
# NOTE(review): no tag or commit is pinned and -U upgrades on every run, so the
# installed Unsloth version can differ between runs — pin a revision for reproducibility.
!pip install --upgrade pip
!pip install -qU "unsloth[cu124-torch260] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel, FastModel
# import os
from kaggle_secrets import UserSecretsClient
import wandb

In [None]:
# Base checkpoint name; reused below for the "unsloth/<name>" repo id,
# the W&B project name and the Hub model id of the fine-tuned adapter.
model_name = "Qwen3-8B"

In [None]:
# Read the Weights & Biases API key from Kaggle secrets (secret label: "wandb")
user_secrets = UserSecretsClient()
wb_token = user_secrets.get_secret("wandb")

# Authenticate, then open a W&B run that the training job will log to
wandb.login(key=wb_token)
run = wandb.init(
    project=f'Fine-tuning {model_name} for Arabic PII Redaction', 
    job_type="training", 
    anonymous="allow"
)

In [None]:
# Load the base checkpoint and its tokenizer through Unsloth.
# NOTE(review): dtype is fixed to float16 here, while the trainer below enables bf16
# whenever the GPU supports it — confirm the combination on bf16-capable hardware.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/" + model_name,
    max_seq_length = 2048,
    load_in_4bit = True, # True: QLoRA Optimization. False: LoRA Optimization
    dtype = torch.float16,
    full_finetuning = False # False: will use LoRA
)

In [None]:
# LoRA Optimization: wrap the loaded model with LoRA adapters.
# Language layers (attention + MLP modules) are fine-tuned; vision layers are not.
# NOTE(review): `use_4bit_quants` is passed through as-is — confirm it is a
# recognised get_peft_model option in the installed Unsloth version.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    
    use_4bit_quants = True, # True: QLoRA
    # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0.05,
    bias = "none",
    random_state = 101,
    use_gradient_checkpointing = "unsloth" # reduce memory usage by an extra 30% and support extremely long context finetunes
)

In [None]:
# ✅ Enable PyTorch gradient checkpointing (good practice for optimizing the memory usage of deep learning models during training)
# NOTE(review): get_peft_model above was already called with
# use_gradient_checkpointing="unsloth" — confirm this extra call is needed and
# does not replace Unsloth's own checkpointing variant.
model.gradient_checkpointing_enable()

In [None]:
# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(train_data)


def tokenize_function(examples):
    """Tokenize a batch of rendered prompts for causal-LM fine-tuning.

    The tokenizer's EOS token is appended to every prompt before tokenization.
    The training template ends right after the expected response, so without an
    explicit EOS the model never sees where a response ends and keeps generating
    at inference time.

    Args:
        examples: Batch dict produced by `Dataset.map(batched=True)`; only the
            "prompt" column (list of strings) is read.

    Returns:
        The tokenizer output for the batch, truncated to 2048 tokens.
    """
    # Fall back to an empty suffix if the tokenizer defines no EOS token.
    eos_token = tokenizer.eos_token or ""
    texts = [prompt + eos_token for prompt in examples["prompt"]]
    return tokenizer(
        texts,
        truncation = True,
        max_length = 2048,
    )


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch training
tokenized_dataset.set_format("torch")

## Training the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from huggingface_hub import login

In [None]:
# Read the Hugging Face access token from Kaggle secrets (secret label: "hf_token")
hf_token = user_secrets.get_secret("hf_token")

# Authenticate this session against the Hugging Face Hub
login(token = hf_token) 

In [None]:
# Supervised fine-tuning on the tokenized prompts.
# Effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps = 8.
trainer = SFTTrainer(
    model = model,
    train_dataset = tokenized_dataset,
    dataset_text_field = "prompt",             # column holding the rendered prompt (there is no "text" column)
    max_seq_length = 2048,
    args = TrainingArguments(
        output_dir = "outputs",                # where to save checkpoints
        save_strategy = "steps",               # save every N steps
        save_steps = 50,                       # adjust as needed
        save_total_limit = 3,                  # keep only the last 3 checkpoints
        logging_first_step = True,
        push_to_hub = True,                    # push to the Hugging Face Hub
        hub_model_id = f"MuhammadHelmy/{model_name}-ArPII-QLoRA",
        hub_strategy = "all_checkpoints",

        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 30,                     # 10% of max_steps
        max_steps = 300,                       # adjust this based on your needs
        learning_rate = 2e-4,
        lr_scheduler_type = "cosine",

        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        dataloader_num_workers = 2,
        logging_steps = 5,
        seed = 101,
    ),
)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
torch.cuda.empty_cache() # Clear GPU Memory Before Training

In [None]:
# Uncomment the next line to run without Weights & Biases logging.
# os.environ["WANDB_MODE"] = "disabled"
# Run the fine-tuning job and keep the returned training summary.
trainer_stats = trainer.train()

In [None]:
# Display the summary object returned by trainer.train()
trainer_stats

In [None]:
# Save Locally
# trainer.model.save_pretrained(f"outputs/{model_name}_model")
# tokenizer.save_pretrained(f"outputs/{model_name}_model")

In [None]:
# Final upload of the trained adapter and the tokenizer to the Hugging Face Hub.
# Trainer.push_to_hub() takes a commit message as its first argument (not a repo id)
# and forwards extra keyword arguments to model-card creation. The target repo is the
# one configured through `hub_model_id` in the training arguments, so neither the
# repo id nor `private` is passed here.
trainer.push_to_hub(commit_message="End of training", token=hf_token)
tokenizer.push_to_hub(f"MuhammadHelmy/{model_name}-ArPII-QLoRA", token=hf_token, private=True)

# Evaluation

## Loading Test Data (Competition Data)

In [1]:
import pandas as pd

In [2]:
# Path of the test spreadsheet inside the attached Kaggle dataset
input_path = "/kaggle/input/arabic-pii/test_data.xlsx"

# Load the test set from the Excel file
test_data = pd.read_excel(input_path)

# Display the first few rows
test_data.head()

Unnamed: 0,source,target,dialect
0,الأستاذ أدولفوس ريغان زييمان، بصفتك المدير الت...,الأستاذ [MASK]، بصفتك المدير التنفيذي الرئيسي ...,EGYPT
1,مرحبًا يا هانا، ممكن لو سمحتي تتحققي من التداع...,مرحبًا يا [MASK]، ممكن لو سمحتي تتحققي من التد...,EGYPT
2,نطلب أيضًا مراجعة سياساتنا فيما يتعلق بالتغيير...,نطلب أيضًا مراجعة سياساتنا فيما يتعلق بالتغيير...,SAUDI
3,عزيزي ديفان، مطلوب تقديم عرض تقديمي على مستوى ...,عزيزي [MASK]، مطلوب تقديم عرض تقديمي على مستوى...,EGYPT
4,ممكن كمان نعمل جلسة عن كيفية إدارة التوتر والت...,The sentence doesn't contain any personal or s...,LEVANTINE


In [6]:
# Path of a previously saved CSV of generated responses
input_path = "/kaggle/input/arabic-pii/generated_responses_batch_2999.csv"

# Load the saved generations (column: `generated_text`)
generated_batch = pd.read_csv(input_path)

# Display the first few rows
generated_batch.head()

Unnamed: 0,generated_text
0,الأستاذ [MASK]، بصفتك [MASK] لتطبيقات المركز ف...
1,مرحبًا يا [MASK]، ممكن لو سمحتي تتحققي من التد...
2,نطلب أيضًا مراجعة سياساتنا فيما يتعلق بالتغيير...
3,عزيزي [MASK]، مطلوب تقديم عرض تقديمي على مستوى...
4,ممكن كمان نعمل جلسة عن كيفية إدارة التوتر والت...


In [32]:
# Number of rows already generated; the generation loop below starts at this offset
generated_batch_size = len(generated_batch)

# Compare the total number of test rows with the number already generated
print(len(test_data), generated_batch_size)

10447 3000


## Loading Model

In [20]:
# Upgrade pip, then install Unsloth (extra: cu124-torch260) from the GitHub repository.
# NOTE(review): no tag or commit is pinned and -U upgrades on every run, so the
# installed Unsloth version can differ between runs — pin a revision for reproducibility.
!pip install --upgrade pip
!pip install -qU "unsloth[cu124-torch260] @ git+https://github.com/unslothai/unsloth.git"

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 M

In [21]:
from unsloth import FastLanguageModel
from IPython.display import display, HTML

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-25 11:13:53.584570: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750850033.810911      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750850033.876625      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [22]:
# Name used to build the Hub repo id of the fine-tuned checkpoint.
# NOTE(review): the training section sets model_name = "Qwen3-8B", which yields a
# different repo id — confirm which training run is being evaluated here.
model_name = "Qwen3-8B-base"

# Load the fine-tuned checkpoint and tokenizer from the Hugging Face Hub in 4-bit
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = f"MuhammadHelmy/{model_name}-ArPII-QLoRA",
    max_seq_length = 2048,
    dtype = None, # Automatically detect the suitable dtype with the current hardware
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.75G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/87.4M [00:00<?, ?B/s]

Unsloth 2025.6.5 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Formatting Input

In [23]:
# Inference prompt: same Instruction / Input sections as the training template,
# but it stops right after "### Response:" so the model writes the answer.
# Only `{input}` is filled in.
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Mask the PII words in the given source sentence as "[MASK]".

### Input:
{input}

### Response:
"""

In [24]:
def display_ar_eng(text):
    """Display Arabic and English text in a readable format.

    Renders `text` in a right-to-left <div> with a larger font and line height.

    Args:
        text: Text to show. It is converted with str() and HTML-escaped, so
            characters such as "<" or "&" are shown literally instead of being
            interpreted as markup.
    """
    import html  # local import keeps this cell self-contained

    # Single quotes around the font name: double quotes here would terminate the
    # surrounding style="..." attribute early and drop the font-family rule.
    style = "font-size:18px; line-height:1.8; font-family: 'Arial', sans-serif;"
    safe_text = html.escape(str(text), quote=False)
    display(HTML(f'<div dir="rtl" style="{style}">{safe_text}</div>'))

In [25]:
# Function for post-processing
def truncate_extra(text):
    """Return `text` cut at the first "Human" marker, with outer whitespace removed.

    When the marker does not occur, the whole text is returned (also stripped).
    """
    # str.partition yields the complete string as its head when the marker is
    # missing, so one expression covers both the "found" and "not found" cases.
    kept, _, _ = text.partition("Human")
    return kept.strip()

In [26]:
# Smoke test: generate a response for a single test row (label 7)
source_sentence = test_data['source'][7]
target_sentence = test_data['target'][7]

# Fill the inference template with the source sentence
prompt = prompt_template.format(input=source_sentence)
# print(prompt)

# Tokenize the prompt and move the tensors to the model's device
inputs = tokenizer(prompt, truncation=True, max_length=2048, return_tensors="pt").to(model.device)
# inputs

# Calculate max_new_tokens based on source sentence length
source_tokens = tokenizer.encode(source_sentence, truncation=True, max_length=2048)
max_new_tokens = len(source_tokens) + 1  # Adding a buffer of 1 tokens
# source_tokens, max_new_tokens

# `outputs` holds prompt tokens followed by the newly generated tokens
outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
outputs

tensor([[ 38214,    374,    458,   7600,    429,  16555,    264,   3383,     13,
           9645,    264,   2033,    429,  34901,  44595,    279,   1681,    382,
          14374,  29051,    510,  12686,    279,    393,   5543,   4244,    304,
            279,   2661,   2530,  11652,    438,  10545,  49863,     60,  11436,
          14374,   5571,    510,  12653, 125217, 128168, 124376,  68785,  23364,
         124525, 124495,  23364, 124993, 124511, 126530, 131707,  31382, 125006,
         125350,   5703, 123995,  47632, 135170, 124325,  27846, 131599, 123894,
         124042, 130339, 137961, 124476, 131598,  47632, 132061,  77273,  17166,
         124422,  65398,   5703, 124176,     13, 132148, 124269,  33090, 124179,
         128252, 124666,  84532, 131013, 143592, 128248, 125993,  41593, 138144,
            320,   2428,   1110,    924,    332,   1223,   2385,  11159,   2659,
           3593,  14374,   5949,    510,  12653, 125217, 128168, 124376,  68785,
          23364, 124525, 124

In [27]:
# Show the source sentence and its reference target
display_ar_eng(source_sentence)
display_ar_eng(target_sentence)
# Decode only the tokens that follow the prompt, i.e. the generated continuation
response = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]
cleaned_response = truncate_extra(response)
# Show the raw response followed by the post-processed one
display_ar_eng(response)
display_ar_eng(cleaned_response)

## Text Generation

In [28]:
from datasets import Dataset

In [None]:
model.eval()

# Resume from the rows generated in an earlier session so the list stays aligned
# with test_data: entry k of `generated_responses` belongs to row k of test_data.
# Starting from an empty list would leave it `generated_batch_size` entries short
# and make the final column assignment fail with a length mismatch.
# Empty generations are read back from CSV as NaN, hence the fillna("").
generated_responses = generated_batch['generated_text'].fillna("").tolist()


for i, row in test_data[generated_batch_size:].iterrows():
    source_sentence = row['source']
    target_sentence = row['target']

    # Format the prompt template
    prompt = prompt_template.format(input=source_sentence)

    # Tokenize the input.
    inputs = tokenizer(prompt, truncation=True, max_length=2048, return_tensors="pt").to(model.device)

    # Calculate max_new_tokens based on source sentence length
    source_tokens = tokenizer.encode(source_sentence, truncation=True, max_length=2048)
    max_new_tokens = len(source_tokens) + 1  # Adding a buffer of 1 tokens

    # Generate the output and capture it
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]
    cleaned_response = truncate_extra(response)

    # Append the response to the list
    generated_responses.append(cleaned_response)

    display_ar_eng(target_sentence)
    display_ar_eng(cleaned_response) # Optional: Print for immediate feedback
    print("="*100 + "\n") #  a visual separator in the output

    # Checkpoint every 100 rows; each file holds all responses for rows 0..i
    if ((i+1) % 100) == 0:
        df = pd.DataFrame(generated_responses, columns = ["generated_text"])
        filename = f"generated_responses_batch_{i}.csv"
        df.to_csv(filename, index=False)
        print(f"\nSaved additional 100 rows to {filename}\n")

# Guard against a partial run before attaching the column
assert len(generated_responses) == len(test_data), (
    f"expected {len(test_data)} responses, got {len(generated_responses)}"
)

# Add the generated responses to the DataFrame
test_data['generated_text'] = generated_responses
test_data.head()












































































































































































































































































































Saved additional 100 rows to generated_responses_batch_3099.csv













































































































































































































































































Saved additional 100 rows to generated_responses_batch_3199.csv














































































































































































































































































































Saved additional 100 rows to generated_responses_batch_3299.csv





























































































































































































In [None]:
# Wrap the test DataFrame (with the `generated_text` column added above) as a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(test_data)

# Push the dataset to the Hub
hf_dataset.push_to_hub(f"MuhammadHelmy/{model_name}-ArPII-QLoRA-generated-data", private=True)

## Validate

In [None]:
import re
import time
from collections import defaultdict

In [None]:
def evaluate_pii_masking(target_sentences, predicted_sentences, mask_token="[MASK]"):
    """
    Evaluate PII masking by comparing mask positions in target and predicted sentences.

    Mask spans are the character ranges of `mask_token` in each sentence. Every
    target span is greedily paired with the not-yet-used predicted span of highest
    IoU (only overlapping spans can pair). A pair counts as a true positive for:
      * "partial": any overlap,
      * "iou50":   IoU >= 0.5,
      * "exact":   IoU == 1.0.
    At each level, target spans without a true positive at that level are false
    negatives and predicted spans without one are false positives.
    "value" compares the text window (5 characters either side) around each mask
    as a multiset, ignoring position.

    Args:
        target_sentences: Sequence of target sentences with correct PII masking
        predicted_sentences: Sequence of model-predicted sentences with PII masking
        mask_token: The token used for masking PII (default: "[MASK]")

    Returns:
        Dictionary with entity counts, sentence-level exact match rate, average
        evaluation time per sentence (ms), precision/recall/F1/TP for each match
        type, and "best_overall_score" (the highest F1 across match types).

    Notes:
        Non-string entries (e.g. NaN from an empty CSV cell) are treated as an
        empty sentence, i.e. a sentence without any mask.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Initialize metrics dictionaries (insertion order defines the result key order)
    metrics = {
        "exact": {"tp": 0, "fp": 0, "fn": 0},
        "partial": {"tp": 0, "fp": 0, "fn": 0},
        "iou50": {"tp": 0, "fp": 0, "fn": 0},
        "value": {"tp": 0, "fp": 0, "fn": 0}
    }

    exact_matches = 0
    gt_entities = 0
    pred_entities = 0
    processing_times = []

    mask_pattern = re.compile(re.escape(mask_token))

    # Missing values become an empty sentence instead of crashing the regex search
    def as_text(value):
        return value if isinstance(value, str) else ""

    # Find the (start, end) character span of every mask in a text
    def find_mask_spans(text):
        return [(match.start(), match.end()) for match in mask_pattern.finditer(text)]

    # Calculate IoU between two spans
    def calculate_iou(span1, span2):
        intersection_start = max(span1[0], span2[0])
        intersection_end = min(span1[1], span2[1])

        if intersection_end <= intersection_start:  # No overlap
            return 0.0

        intersection = intersection_end - intersection_start
        union = (span1[1] - span1[0]) + (span2[1] - span2[0]) - intersection
        return intersection / union if union > 0 else 0.0

    # Text window of `margin` characters before and after each mask
    def mask_contexts(text, spans, margin=5):
        return [text[max(0, start - margin):min(len(text), end + margin)] for start, end in spans]

    # Process each sentence pair
    for target, predicted in zip(target_sentences, predicted_sentences):
        start_time = time.time()

        target = as_text(target)
        predicted = as_text(predicted)

        # Find mask spans in target and predicted
        target_spans = find_mask_spans(target)
        predicted_spans = find_mask_spans(predicted)

        # Update entity counts
        gt_entities += len(target_spans)
        pred_entities += len(predicted_spans)

        # Check for exact sentence match
        if target == predicted:
            exact_matches += 1

        # Value match: multiset intersection, so repeated identical contexts
        # are each counted instead of being collapsed into one.
        target_contexts = Counter(mask_contexts(target, target_spans))
        predicted_contexts = Counter(mask_contexts(predicted, predicted_spans))
        common_values = sum((target_contexts & predicted_contexts).values())
        metrics["value"]["tp"] += common_values
        metrics["value"]["fp"] += len(predicted_spans) - common_values
        metrics["value"]["fn"] += len(target_spans) - common_values

        # Span matching: true positives are tracked separately per strictness level
        matched_predicted_spans = set()
        level_tp = {"exact": 0, "partial": 0, "iou50": 0}

        for t_span in target_spans:
            best_iou = 0.0
            best_p_idx = -1

            for p_idx, p_span in enumerate(predicted_spans):
                if p_idx in matched_predicted_spans:
                    continue

                iou = calculate_iou(t_span, p_span)
                if iou > best_iou:
                    best_iou = iou
                    best_p_idx = p_idx

            if best_p_idx < 0:
                # No unused predicted span overlaps this target span
                continue

            # best_iou > 0 here, so the pair overlaps: always a partial match
            matched_predicted_spans.add(best_p_idx)
            level_tp["partial"] += 1
            if best_iou >= 0.5:
                level_tp["iou50"] += 1
            if best_iou == 1.0:
                level_tp["exact"] += 1

        # FN / FP are derived per level. A pair that only overlaps partially is
        # still a miss (FN) and a spurious prediction (FP) at the stricter levels.
        for level, tp in level_tp.items():
            metrics[level]["tp"] += tp
            metrics[level]["fn"] += len(target_spans) - tp
            metrics[level]["fp"] += len(predicted_spans) - tp

        # Record processing time
        processing_times.append((time.time() - start_time) * 1000)  # Convert to ms

    # Calculate final metrics
    results = {
        "gt_entities": gt_entities,
        "pred_entities": pred_entities,
        "exact_match_rate": exact_matches / len(target_sentences) if len(target_sentences) > 0 else 0,
        "avg_time_ms": sum(processing_times) / len(processing_times) if processing_times else 0
    }

    # Calculate precision, recall, and F1 for each match type
    f1_scores = []
    for match_type in metrics:
        tp = metrics[match_type]["tp"]
        fp = metrics[match_type]["fp"]
        fn = metrics[match_type]["fn"]

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

        results[f"{match_type}_precision"] = precision
        results[f"{match_type}_recall"] = recall
        results[f"{match_type}_f1"] = f1
        results[f"{match_type}_tp"] = tp

    # Calculate best overall score (highest of all F1 scores)
    results["best_overall_score"] = max(f1_scores) if f1_scores else 0

    return results

def display_pii_evaluation_results(results, show_details=True):
    """
    Print PII masking evaluation results as a fixed-width (80 column) text report.

    The report has a summary section, an optional per-match-type table and a list
    of recommendations derived from precision/recall gaps.

    Args:
        results: Dictionary of results from evaluate_pii_masking function
        show_details: Whether to show detailed metrics (default: True)
    """
    width = 80

    # Helper function to create a horizontal line separator
    def separator(width=width):
        return "+" + "-" * (width-2) + "+"

    # Helper function to create a centered title
    def centered_title(text, width=width):
        padding = (width - len(text) - 2) // 2
        extra = 1 if (width - len(text) - 2) % 2 != 0 else 0
        return "+" + " " * padding + text + " " * (padding + extra) + "+"

    # Helper function to format percentage
    def format_pct(value):
        return f"{value:.2%}"

    # Helper function to create a "label | value" row padded to the report width
    def metric_row(label, value, width=width):
        label_width = 30
        value_width = width - label_width - 5  # 5 accounts for "| " and " | " and "|"
        value_str = str(value)
        return f"| {label:<{label_width}} | {value_str:<{value_width-1}}|"

    # Helper function to create a section header
    def section_header(text, width=width):
        return f"| {text:<{width-4}} |"

    # Generate performance indicators based on score ranges
    def performance_indicator(score):
        if score >= 0.9:
            return "★★★★★"
        elif score >= 0.8:
            return "★★★★☆"
        elif score >= 0.7:
            return "★★★☆☆"
        elif score >= 0.6:
            return "★★☆☆☆"
        elif score >= 0.5:
            return "★☆☆☆☆"
        else:
            return "☆☆☆☆☆"

    print("\n" + separator())
    print(centered_title(" PII MASKING EVALUATION RESULTS "))
    print(separator())

    # Summary section
    print(section_header("SUMMARY"))
    print(separator())
    print(metric_row("Best Overall Score", f"{results['best_overall_score']:.4f}"))
    print(metric_row("Exact Match Rate", f"{format_pct(results['exact_match_rate'])}"))
    print(metric_row("Average Processing Time", f"{results['avg_time_ms']:.2f} ms"))
    print(metric_row("Ground Truth Entities", f"{results['gt_entities']}"))
    pred_gt_ratio = results['pred_entities'] / results['gt_entities'] if results['gt_entities'] > 0 else 0
    print(metric_row("Predicted Entities", f"{results['pred_entities']} ({pred_gt_ratio:.2%} of GT)"))

    print(separator())

    # Detailed metrics for each match type
    if show_details:
        print(centered_title(" DETAILED METRICS "))
        print(separator())

        # Define table headers for detailed metrics with fixed widths
        type_width = 15
        metric_width = 14
        tp_width = 8

        header = f"| {'Match Type':<{type_width}} | {'Precision':<{metric_width}} | {'Recall':<{metric_width}} | {'F1 Score':<{metric_width}} | {'TP':<{tp_width}}|"
        divider = f"| {'-'*type_width} | {'-'*metric_width} | {'-'*metric_width} | {'-'*metric_width} | {'-'*tp_width}|"

        print(header)
        print(divider)

        # Show metrics for each match type
        for match_type in ['exact', 'partial', 'iou50', 'value']:
            precision = results[f"{match_type}_precision"]
            recall = results[f"{match_type}_recall"]
            f1 = results[f"{match_type}_f1"]
            tp = results[f"{match_type}_tp"]

            # Each metric is followed by the first two characters of its star rating
            p_str = f"{precision:.4f} {performance_indicator(precision)[:2]}"
            r_str = f"{recall:.4f} {performance_indicator(recall)[:2]}"
            f1_str = f"{f1:.4f} {performance_indicator(f1)[:2]}"

            print(f"| {match_type.capitalize():<{type_width}} | {p_str:<{metric_width-1}} | {r_str:<{metric_width-1}} | {f1_str:<{metric_width-1}} | {tp:<{tp_width-1}} |")

        print(separator())

    # Recommendations section
    print(centered_title(" RECOMMENDATIONS "))
    print(separator())

    # Generate recommendations based on metrics
    recommendations = []

    # Compare precision and recall for gaps
    match_types = ['exact', 'partial', 'iou50']
    for match_type in match_types:
        precision = results[f"{match_type}_precision"]
        recall = results[f"{match_type}_recall"]

        if precision > recall + 0.15:
            # Predicted masks are mostly right but many reference masks are missed
            recommendations.append(f"Model is under-masking in {match_type} matches (high precision, lower recall).")
        elif recall > precision + 0.15:
            # Reference masks are mostly found but many predicted masks are spurious
            recommendations.append(f"Model is over-masking in {match_type} matches (high recall, lower precision).")

    # Check for specific issues
    if results['exact_f1'] < 0.7 and results['partial_f1'] > results['exact_f1'] + 0.15:
        recommendations.append("Model identifies PII regions but has boundary precision issues.")

    if len(recommendations) == 0:
        recommendations.append("No specific recommendations. Model performance looks balanced.")

    for i, rec in enumerate(recommendations, 1):
        print(f"| {i}. {rec:<{width-6}}|")

    print(separator())
    print("\n")

In [None]:
# Score the generated sentences against the reference targets, then print the report
results = evaluate_pii_masking(target_sentences=test_data['target'], predicted_sentences=test_data['generated_text'])
display_pii_evaluation_results(results)

# Next

- Push the data from Google Drive to GitHub
- Submit to Muhy
- Change `tags_to_mask` and retrain the model

# Gemi