<a href="https://colab.research.google.com/github/MathiasGarnier/Algorithms-All---Old/blob/master/Finetuning_QWEN_Dimension_parall%C3%A8le.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
DEBUG = True

# ------ Choix du modèle
model_name = "unsloth/Qwen2.5-VL-7B-Instruct" # Compétition fermée
#model_name = "unsloth/Qwen3-VL-8B-Instruct" # Compétion ouverte, exemple de modèle (on peut monter à plus de 8B)

In [2]:
%%capture

# ------ TÉLÉCHARGEMENT MÉTRIQUES EVAHAN
if DEBUG: print(f"{"*"*12} TÉLÉCHARGEMENT MÉTRIQUES {"*"*12}")
!wget -O task_a_c_eva.py https://raw.githubusercontent.com/GoThereGit/EvaHan/refs/heads/main/task_a_c_eva.py
!wget -O task_b_eva.py https://raw.githubusercontent.com/GoThereGit/EvaHan/refs/heads/main/task_b_eva.py

print("Downloaded task_a_c_eva.py and task_b_eva.py")

import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

if "Qwen2.5" and "7B" in model_name:
    print("Modèle 2.5, compétition fermée")
    !pip install transformers==4.56.2
elif "Qwen3" in model_name:
    print("Modèle 3, compétion ouverte (exemple de modèle)")
    !pip install transformers==4.57.1
else:
    print("Problème modèle")
!pip install --no-deps trl==0.22.2

In [None]:
import os
import json
import torch
import PIL.Image
from tqdm import tqdm
from google.colab import drive
from unsloth import FastVisionModel
from torch.utils.data import Dataset
from trl import SFTTrainer, SFTConfig
from task_a_c_eva import calculate_char_metrics
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from unsloth.trainer import UnslothVisionDataCollator



# ------ CHEMIN D'ACCÈS
if DEBUG: print(f"{"*"*12} CHEMINS D'ACCÈS {"*"*12}")
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/EVAHAN/train_data/'



# ------ BONJOUR LE MODÈLE
if DEBUG: print(f"{"*"*12} CHARGEMENT DU MODÈLE {"*"*12}")

GPU_WORTH_IT = torch.cuda.is_bf16_supported() # Si mieux qu'un GPU T4: True, sinon False
if DEBUG: print(f"Support BF16 : {GPU_WORTH_IT}")

instruction = """Analyze the provided image of ancient Chinese texts.
Transcribe the characters into standard Traditional Chinese (Unicode).
Do not add any notes, just the transcription."""

model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,                            # Monter à 32 ?
    lora_alpha = 16,                   # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,               # 3407 par défaut, changer ça même si ça change rien
    use_rslora = False,
    loftq_config = None,               # And LoftQ
    # target_modules = "all-linear",   # Optional now! Can specify a list if needed
)



# ------ SÉLECTION DU JEU DE DONNÉES ET CHARGEMENT
if DEBUG: print(f"{"*"*12} JEU DE DONNÉES {"*"*12}")

subset_size = 1 # 0.1 # pour tests, on se restreint à une petite partie du jeu de données
                # sinon mettre à 1
assert(subset_size >= 0 and subset_size <= 1)

VERIFY_DATASET = True # pour éviter problèmes avec images corrompues

if DEBUG: print(f"\t\tRestrict to {subset_size*100}% of the dataset")

dataset_files = ['Dataset_A.json', 'Dataset_C.json']
combined_dataset = []

for filename in dataset_files:
    filepath = os.path.join(base_path, filename)
    with open(filepath, 'r') as f:
        data = json.load(f)
        combined_dataset.extend(data)

print(f"Successfully loaded and combined {len(dataset_files)} datasets.")
print(f"Total items in combined dataset: {len(combined_dataset)}")

def check_single_image(item, base_path):
    """Fonction interne pour vérifier une seule image."""
    img_path = os.path.join(base_path, item["image_path"])

    if not os.path.exists(img_path):
        return None

    try:
        with PIL.Image.open(img_path) as img:
            img.verify()
        return item
    except Exception:
        # On peut logger l'erreur ici si besoin
        return None

def verify_dataset_fast(data_list, base_path, max_workers=20):

    if DEBUG: print(f"Vérification accélérée de {len(data_list)} images...")

    valid_data = []

    # max_workers=10 : compromis pour ne pas saturer le Drive
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # On lance toutes les tâches
        futures = [executor.submit(check_single_image, item, base_path) for item in data_list]

        # On récupère les résultats avec une barre de progression
        for future in tqdm(futures, total=len(data_list)):
            result = future.result()
            if result is not None:
                valid_data.append(result)

    return valid_data

if VERIFY_DATASET:
  combined_dataset = verify_dataset_fast(combined_dataset, base_path)
  print(f"Images valides restantes : {len(combined_dataset)}")

if subset_size < 1:
    # On réduit d'abord la taille totale
    combined_dataset, _ = train_test_split(combined_dataset, train_size=subset_size, random_state=42)

# On split ensuite en Train / Temp (Val + Test)
train_data, temp_data = train_test_split(combined_dataset, test_size=0.20, random_state=42)
# On split Temp en Val et Test (50/50 du temp = 10% chacun du total)
test_data, val_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training set size: {len(train_data)}")
print(f"Testing set size: {len(test_data)}")
print(f"Validation set size: {len(val_data)}")


class EvaHanDataset(Dataset):
    def __init__(self, data_list, base_path, instruction):
        self.data = data_list
        self.base_path = base_path
        self.instruction = instruction

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        image_path = os.path.join(self.base_path, sample["image_path"])

        try:
            image = PIL.Image.open(image_path).convert("RGB")
            assistant_content = sample.get("text", "")
        except Exception as e:
            # Au lieu de récursion, on renvoie un placeholder safe
            # Cela permet au batch de continuer sans planter le Trainer
            image = PIL.Image.new('RGB', (512, 512), color='black')
            assistant_content = "□" # caractère neutre pour ne pas fausser l'apprentissage
            print(f"Skipping corrupt image: {image_path}")

        return {
            "messages": [
                { "role": "user", "content": [
                    {"type": "text", "text": self.instruction},
                    {"type": "image", "image": image}
                ]},
                { "role": "assistant", "content": [
                    {"type": "text", "text": assistant_content}
                ]}
            ]
        }

converted_train_dataset = EvaHanDataset(train_data, base_path, instruction)
converted_val_dataset = EvaHanDataset(val_data, base_path, instruction)

print(f"Dataset is converted. Train size: {len(converted_train_dataset)}")

if DEBUG:
  print(f"Un exemple: {converted_train_dataset[0]['messages'][1]['content'][0]['text']}")


FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_train_dataset, # Updated to use the new training dataset
    eval_dataset = converted_val_dataset,    # Added for evaluation
    args = SFTConfig(
        per_device_train_batch_size = 2,   # ou 1 ?
        gradient_accumulation_steps = 4,   # ou 8 ?
        warmup_ratio = 0.1,                # 10% de warmup au lieu de 5 steps fixes
        num_train_epochs = 1,              #
        learning_rate = 2e-4,              # Diminuer? 4e-5? 5e-6 ?

        bf16 = GPU_WORTH_IT,               # Si GPU T4 ou equiv, on aura False
        fp16 = not GPU_WORTH_IT,           # Si GPU T4 ou equiv, on aura True
        eval_strategy = "steps",
        eval_steps = 20,                   # Plus fréquent pour voir la courbe
        save_strategy = "steps",
        save_steps = 20,
        load_best_model_at_end = True,

        optim = "adamw_8bit",              #
        weight_decay = 0.01,               # Légère augmentation pour régulariser
        lr_scheduler_type = "cosine",      # Plus efficace que linear
        seed = 3407,                       #
        output_dir = "outputs",
        #report_to = "none",

        remove_unused_columns = False,     # Obligatoire pour vision
        dataset_text_field = "",           #
        dataset_kwargs = {"skip_prepare_dataset": True}, #
        max_length = 2048,                  # monter à 4096 si GPUs disent oui

        logging_steps = 1,              # Affiche les stats à CHAQUE étape
        logging_strategy = "steps",     # Basé sur les pas, pas sur les époques
        report_to = "tensorboard",      # Ou "tensorboard" pour voir des graphiques
        disable_tqdm = False,           # Assurez-vous que la barre de progression est active
    )
    # Unsloth: Model does not have a default image size - using 512
)

if DEBUG:
  gpu_stats = torch.cuda.get_device_properties(0)
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
  print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
  print(f"{start_gpu_memory} GB of memory reserved.")



# ------ ENTRAÎNER NOTRE CHER MODÈLE
trainer_stats = trainer.train()



if DEBUG:
  used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
  used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
  used_percentage = round(used_memory / max_memory * 100, 3)
  lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
  print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
  print(
      f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
  )
  print(f"Peak reserved memory = {used_memory} GB.")
  print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
  print(f"Peak reserved memory % of max memory = {used_percentage} %.")
  print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")



if DEBUG:
  # VISUALISATION LOSS
  import matplotlib.pyplot as plt

  # Extraire l'historique des logs
  history = trainer.state.log_history

  train_loss = [x['loss'] for x in history if 'loss' in x]
  eval_loss = [x['eval_loss'] for x in history if 'eval_loss' in x]
  steps = [x['step'] for x in history if 'loss' in x]
  eval_steps = [x['step'] for x in history if 'eval_loss' in x]

  plt.figure(figsize=(10, 6))
  plt.plot(steps, train_loss, label='Training Loss', color='blue', alpha=0.6)
  if eval_loss:
      plt.plot(eval_steps, eval_loss, label='Validation Loss', color='red', marker='o')

  #plt.title('Loss - EvaHan Transcription')
  plt.xlabel('Steps')
  plt.ylabel('Loss')
  plt.legend()
  plt.grid(True)
  plt.show()

FastVisionModel.for_inference(model) # Ensure model is in inference mode

sum_score = 0
med_cer, med_precision, med_recall, med_f1, med_ned = 0, 0, 0, 0, 0
idx = 0

# On teste juste sur 50 données de test_data
for i, test_example in tqdm(enumerate(test_data[:50]), total=50, desc="Inférence et évaluation"):
    image_path = os.path.join(base_path, test_example["image_path"])

    try:
        image = PIL.Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Erreur chargement image {image_path}: {e}")
        continue

    # Préparation du prompt (l'instruction ne doit plus mentionner de JSON)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")

    # GÉNÉRATION
    # Temperature baissée à 0.1 pour plus de fidélité au texte
    outputs = model.generate(
        **inputs,
        max_new_tokens=256, # 128 ? 512 ?
        use_cache=True,
        temperature=0.1,
        min_p=0.05
    )

    generated_text_tokens = outputs[0][len(inputs["input_ids"][0]):]
    model_output_raw = tokenizer.decode(generated_text_tokens, skip_special_tokens=True)

    # NETTOYAGE SIMPLE (Plus besoin de JSON)
    predicted_transcription = model_output_raw.replace("<|im_end|>", "").strip()

    ground_truth_transcription = test_example.get("text", "N/A (ground truth missing)")

    print(f"\n--- Exemple {i+1} ---")
    print(f"Sortie Modèle : {predicted_transcription}")
    print(f"Réalité (GT)  : {ground_truth_transcription}")

    # ÉVALUATION
    dataset_name = image_path.split('/')[-2]

    if "Dataset_B" in dataset_name:
        print("Évaluation (Dataset B) : Ignoré (Détection de mise en page).")
    else:
        if ground_truth_transcription != "N/A (ground truth missing)":

            # Calcul des métriques sur le texte brut
            metrics_ac = calculate_char_metrics(ground_truth_transcription, predicted_transcription)

            s_ac = metrics_ac.get("comprehensive_score", 0)
            s_cer = metrics_ac.get("cer", 1.0)
            s_p = metrics_ac.get("precision", 0)
            s_r = metrics_ac.get("recall", 0)
            s_f1 = metrics_ac.get("f1", 0)
            s_ned = metrics_ac.get("ned", 0)

            print(f"Score : {s_ac} | CER : {s_cer}")

            sum_score += s_ac
            med_cer += s_cer
            med_precision += s_p
            med_recall += s_r
            med_f1 += s_f1
            med_ned += s_ned
            idx += 1
        else:
            print("Évaluation (Dataset A/C) : Ground Truth manquante.")

# Résultats finaux
if idx > 0:
    print("\n" + "="*30)
    print(f"MOYENNE GÉNÉRALE ({idx} exemples)")
    print(f"Comprehensive Score : {sum_score / idx:.4f}")
    print(f"CER (Taux d'erreur) : {med_cer / idx:.4f}")
    print(f"F1-Score            : {med_f1 / idx:.4f}")
    print(f"NED (Distance)      : {med_ned / idx:.4f}")
    print("="*30)

model.save_pretrained("lora_model_QWEN2.5_FINETUNE_EVAHAN__MONACO_PAR_HOUDI")
tokenizer.save_pretrained("lora_model_QWEN2.5_FINETUNE_EVAHAN__MONACO_PAR_HOUDI")

************ CHEMINS D'ACCÈS ************
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
************ CHARGEMENT DU MODÈLE ************
Support BF16 : True
==((====))==  Unsloth 2026.1.4: Fast Qwen2_5_Vl patching. Transformers: 4.56.2.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.32 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
************ JEU DE DONNÉES ************
		Restrict to 100% of the dataset
Successfully loaded and combined 2 datasets.
Total items in combined dataset: 10000
Vérification accélérée de 10000 images...


100%|██████████| 10000/10000 [04:36<00:00, 36.18it/s]


Images valides restantes : 9999
Training set size: 7999
Testing set size: 1000
Validation set size: 1000
Dataset is converted. Train size: 7999
Un exemple: 廿世同居豈天㝎浦陽上邑是其家自古相傳山水勝
Unsloth: Model does not have a default image size - using 512
GPU = NVIDIA H100 80GB HBM3. Max memory = 79.32 GB.
13.994 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,999 | Num Epochs = 1 | Total steps = 1,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 51,521,536 of 8,343,688,192 (0.62% trained)


Step,Training Loss,Validation Loss
20,7.2576,7.23067
40,7.6965,7.23067
60,7.1831,7.23067
80,7.1593,7.23067
100,7.1197,7.23067
120,7.0047,7.23067
140,7.2361,7.23067
160,7.0804,7.23067
180,7.1561,7.23067


Unsloth: Will smartly offload gradients to save VRAM!
Skipping corrupt image: /content/drive/MyDrive/EVAHAN/train_data/Dataset_C/c_0939.jpg


Unsloth: Not an error, but Qwen2_5_VLForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
