In [None]:
# Install Unsloth, the library used below to load the model and fine-tune it with LoRA adapters
! pip install unsloth

In [None]:
import os
from datasets import load_dataset

# Path to the training CSV (keep sensitive information out of the path).
# The file is expected to contain two columns:
# - "mail": the input text used for training.
# - "Catégorie du ticket": the annotation, i.e. the target category for classification.
HOME_PATH = '/home/ubuntu'
train_dataset_name = os.path.join(HOME_PATH, "train_set_filtred.csv")


# Load the CSV. `load_dataset` returns a DatasetDict keyed by split name; with a
# single `data_files` path every row lands in the "train" split.
dataset = load_dataset("csv", data_files = train_dataset_name)

# Report the size and schema of the "train" split. Calling len() or .column_names
# on the DatasetDict itself would give the number of splits and a
# {split: columns} mapping instead of the row count and the column list.
print(f'Number of prompts: {len(dataset["train"])}')
print(f'Column names are: {dataset["train"].column_names}')

In [None]:
# Import necessary components from unsloth for vision model
from unsloth import FastVisionModel
import torch

# Load the pre-trained Llama 3.2 11B Vision-Instruct checkpoint through Unsloth.
# The second return value is bound to `tokenizer`; later cells use it both for
# chat templating and for tokenization.
# NOTE(review): for a vision checkpoint this object is presumably a processor
# wrapping a tokenizer (later cells call it with `images=None`) — confirm.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
# Wrap the model with LoRA adapters via PEFT (Parameter-Efficient Fine-Tuning).
# Vision layers are excluded (`finetune_vision_layers = False`); only the
# language layers, in both their attention and MLP modules, receive adapters.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# Instruction prompt for the ticket-classification assistant (written in French).
# It is used as a `str.format` template: `{content}` is replaced by the ticket
# text, and the doubled braces `{{ }}` render as literal `{ }` in the JSON example.
# NOTE(review): rule 1 asks for the bare category with no extra text, while the
# example in the prompt and `prediction_format` below use a JSON object with a
# "categorisation" key — confirm which output shape is intended before changing
# either string (training and inference must use the same text).
prompt = """
Tu es un assistant spécialisé dans la classification de tickets à partir de leur contenu textuel. Ton objectif est d’analyser la description fournie et de l’associer à l’une des catégories ci-dessous :
```
Demande de service/Backup #BCS/Autre
Demande de service/Backup #BCS/Demande de renseignement
Demande de service/Backup #BCS/Restauration qualifiée
Demande de service/Backup #BCS/Stratégie de sauvegarde/Création
Demande de service/Backup #BCS/Stratégie de sauvegarde/Modification
Demande de service/Backup #BCS/Stratégie de sauvegarde/Suppression
Demande de service/Cyber Sécurité #CS2/Bastion/Création-Modification d'entrées
Incidents/Backup #BCS/Sauvegarde
Incidents/Supervision
```
### Règles :
1. Réponds uniquement par la catégorie exacte, sans texte supplémentaire.
2. La catégorie associée doit être unique.
3. Si aucune catégorie ne correspond, la valeur de "categorisation" doit être "unknown".

### Exemple :
La description:
Bonjour¤ Merci de relancer les sauvegardes FULL¤ si ils ne sont pas repassées en automatique. Thomas Envoyé de mon iPhone Début du message transféré
#### Sortie :
{{
  "categorisation": "Incidents/Backup #BCS/Sauvegarde"
}}

Analyse uniquement la description suivante :
{content}
"""

# Target completion template used as the assistant turn of each training sample:
# a JSON object holding the expected category. `{category}` is filled in with
# `str.format`; the doubled braces are literal braces in the rendered text.
prediction_format = """
{{
  "categorisation": "{category}"
}}
"""

In [None]:
# Helper function to convert data into a conversation format
def convert_to_conversation(sample):
    """
    Turns one dataset row into a two-turn chat record.

    :param sample: Row exposing "mail" (ticket text) and "Catégorie du ticket" (label)
    :return: {"messages": [user turn, assistant turn]}; each turn carries a single
        text part — the filled-in prompt for the user, the filled-in
        `prediction_format` for the assistant
    """
    def _text_turn(role, text):
        # One chat message whose content is a single text part
        return {"role": role, "content": [{"type": "text", "text": text}]}

    question = prompt.format(content = sample["mail"])
    answer = prediction_format.format(category = sample["Catégorie du ticket"])
    return {"messages": [_text_turn("user", question), _text_turn("assistant", answer)]}

In [None]:
# Build one chat-style record per row of the "train" split, then print the first
# record as a sanity check
converted_dataset = list(map(convert_to_conversation, dataset['train']))
print(converted_dataset[0])

In [None]:
# Function to format prompts for tokenization
def formatting_prompts_func(examples):
    """
    Renders each conversation of a batch into a single training string.

    :param examples: Batch (mapping of column name to list of values) as passed by
        `Dataset.map(..., batched = True)`; its "dataset" column holds records of
        the form {"messages": [...]}
    :return: Dict with one column, "text", holding the chat-templated string of
        every conversation in the batch
    :raises Exception: Any error raised by the column lookup or by the chat
        template is re-raised after the offending batch has been printed
    """
    try:
        convos = examples["dataset"]
        # add_generation_prompt = False: the assistant turn is already part of each
        # conversation, so no empty assistant header is requested.
        texts = [
            tokenizer.apply_chat_template(convo['messages'], tokenize = False, add_generation_prompt = False)
            for convo in convos
        ]
        return { "text" : texts, }
    except Exception:
        # Dump the batch for debugging, then propagate the error unchanged.
        # `except Exception` (instead of a bare `except:`) keeps KeyboardInterrupt
        # and SystemExit from triggering the dump.
        print(examples)
        raise

In [None]:
# Wrap the list of conversation records in a Hugging Face Dataset with a single
# "dataset" column, so that it can be processed with `Dataset.map`
from datasets import Dataset
my_dataset = Dataset.from_dict({"dataset": converted_dataset})

In [None]:
# Render every conversation into a chat-templated string (adds a "text" column).
# NOTE(review): this rebinds `dataset`, which earlier held the object returned by
# `load_dataset` and was indexed with ['train']; re-running the conversion cell
# after this one will presumably fail — consider a distinct name.
dataset = my_dataset.map(formatting_prompts_func, batched = True,)
# Show one rendered sample as a sanity check
dataset[2]['text']

In [None]:
# Preprocess function for batch tokenization
from functools import partial
def preprocess_batch(batch, tokenizer, max_length):
    """
    Runs the tokenizer over the "text" entry of a dataset batch.

    :param batch: Dataset batch; must expose a "text" entry
    :param tokenizer: Callable tokenizer/processor, invoked with keyword arguments only
    :param max_length: Maximum number of tokens, passed to the tokenizer together
        with `truncation = True`
    :return: Whatever the tokenizer call returns
    """
    tokenizer_kwargs = {
        "images": None,  # text-only samples: no image is supplied
        "text": batch["text"],
        "max_length": max_length,
        "truncation": True,
    }
    return tokenizer(**tokenizer_kwargs)


def preprocess_dataset(tokenizer, max_length: int, seed, my_dataset: "Dataset"):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer: Model tokenizer (or processor), forwarded to `preprocess_batch`
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility of the shuffle
    :param my_dataset (Dataset): Dataset to tokenize; must contain a "text" column
    :return: Tokenized dataset without its original columns, restricted to
        samples shorter than `max_length` tokens, and shuffled
    :raises ValueError: If the dataset has no "text" column
    """
    # Every original column (including "text") is dropped once tokenized. Work on
    # a copy and do not append "text" a second time: it is already in the list.
    columns_names = list(my_dataset.column_names)
    if "text" not in columns_names:
        raise ValueError('Dataset must contain a "text" column to tokenize')

    # Apply preprocessing to each batch of the dataset and remove the initial columns
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    my_dataset = my_dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = columns_names,
    )
    # Tokenization truncates at `max_length`, so a sample that reaches that length
    # may have lost its tail (the expected answer); keep strictly shorter ones only
    my_dataset = my_dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    # Shuffle dataset
    my_dataset = my_dataset.shuffle(seed = seed)

    return my_dataset

In [None]:
# Tokenize the rendered conversations with a 2048-token limit. `preprocess_dataset`
# keeps only samples shorter than `max_length` tokens and shuffles them with `seed`.
max_length = 2048
seed = 33
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

In [None]:
# Display the summary of the tokenized dataset
preprocessed_dataset

In [None]:
import torch

# Define data collator for batching input during training
class TextDataCollator:
    """
    Batches pre-tokenized text samples for causal language-model fine-tuning.

    Each sample is a mapping with "input_ids" and "attention_mask" lists. Samples
    are cut to `max_length`, right-padded to the longest sample of the batch, and
    given `labels` equal to `input_ids` except on padding, where the label is
    `IGNORE_INDEX` so that padding does not contribute to the loss.
    """

    # Label value for padded positions; -100 is the default `ignore_index` of
    # PyTorch's cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, model, tokenizer, max_length=2048):
        """
        :param model: Model being trained (stored, not used by the collator itself)
        :param tokenizer: Tokenizer/processor (stored, not used by the collator itself)
        :param max_length: Maximum number of tokens kept per sample
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        """
        :param examples: List of samples, each with "input_ids" and "attention_mask"
        :return: Dict of tensors "input_ids", "attention_mask" and "labels", all of
            shape (batch size, longest sample in the batch)
        """
        # Truncate each sample to max_length before turning it into a tensor
        input_ids = [torch.tensor(ex["input_ids"][:self.max_length]) for ex in examples]
        attention_mask = [torch.tensor(ex["attention_mask"][:self.max_length]) for ex in examples]

        # Right-pad to the longest sequence in the batch (not to max_length).
        # Padded input ids are 0; the attention mask marks them as padding.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=0
        )

        attention_mask = torch.nn.utils.rnn.pad_sequence(
            attention_mask,
            batch_first=True,
            padding_value=0
        )

        # Labels mirror input_ids for language modeling, except on padding:
        # without this mask the model would be trained to predict token id 0.
        labels = input_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
from unsloth import is_bf16_supported
from trl import SFTTrainer, SFTConfig

# Set up the training configuration and trainer

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    # Custom collator: the samples are already tokenized, so it only truncates,
    # pads and adds labels
    data_collator =  TextDataCollator(model, tokenizer),
    train_dataset = preprocessed_dataset,

    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 samples per device per optimizer step
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled: bf16 when supported, fp16 otherwise
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # "none": do not report to experiment trackers such as Weights and Biases

        # Settings taken from the vision fine-tuning recipe. They are kept here
        # because the dataset is pre-tokenized and batched by the custom collator.
        # NOTE(review): presumably these stop the trainer from re-processing the
        # dataset and from dropping its columns — confirm against the installed TRL version.
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048, # Same limit as the `max_length` used for tokenization
    ),
)

In [None]:
# Run the fine-tuning loop; the value returned by `train()` is kept in
# `trainer_stats` for later inspection
trainer_stats = trainer.train()

In [None]:
import pandas as pd
# Save the trainer's log history to "llama_ft_log.csv" under HOME_PATH
# (the DataFrame index is written as the first CSV column)
pd.DataFrame(trainer.state.log_history).to_csv(os.path.join(HOME_PATH,"llama_ft_log.csv"))

In [None]:
# Load the training logs into a DataFrame (reused by the plotting cell) and display it
df = pd.DataFrame(trainer.state.log_history)
df

In [None]:
import matplotlib.pyplot as plt

# Plot the logged training loss against the logged step
loss = df['loss']
step = df['step']

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(step, loss, label='Loss', color='blue', linewidth=2)

ax.set_title("Loss Curve", fontsize=16)
ax.set_xlabel("Step", fontsize=14)
ax.set_ylabel("Loss", fontsize=14)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Push the trained model and the tokenizer to the SAME Hugging Face Hub repository.
# The repository id is defined once so the two uploads cannot diverge, and the
# access token is read from the HF_TOKEN environment variable instead of being
# written into the notebook.
HF_REPO_ID = "your_hf_hub"  # placeholder: replace with "<user-or-org>/<repo-name>"
# When HF_TOKEN is unset, None is passed and authentication is left to the Hub
# client's own defaults (e.g. a login cached by `huggingface-cli login`).
HF_TOKEN = os.environ.get("HF_TOKEN")
model.push_to_hub(HF_REPO_ID, token=HF_TOKEN)
tokenizer.push_to_hub(HF_REPO_ID, token=HF_TOKEN)