In [None]:
# Mount Google Drive at /content/drive; the data and output paths used later in
# this notebook all live under that mount point.
from google.colab import drive
# force_remount=True asks Colab to redo the mount even if one is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd

# Base folder (on the mounted Drive) holding the project's CSV files.
# Kept with its trailing slash so file names can simply be appended.
import_save = '/content/drive/MyDrive/rdva2024/data/'

# Size and seed of the validation subsample, named so they are easy to find and tune.
VALID_SAMPLE_SIZE = 1000
VALID_SAMPLE_SEED = 46

# Training set: read in full.
df_train = pd.read_csv(import_save + "train_llama_equal_weights.csv")

# Validation set: a fixed-seed random subsample of the validation file.
df_valid = pd.read_csv(import_save + "valid_llama.csv").sample(
    VALID_SAMPLE_SIZE, random_state=VALID_SAMPLE_SEED
)

In [None]:
def text_and_label(dataset):
    """Extract the review texts and their ratings from a DataFrame.

    Args:
        dataset: pandas DataFrame with a 'full_text' column and a 'rating' column.

    Returns:
        dict with two parallel lists:
            'text'  -- the values of the 'full_text' column, in row order
            'label' -- the values of the 'rating' column, in row order

    Raises:
        KeyError: if either column is missing from ``dataset``.
    """
    # Read each column in one shot instead of looping with iterrows():
    # iterrows() builds a Series per row (slow) and can upcast values when the
    # frame mixes numeric dtypes; column-wise access keeps each column's own dtype.
    return {
        'text': dataset["full_text"].tolist(),
        'label': dataset["rating"].tolist()
    }

# Example usage: convert the training frame first, then the validation frame,
# into {'text': [...], 'label': [...]} dictionaries.
dataset_train_format, dataset_valid_format = (
    text_and_label(frame) for frame in (df_train, df_valid)
)

In [None]:
import os
import torch
import huggingface_hub
import sklearn
import numpy as np

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from torch import nn
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [None]:
# Class index (0..4) -> rating (1..5), and the inverse mapping rating -> class index.
id2label = {class_idx: class_idx + 1 for class_idx in range(5)}
label2id = {rating: class_idx for class_idx, rating in id2label.items()}

# bitsandbytes settings: 4-bit loading, 'nf4' quantization type,
# double quantization enabled, bfloat16 as compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint with a 5-way sequence-classification head, quantized as configured above.
llama_classif = AutoModelForSequenceClassification.from_pretrained(
    "unsloth/Llama-3.2-3B",
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
    quantization_config=quantization_config,
)

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at unsloth/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA adapter settings; adapters are attached to the four attention projections
# named in target_modules.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias='none',
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
llama_classif = get_peft_model(
    prepare_model_for_kbit_training(llama_classif),
    lora_config,
)

# Tokenizer for the same checkpoint; the EOS token is reused as the padding token.
llama_tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B")
llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# Mirror the padding id into the model config and set the remaining config flags.
model_config = llama_classif.config
model_config.pad_token_id = llama_tokenizer.pad_token_id
model_config.use_cache = False
model_config.pretraining_tp = 1

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [None]:
# Tokenise the texts and build the train/validation datasets.

def _to_classification_dataset(encoding, ratings):
    """Build a `datasets.Dataset` from a tokenizer output and the raw ratings.

    Args:
        encoding: tokenizer output exposing 'input_ids' and 'attention_mask'.
        ratings: iterable of ratings; each one is converted to its class index
            through the module-level `label2id` mapping.

    Returns:
        Dataset with the columns 'input_ids', 'attention_mask' and 'label'.

    Raises:
        KeyError: if a rating is not a key of `label2id`.
    """
    return Dataset.from_dict({
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        # Map ratings to class indices up front; no per-example `.map` pass is needed.
        "label": [label2id[rating] for rating in ratings],
    })


# No `padding=True` here: padding the whole split to its single longest text makes
# every batch as long as that outlier. Padding is left to the DataCollatorWithPadding
# passed to the trainer, which pads each batch to its own longest sequence.
# NOTE(review): truncation=True is kept without an explicit max_length, so the
# tokenizer's own maximum length applies -- consider setting max_length explicitly.
train_encoding_text = llama_tokenizer(dataset_train_format["text"], truncation=True)
valid_encoding_text = llama_tokenizer(dataset_valid_format["text"], truncation=True)

train_dataset = _to_classification_dataset(train_encoding_text, dataset_train_format["label"])
valid_dataset = _to_classification_dataset(valid_encoding_text, dataset_valid_format["label"])

dataset = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset
})

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import torch.nn.functional as F

def compute_metrics(p):
    """Compute accuracy and mean squared error for an evaluation pass.

    Args:
        p: a pair that unpacks into (predictions, labels); predictions holds
           one row of class scores per example, labels the true class indices.

    Returns:
        dict with the keys 'accuracy' and 'mse', both computed between the
        true class indices and the arg-max class of each prediction row.
    """
    class_scores, gold = p

    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(class_scores, axis=1)

    return dict(
        accuracy=accuracy_score(gold, predicted),
        mse=mean_squared_error(gold, predicted),
    )

class CustomTrainer(Trainer):
    """Trainer subclass whose loss is a plain cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the cross-entropy loss for one batch.

        Args:
            model: the model to run the batch through.
            inputs: batch mapping; its "labels" entry is removed and cast to
                long, the remaining entries are passed to the model as keywords.
            return_outputs: when True, return (loss, outputs) instead of the loss alone.
            num_items_in_batch: accepted but not used by this implementation.

        Returns:
            The loss, or the tuple (loss, outputs) when ``return_outputs`` is True.
        """
        targets = inputs.pop("labels").long()
        model_output = model(**inputs)
        batch_logits = model_output.get('logits')
        loss = F.cross_entropy(batch_logits, targets)

        ## We experimented with a modified loss that takes the ordinal nature of the
        ## labels (distance between scores) into account when computing the gradient
        #k = logits.shape[-1]
        #predicted_class = torch.argmax(logits, dim=-1)
        #true_class = labels
        #w = torch.abs(predicted_class - true_class).float() / (k - 1)

        #loss = (1 + w) * F.cross_entropy(logits, labels, reduction='none')
        #loss = loss.mean()

        if return_outputs:
            return (loss, model_output)
        return loss

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.05,
    warmup_steps=5,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Evaluate and checkpoint once per epoch
    eval_strategy='epoch',
    save_strategy='epoch',
    # Reproducibility
    seed=89,
)

# Collator built from the tokenizer; it is handed to the trainer as data_collator.
collate_fn = DataCollatorWithPadding(tokenizer=llama_tokenizer)

trainer = CustomTrainer(
    model=llama_classif,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning; the returned object is kept so it can be pickled next to the saved model.
train_results = trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Mse
1,1.3119,0.927054,0.633,0.668
2,0.8845,0.770749,0.716,0.504


  return fn(*args, **kwargs)


In [None]:
# Save the model fine-tuned on the classification task

from datetime import datetime
import pickle

# A minute-resolution timestamp in the folder name gives each run its own output directory.
timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M')
output_dir = f'/content/drive/MyDrive/rdva2024/model_llama32_3b_output_EqWtd_{timestamp}'
os.makedirs(output_dir, exist_ok=True)

# Persist the object returned by trainer.train() alongside the model files.
train_results_path = os.path.join(output_dir, 'train_results.pkl')
with open(train_results_path, 'wb') as results_file:
    pickle.dump(train_results, results_file)

# Write the model, then the tokenizer, into the same directory.
trainer.save_model(output_dir)
llama_tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/rdva2024/model_llama32_3b_output_EqWtd_2024_12_03_05_29/tokenizer_config.json',
 '/content/drive/MyDrive/rdva2024/model_llama32_3b_output_EqWtd_2024_12_03_05_29/special_tokens_map.json',
 '/content/drive/MyDrive/rdva2024/model_llama32_3b_output_EqWtd_2024_12_03_05_29/tokenizer.json')

In [None]:
# Short section analysing the results on the validation set

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd

# Class indices, shared by the confusion matrix and the per-class metrics so that
# both always report the same five classes in the same order, even when a class
# happens to be absent from the ground truth and the predictions.
CLASS_IDS = [0, 1, 2, 3, 4]

preds_valid = trainer.predict(dataset['valid'])
# Despite its name, `labels_valid` holds the PREDICTED class index of each example.
labels_valid = np.argmax(preds_valid.predictions, axis=1)
# Ground-truth class indices, materialised once and reused below.
true_valid = list(dataset['valid']['label'])

# Confusion matrix (rows: true class, columns: predicted class)
matrice_confusion = confusion_matrix(y_true=true_valid, y_pred=labels_valid, labels=CLASS_IDS)
print(matrice_confusion)

# Table of precision, recall, f1-score and support (one column per class)
metrics_df = pd.DataFrame()
metrics_df['score'] = ["precision", "recall", "f1score", "support"]

metrics = pd.DataFrame(
    precision_recall_fscore_support(
        y_true=true_valid,
        y_pred=list(labels_valid),
        average=None,
        labels=CLASS_IDS
    )
)
pd.concat([metrics_df, metrics], axis=1, join='inner')

In [None]:
# Last cell of the notebook: hand the Colab runtime back once everything above has run.
# NOTE(review): presumably this disconnects the session and frees the accelerator;
# anything not written to Drive would then be lost -- confirm before relying on it.
from google.colab import runtime
runtime.unassign()