In [1]:
import pandas as pd
from transformers import AutoTokenizer
import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
# Assurez-vous que votre colonne cible ('type') est codée en entiers
from sklearn.preprocessing import LabelEncoder
import torch
from pytorch_lightning import LightningModule, Trainer
from datasets import Dataset
from datasets import DatasetDict
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Charger les données depuis un fichier Parquet
data = pd.read_parquet('data/curated/curated_data.parquet')

# Afficher les premières lignes pour vérifier
print(data.head())

   type                                              posts
0  INFJ  ' enfp and intj moments sportscenter not top t...
1  ENTP  'i'm finding the lack of me in these posts ver...
2  INTP  'good one _____ of course, to which i say i kn...
3  INTJ  'dear intp, i enjoyed our conversation the oth...
4  ENTJ  'you're fired. that's another silly misconcept...


In [3]:
# Tokeniser les textes


In [4]:


from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

data['input_ids'] = data['posts'].apply(lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=512))

# Créer des attention masks (utile si vous utilisez des séquences de longueur variable)
data['attention_mask'] = data['input_ids'].apply(lambda x: [int(token_id > 0) for token_id in x])


encoder = LabelEncoder()
data['labels'] = encoder.fit_transform(data['type'])


In [5]:


# Convertir le DataFrame pandas en Dataset Hugging Face
dataset = Dataset.from_pandas(data[['input_ids', 'attention_mask', 'labels']])
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [6]:

# Diviser le dataset en ensembles d'entraînement et de test
train_test_split = dataset.train_test_split(test_size=0.2)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})


In [7]:
from transformers import AutoModelForSequenceClassification

num_labels = len(set(data['labels']))  # Le nombre de classes distinctes
model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=len(set(data['labels'])))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
#for param in model.base_model.parameters():
#    param.requires_grad = False

In [13]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 4.21kB [00:00, 4.21MB/s]                   


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=lambda p: load_metric("accuracy").compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids),
)


In [10]:
trainer.train()


 19%|█▉        | 500/2604 [00:20<01:23, 25.07it/s]Checkpoint destination directory ./results\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 2.3514, 'learning_rate': 4.0399385560675886e-05, 'epoch': 0.58}


 38%|███▊      | 1000/2604 [00:40<01:04, 24.85it/s]Checkpoint destination directory ./results\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 2.2466, 'learning_rate': 3.079877112135177e-05, 'epoch': 1.15}


 58%|█████▊    | 1500/2604 [01:01<00:44, 24.98it/s]

{'loss': 2.2079, 'learning_rate': 2.1198156682027652e-05, 'epoch': 1.73}


 77%|███████▋  | 2000/2604 [01:21<00:24, 24.67it/s]

{'loss': 2.1683, 'learning_rate': 1.1597542242703534e-05, 'epoch': 2.3}


 96%|█████████▌| 2500/2604 [01:42<00:04, 24.81it/s]

{'loss': 2.1278, 'learning_rate': 1.996927803379416e-06, 'epoch': 2.88}


100%|██████████| 2604/2604 [01:46<00:00, 24.42it/s]

{'train_runtime': 106.6475, 'train_samples_per_second': 195.223, 'train_steps_per_second': 24.417, 'train_loss': 2.216509307767572, 'epoch': 3.0}





TrainOutput(global_step=2604, training_loss=2.216509307767572, metrics={'train_runtime': 106.6475, 'train_samples_per_second': 195.223, 'train_steps_per_second': 24.417, 'train_loss': 2.216509307767572, 'epoch': 3.0})

In [12]:
from datasets import load_metric

results = trainer.evaluate()
print(results)


218it [01:34,  3.46s/it]                         

  compute_metrics=lambda p: load_metric("accuracy").compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids),
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
434it [01:38,  4.41it/s]

{'eval_loss': 2.153076410293579, 'eval_accuracy': 0.2801152737752161, 'eval_runtime': 3.6967, 'eval_samples_per_second': 469.336, 'eval_steps_per_second': 58.701, 'epoch': 3.0}



