In [26]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, GlueDataset, default_data_collator
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Charger les données
data = pd.read_csv("../Data/train_df.csv")
data = data.dropna(subset=['words'])
data = data.sample(1000)

# Tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_len = 128

# Encoder le texte
def encode_text(texts, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_len,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

input_ids, attention_masks = encode_text(data['words'], tokenizer, max_len)

# Diviser les données en ensembles d'entraînement et de validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, data['target'].values, test_size=0.1, random_state=42)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, test_size=0.1, random_state=42)

# Convertir les tenseurs en objets torch.Tensor
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
eval_dataset = TensorDataset(validation_inputs, validation_masks, validation_labels)


# Initialiser le modèle BERT
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Définir les arguments d'entraînement
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    warmup_steps=0,
    weight_decay=0.01,
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

def my_data_collator(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.tensor([f[2] for f in features])
    return batch

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=my_data_collator
)

# Lancer l'entraînement
trainer.train()


  train_inputs = torch.tensor(train_inputs)
  validation_inputs = torch.tensor(validation_inputs)
  train_masks = torch.tensor(train_masks)
  validation_masks = torch.tensor(validation_masks)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/87 [09:28<?, ?it/s]
                                               
 11%|█▏        | 10/87 [02:48<21:50, 17.01s/it]

{'loss': 0.6932, 'grad_norm': 1.8605074882507324, 'learning_rate': 1.770114942528736e-05, 'epoch': 0.34}



[A
                                               
[A                                   

 11%|█▏        | 10/87 [03:01<21:50, 17.01s/it]
[A
[A

{'eval_loss': 0.6875459551811218, 'eval_runtime': 12.5805, 'eval_samples_per_second': 7.949, 'eval_steps_per_second': 0.159, 'epoch': 0.34}


                                               
 23%|██▎       | 20/87 [05:55<20:22, 18.24s/it]

{'loss': 0.6819, 'grad_norm': 4.173459053039551, 'learning_rate': 1.540229885057471e-05, 'epoch': 0.69}



[A
                                               

[A[A                                       
 23%|██▎       | 20/87 [06:08<20:22, 18.24s/it]
[A
[A

{'eval_loss': 0.6639307141304016, 'eval_runtime': 12.3756, 'eval_samples_per_second': 8.08, 'eval_steps_per_second': 0.162, 'epoch': 0.69}


                                               
 34%|███▍      | 30/87 [09:01<14:40, 15.45s/it]

{'loss': 0.6682, 'grad_norm': 3.397709608078003, 'learning_rate': 1.310344827586207e-05, 'epoch': 1.03}



[A
                                               

[A[A                                       
 34%|███▍      | 30/87 [09:14<14:40, 15.45s/it]
[A
[A

{'eval_loss': 0.6561877727508545, 'eval_runtime': 13.2328, 'eval_samples_per_second': 7.557, 'eval_steps_per_second': 0.151, 'epoch': 1.03}


                                               
 46%|████▌     | 40/87 [12:11<13:44, 17.54s/it]

{'loss': 0.6364, 'grad_norm': 5.4783549308776855, 'learning_rate': 1.0804597701149427e-05, 'epoch': 1.38}



[A
                                               

[A[A                                       
 46%|████▌     | 40/87 [12:24<13:44, 17.54s/it]
[A
[A

{'eval_loss': 0.6133930087089539, 'eval_runtime': 12.7889, 'eval_samples_per_second': 7.819, 'eval_steps_per_second': 0.156, 'epoch': 1.38}


                                               
 57%|█████▋    | 50/87 [15:14<10:29, 17.02s/it]

{'loss': 0.5783, 'grad_norm': 6.04926061630249, 'learning_rate': 8.505747126436782e-06, 'epoch': 1.72}



[A
                                               

[A[A                                       
 57%|█████▋    | 50/87 [15:27<10:29, 17.02s/it]
[A
[A

{'eval_loss': 0.624743640422821, 'eval_runtime': 12.5624, 'eval_samples_per_second': 7.96, 'eval_steps_per_second': 0.159, 'epoch': 1.72}


                                               
 69%|██████▉   | 60/87 [18:11<06:45, 15.03s/it]

{'loss': 0.5674, 'grad_norm': 9.93677043914795, 'learning_rate': 6.206896551724138e-06, 'epoch': 2.07}



[A
                                               

[A[A                                       
 69%|██████▉   | 60/87 [18:24<06:45, 15.03s/it]
[A
[A

{'eval_loss': 0.5930447578430176, 'eval_runtime': 12.6674, 'eval_samples_per_second': 7.894, 'eval_steps_per_second': 0.158, 'epoch': 2.07}


                                               
 80%|████████  | 70/87 [21:13<04:52, 17.19s/it]

{'loss': 0.5259, 'grad_norm': 9.857807159423828, 'learning_rate': 3.908045977011495e-06, 'epoch': 2.41}



[A
                                               

[A[A                                       
 80%|████████  | 70/87 [21:26<04:52, 17.19s/it]
[A
[A

{'eval_loss': 0.6028279066085815, 'eval_runtime': 12.4939, 'eval_samples_per_second': 8.004, 'eval_steps_per_second': 0.16, 'epoch': 2.41}


                                               
 92%|█████████▏| 80/87 [24:16<01:59, 17.01s/it]

{'loss': 0.4896, 'grad_norm': 9.012977600097656, 'learning_rate': 1.6091954022988506e-06, 'epoch': 2.76}



[A
                                               

[A[A                                       
 92%|█████████▏| 80/87 [24:28<01:59, 17.01s/it]
[A
[A

{'eval_loss': 0.5623685717582703, 'eval_runtime': 12.0499, 'eval_samples_per_second': 8.299, 'eval_steps_per_second': 0.166, 'epoch': 2.76}


                                               
100%|██████████| 87/87 [26:10<00:00, 18.05s/it]

{'train_runtime': 1570.6609, 'train_samples_per_second': 1.719, 'train_steps_per_second': 0.055, 'train_loss': 0.6039193526081655, 'epoch': 3.0}





TrainOutput(global_step=87, training_loss=0.6039193526081655, metrics={'train_runtime': 1570.6609, 'train_samples_per_second': 1.719, 'train_steps_per_second': 0.055, 'total_flos': 177599962368000.0, 'train_loss': 0.6039193526081655, 'epoch': 3.0})

In [28]:
# Importer les bibliothèques nécessaires
from transformers import pipeline

# Créer un pipeline pour la classification de séquence
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Texte à prédire
text = "Hello, how are you?"

# Prédire la classe du texte
result = classifier(text)

# Afficher les résultats
print(result)

[{'label': 'LABEL_1', 'score': 0.5244001150131226}]


In [29]:
text = "I hate you"

# Prédire la classe du texte
result = classifier(text)

# Afficher les résultats
print(result)

[{'label': 'LABEL_0', 'score': 0.6896698474884033}]


In [30]:
text = "I love you"

# Prédire la classe du texte
result = classifier(text)

# Afficher les résultats
print(result)

[{'label': 'LABEL_1', 'score': 0.7088402509689331}]


In [44]:
text = "I'm good"

# Prédire la classe du texte
result = classifier(text)

# Afficher les résultats
print(result)

[{'label': 'LABEL_1', 'score': 0.5209335088729858}]


In [45]:

model_path = "./model"
tokenizer_path = "./tokenizer"

model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json')