In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_file_path = '../data/train_sample_theo.csv'
test_file_path = '../data/test_sample_theo.csv'

# Chargement des données
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

In [3]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=5)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Préparation des données
X_train = list(train_data["revue/texte"])
y_train = np.array([int(i) - 1 for i in train_data["revue/score"]])

X_test = list(test_data["revue/texte"])
y_test = np.array([int(i) - 1 for i in test_data["revue/score"]])

In [5]:
def tokenize(inputs):
    return tokenizer(inputs, padding=True, truncation=True, max_length=512, return_tensors="tf")

X_train_tokenized = tokenize(X_train)
X_test_tokenized = tokenize(X_test)

In [6]:
# Convertir BatchEncoding en un type hashable
X_train_tokenized_hashable = (X_train_tokenized['input_ids'], X_train_tokenized['token_type_ids'], X_train_tokenized['attention_mask'])
X_test_tokenized_hashable = (X_test_tokenized['input_ids'], X_test_tokenized['token_type_ids'], X_test_tokenized['attention_mask'])

# Convertir les étiquettes en ensembles de données TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_tokenized_hashable, y_train)).shuffle(len(X_train)).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_tokenized_hashable, y_test)).batch(8)

In [7]:
# Compilation du modèle
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [8]:
# Entraînement du modèle
model.fit(train_dataset, epochs=2, validation_data=test_dataset, batch_size=40)

Epoch 1/2




Epoch 2/2


<keras.src.callbacks.History at 0x260f3aa8670>

In [10]:
# Sauvegarde du modèle
save_path = '../models/polarity'
model.save(save_path)

INFO:tensorflow:Assets written to: ../models/polarity\assets


INFO:tensorflow:Assets written to: ../models/polarity\assets


In [11]:
# Évaluation du modèle
eval_loss, eval_accuracy = model.evaluate(test_dataset)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")

Eval Loss: 1.3840978145599365, Eval Accuracy: 0.495991975069046


In [26]:
from transformers import BertTokenizer
import tensorflow as tf

# Chemin vers le modèle sauvegardé
model_path = '../models/polarity'

# Chargement du modèle
loaded_model = tf.keras.models.load_model(model_path)

# Initialisation du tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def prepare_input(text):
    tokens = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="tf")
    return {'input_ids': tokens['input_ids'], 'token_type_ids': tokens['token_type_ids'], 'attention_mask': tokens['attention_mask']}

# Exemple de texte à classer
#text = "Ce livre est vraiment génial, je le recommande à tout le monde !"
#text = "Je déconseille ce livre, il est vraiment nul"
text = "null"

# Préparation de l'entrée
prepared_input = prepare_input(text)

# Inférence
predictions = loaded_model.predict(prepared_input)
predicted_class = tf.argmax(predictions['logits'], axis=1).numpy()[0] + 1

print(f"Classe prédite: {predicted_class}")



Classe prédite: 5
