<h1>Classification des étoiles avec CamemBERT</h1>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer,TrainingArguments, CamembertForSequenceClassification,Trainer,pipeline,AutoTokenizer, TFCamembertForSequenceClassification
import torch 
import os
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from torch.utils.data import Dataset

# Project-local helpers: execute the sibling notebook DST_fun.ipynb in this kernel.
# The note at the end of the %run line names the helpers expected from it
# (model_report() and review_vector()) -- presumably defined there; not visible here.
%run 'DST_fun.ipynb' # model_report() and review_vector() 

# Set the TQDM_NOTEBOOK environment variable for this process.
# NOTE(review): presumably meant to select notebook-style progress bars --
# confirm that the installed tqdm/transformers versions actually read it.
os.environ["TQDM_NOTEBOOK"] = "1"

class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Parameters
    ----------
    encodings : mapping
        Maps a field name to a per-sample sequence of values; it must expose
        ``.items()`` and each value must be indexable by sample position.
    labels : sequence
        One label per sample, indexable by sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'`` entry.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Training reviews: the first CSV column becomes the DataFrame index and the
# review text is cast to str before it is handed to the tokenizer.
train_df = pd.read_csv(
    "../data/avis/train_noYC_lemma_sent_equil.csv", index_col=0
).astype({"text_total": "str"})
X_train = train_df["text_total"]
y_train = train_df["etoiles"]

# Held-out reviews (split into validation and test later); same str cast.
test_df = pd.read_csv(
    '../data/avis/test_noYC_lemma_sent_equil.csv'
).astype({"text_total": "str"})
X_temp = test_df["text_total"]
y_temp = test_df["etoiles"]

model_type = "Camembert"

# Quick look at the first two training texts.
X_train.head(2)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


5529     Permet de faire des factures et des…Permet de ...
57109    Bien accompagnéBien accompagné, sympathique, p...
Name: text_total, dtype: object

In [2]:
# Split the held-out reviews 50/50 into validation and test sets (fixed seed
# so the split is reproducible).
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=7
)

# Star ratings are shifted down by one because the classifier's class ids
# start at 0.
# y_train is derived from train_df (not from y_train itself) so that re-running
# this cell cannot subtract 1 a second time.
y_train = train_df["etoiles"] - 1
y_valid = y_valid - 1
y_test = y_test - 1

# Convert the texts to plain Python lists.
# X_train is rebuilt from train_df for the same reason: calling .tolist() on an
# already-converted list would raise AttributeError on a re-run.
X_train = train_df["text_total"].tolist()
X_valid = X_valid.tolist()
X_test = X_test.tolist()

In [3]:
# Start the wall-clock timer; compute_metrics() later reads t0 to report the
# elapsed time in minutes.
t0 = time.time()

# Alternative checkpoint considered earlier: "tblard/tf-allocine", loaded with
# AutoTokenizer / TFCamembertForSequenceClassification (num_labels = 5).
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Encode the three splits with identical options: truncate to at most 512
# tokens and pad within each call.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (X_train, X_valid, X_test)
)

# Sequence-classification head with one output per star rating (5 classes).
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=5,
)

# Text-classification pipeline wrapping the same model and tokenizer objects.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Wrap each split's encodings and 0-based labels in a torch Dataset.
train_dataset = CustomDataset(train_encodings, y_train.tolist())
valid_dataset = CustomDataset(valid_encodings, y_valid.tolist())
# Fix: the test encodings must be paired with the test labels. This line
# previously passed y_valid, so any metric computed on test_dataset would have
# compared predictions against the wrong targets.
test_dataset = CustomDataset(test_encodings, y_test.tolist())

In [5]:
# Model training.
# The `model` object loaded in the tokenization cell is reused here rather than
# being reloaded from the 'camembert-base' checkpoint.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='../models',
    logging_dir=".logs",
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
)

trainer.train()

  1%|          | 82/8016 [02:00<3:16:04,  1.48s/it]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score one evaluation pass and log it to the benchmark CSV.

    Parameters
    ----------
    pred : object
        Must expose ``label_ids`` (true class ids) and ``predictions``
        (per-class scores; the argmax over the last axis is taken as the
        predicted class).

    Returns
    -------
    dict
        ``accuracy`` plus weighted-average ``f1``, ``precision`` and ``recall``.

    Side effects
    ------------
    Overwrites ``../reports/benchmark/camembert_model_benchmark.csv`` with a
    one-row summary. The target directory is created when missing so that a
    missing folder cannot abort the evaluation.

    Notes
    -----
    Reads the module-level ``t0`` timestamp; the reported duration is the time
    elapsed since ``t0`` was set, in minutes.
    """
    t1 = time.time()
    delais = round((t1 - t0) / 60, 2)
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    bench = pd.DataFrame({"model":"Camembert classification",
                            "grid search": "no",
                            "used/best params":"na",
                            "features": "text total",
                            "score":"na",
                            "precision": [precision],
                            "recall": [recall],
                            "f1":[f1],
                            "time_taken_mns":[delais],
                            "run_date": [time.strftime('%Y-%m-%d', time.localtime())]
                        })

    # DataFrame.to_csv raises OSError when the parent directory does not exist,
    # so make sure it is there before writing.
    bench_path = '../reports/benchmark/camembert_model_benchmark.csv'
    os.makedirs(os.path.dirname(bench_path), exist_ok=True)
    bench.to_csv(bench_path)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Build a fresh Trainer around the same model, arguments and datasets, this
# time with compute_metrics attached so evaluate() reports the custom metrics.
trainer = Trainer(
    compute_metrics=compute_metrics,
    model=model,
    args=training_args,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
)

# Evaluate on the eval dataset configured above and show the metric dict.
results = trainer.evaluate()
print(results)

In [None]:
# Persist the model through the Trainer.
trainer.save_model(output_dir="../models/camembert/model_dst_camembert")

# The tokenizer is written to its own directory in case it has to be reloaded
# separately from the model.
tokenizer.save_pretrained(save_directory="../models/camembert/tokenizer_dst_camembert")

In [None]:
# Running predictions on the test set -- currently disabled.
# Everything below is wrapped in a triple-quoted string, so none of it executes.
# NOTE(review) before re-enabling this draft:
#   - it passes `test_dataset` to the tokenizer; presumably the raw test texts
#     (X_test) were intended -- confirm.
#   - `model_report()` is called without arguments; its signature is defined in
#     DST_fun.ipynb and is not visible from this notebook -- verify the call.

"""
test_encodings = tokenizer(test_dataset, truncation=True, padding=True, max_length=512, return_tensors="pt")

# trying with batches
y_pred = []
batch_size = 16
for i in range(0, test_encodings.input_ids.size(0), batch_size):
    batch = {k: v[i:i+batch_size] for k, v in test_encodings.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    batch_predictions = logits.argmax(-1)
    y_pred.extend(batch_predictions.tolist())
# Camembert reporting
model_report()
"""