# SENTENCE CLASSIFICATION

## 1. Loading packages and dataframe

In [2]:
import os
from pathlib import Path
import pandas as pd

import torch

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [4]:
from src.config import CLASSES
from src.sentence_clf.data_loader import ClassificationDataset
from src.utils import *

ModuleNotFoundError: No module named 'src'

## 2. Loading the Tokenizer and the pretrained model

We define here the pretrained model that we will to build a sentence classifier. And then, we will use a sample data file that is available in the repository `data/sample_data.csv` . This file contains sentences in French and it includes **è classes**.

Since the sentences are in French, we have chosen a well-known pretrained model: **camembert-base**.

**NOTE:** Yous should use the same model name to load the Tokenizer, since each pretrained model has its own tokenizer. Using a different Tokenizer could cause some problems during the training or the evaluation phase.

In [13]:
model = "camembert-base"
nb_labels = 7

# Set the model and the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model)
clf_model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=nb_labels)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 

## 3. Preparing the training set

We have defined the **sentence classes** in the `src/config.py` file. To fine-tune the pre-trained model to perform a multiclass classification task, we should transform our **target variable** using a **one-hot encoder** before to train the model, since we have many output in the model as the number of classes that we want to detect.



In [14]:
encoder = get_on_hot_encoder(data=list(CLASSES.values()))


NameError: name 'get_on_hot_encoder' is not defined

In [None]:
# Load the training and validation sets
train_df = pd.read_csv("data/sample_data.csv", sep='|')

# Map the classes to numeric values
train_df['new_label'] = train_df['labels'].map(CLASSES)

# Use the one-hot encoder to transform our target variable
train_df['encoded_label'] = train_df["new_label"].apply(
    lambda x: encoder.transform([[x]]).toarray().tolist()[0])

We split the training set into **training** and **validation** sets using only the **text** and the **encoded label** columns. Then, we should prepare these data sets to feed them to the Trainer of HugginFace. For this, we should tokenize the text using the **Tokenizer** and then we use the Dataset Loader that allows to load the data set by batch.

In [None]:
# Split data into training and validation sets
texts = train_df["text"].tolist()
labels = train_df["encoded_label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=.1, random_state=17, stratify=labels
)

# Prepare train and val sets for the training
train_encodings = tokenizer(train_texts, truncation=True, max_length=300,
                            padding=True)
val_encodings = tokenizer(val_texts, truncation=True, max_length=300,
                          padding=True)

train_dataset = ClassificationDataset(train_encodings, train_labels)
val_dataset = ClassificationDataset(val_encodings, val_labels)

## 4. Configure the Trainer

To fine-tune the pretrained model, we use the **Trainer** module from HugginFace. For this, we should first configure the training arguments to use during the training.

In [None]:
# Config the Trainer
training_args = TrainingArguments(
    output_dir="./output_model",     # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=20,                # number of steps before to store training metrics
    evaluation_strategy="steps",     # strategy to compute the training metrics
    save_strategy="steps",           # should be the same as evaluation_strategy
    load_best_model_at_end=True,     # load the best model at the end of the training
    report_to="none",                # useful if used with mlflow for training reporting
    run_name="none",                 # name of the run to report to mlflow
)

In [None]:
# Preparing the Trainer
trainer = Trainer(
    model=clf_model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # function to compute the metrics during the training
)

## 5. Training

In [None]:
# Run the Trainer
trainer.train()

In [7]:
# Saving the BEST MODEL
trainer.save_model(output_dir="./output_model")

NameError: name 'trainer' is not defined

## 6. Evaluation - Metrics

In [8]:
# Computing metrics on training and validation sets
train_pred = trainer.predict(test_dataset=train_dataset, metric_key_prefix="train")
val_pred = trainer.predict(test_dataset=val_dataset, metric_key_prefix="val")

NameError: name 'trainer' is not defined

In [9]:
# Printing metrics
print(train_pred.metrics)
print(val_pred.metrics)

NameError: name 'train_pred' is not defined

In [10]:
# Training Loss evolution
history = trainer.state.log_history
plot_history_loss(
    history=history,
    output_file=None,
)

NameError: name 'trainer' is not defined

In [11]:

# Saving the VALIDATION SET
inverse_rof_classes = {v: k for k, v in CLASSES.items()}

val_df = pd.DataFrame(columns=["text", "true_label", "prediction_label", "prediction_score"])
val_df['text'] = val_texts
val_df['true_label_nb'] = encoder.inverse_transform(val_labels)
val_df['true_label'] = val_df['true_label_nb'].map(inverse_rof_classes)
idx = np.argmax(val_pred.predictions, axis=-1)
val_df['prediction_label_nb'] = idx
val_df['prediction_label'] = val_df['prediction_label_nb'].map(inverse_rof_classes)
scores = torch.nn.functional.softmax(torch.tensor(val_pred.predictions), dim=1).tolist()
val_df['prediction_score'] = [score[index] for score, index in zip(scores, idx)]

NameError: name 'CLASSES' is not defined

In [12]:
plot_confusion_matrix(
    y_true=val_df["true_label_nb"].tolist(),
    y_pred=val_df["prediction_label_nb"].tolist(),
    labels=list(CLASSES.keys()),
    output_file=None,
)

NameError: name 'plot_confusion_matrix' is not defined