# Bert classification

In [1]:
!pip install AugmentedSocialScientist pandas numpy



In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from torch import cuda

cuda.get_device_name(0)

'Tesla T4'

In [1]:
import pandas as pd

annotations_file = pd.read_csv("../data/intermediate/current_annotation_file.csv")
label_categories = {
    "Poubelle" : "Anything but personne",
    "Humain comme empathie" : "Anything but personne",
    "Genre/espèce humaine": "Anything but personne",
    "Matériel biologique": "Anything but personne",
    "Individu": "Anything but personne",
    "Agent responsable":"Personne",
    "Personne relationnelle": "Personne",
    "Personne à protéger":"Personne"

}
annotations_file['label'] = annotations_file['Personne'].map(label_categories)
annotations_file.sample(5)

Unnamed: 0,sentence_id,Personne,sentence_text,label
630,80420,Matériel biologique,La phase critique de l'implantation se situe e...,Anything but personne
509,1280800,Agent responsable,"Au terme de cette réflexion sur l’âgisme, ten...",Personne
694,1060067,Individu,Spécificité épidémiologique de la pandémie gri...,Anything but personne
408,980024,Matériel biologique,"A ces techniques classiques, s'est ajoutée une...",Anything but personne
519,1280489,Personne relationnelle,Un enjeu essentiel est \nde faire en sorte que...,Personne


In [2]:
annotations = annotations_file.dropna()
annotations = annotations[~annotations["Personne"].isin(["Humain comme empathie", "Poubelle"])]
annotations.sample(5)

Unnamed: 0,sentence_id,Personne,sentence_text,label
293,540522,Genre/espèce humaine,Clonage reproductif : une inadmissible instrum...,Anything but personne
445,790389,Personne à protéger,"D’un côté, l’absence de consentement a pu limi...",Personne
656,570528,Individu,A la\ncollectivité incombe (i) l'identificatio...,Anything but personne
328,1020347,Personne à protéger,6) Il est illusoire de favoriser l’insertion ...,Personne
359,490124,Personne à protéger,Quelles que soient les capacités affectives du...,Personne


In [3]:
annotations["Personne"].value_counts()

Personne
Personne à protéger       143
Matériel biologique       127
Individu                  118
Agent responsable         104
Personne relationnelle     97
Genre/espèce humaine       90
Name: count, dtype: int64

In [4]:
annotations["label"].value_counts()

label
Personne                 344
Anything but personne    335
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
 
train_data, test_data = train_test_split(annotations, train_size=0.67)

In [7]:
from AugmentedSocialScientist.models import Camembert

bert = Camembert()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


There are 1 GPU(s) available.
We will use GPU 0: Tesla T4


In [8]:
batch_size = 16
train_loader = bert.encode(train_data.sentence_text.values,
train_data.label.values, batch_size=batch_size)

  0%|          | 0/466 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1022 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/466 [00:00<?, ?it/s]

label ids: {'Anything but personne': 0, 'Personne': 1}


In [9]:
test_loader = bert.encode(test_data.sentence_text.values,
test_data.Personne.values, batch_size=batch_size)

  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

label ids: {'Anything but personne': 0, 'Personne': 1}


In [10]:
scores = bert.run_training(
    train_loader,
    test_loader,
    n_epochs=3,
    lr=2e-5,
    random_state=42,
    save_model_as='clickbait'
    )

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 0.68
  Training took: 0:00:42

Running Validation...

  Average test loss: 0.65
  Validation took: 0:00:06
                       precision    recall  f1-score   support

Anything but personne       0.76      0.68      0.71       124
             Personne       0.67      0.75      0.70       107

             accuracy                           0.71       231
            macro avg       0.71      0.71      0.71       231
         weighted avg       0.72      0.71      0.71       231


Training...

  Average training loss: 0.60
  Training took: 0:00:42

Running Validation...

  Average test loss: 0.58
  Validation took: 0:00:07
                       precision    recall  f1-score   support

Anything but personne       0.86      0.63      0.73       124
             Personne       0.67      0.88      0.76       107

             accuracy                           0.74       231
            macro avg       0.76      0.75      0.74       231
         w

In [11]:
import numpy as np
pred_data = annotations_file[annotations_file.isna().any(axis=1)].drop('Personne', axis=1).dropna()
pred_loader = bert.encode(pred_data.sentence_text.values, batch_size=batch_size)
pred_proba = bert.predict_with_model(pred_loader, model_path='./models/clickbait')
pred_data['pred_label'] = np.argmax(pred_proba, axis=1)
pred_data['pred_proba'] = np.max(pred_proba, axis=1)

  0%|          | 0/798 [00:00<?, ?it/s]

  0%|          | 0/798 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

label ids: {'Anything but personne': 0, 'Personne': 1}


In [12]:
pred_data

Unnamed: 0,sentence_id,sentence_text,pred_label,pred_proba
700,1220587,S’il semble dès lors naïf de penser que le sen...,0,0.502037
701,30245,Personne ne soutient qu'il soit possible de to...,0,0.608660
702,880051,Le temps de parole doit être suffisant pour \n...,1,0.741629
703,1120305,»\n24 Avis N°107 du 15 octobre 2009 sur les p...,0,0.738585
704,600333,Sur la question du champ d'application de la l...,0,0.551886
...,...,...,...,...
1495,940495,La garde à vue est une mesure privative de lib...,1,0.689244
1496,1200490,50\t\n “Il\t\n convient\t\n surtout\t\n de...,0,0.675180
1497,1400183,Cette mosaïque de situ ations d’inégalité s en...,0,0.507050
1498,670025,Dans le souci de clarification exposé plus hau...,0,0.730586
