In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
os.chdir("/content/gdrive/My Drive/Studies/tweets/")

In [3]:
pip install transformers sentencepiece --quiet

In [4]:
import gc
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras import layers, backend as K
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import (
    f1_score, roc_auc_score,
    accuracy_score, classification_report
)

import transformers as tr
from transformers import TFAutoModel, AutoTokenizer
from transformers.modeling_tf_utils import TFSequenceClassificationLoss

In [5]:
tf.get_logger().setLevel('ERROR')
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print(f"{'='*80}\nREPLICAS: {strategy.num_replicas_in_sync}\n{'='*80}")

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU  grpc://10.73.166.42:8470
REPLICAS: 8


In [6]:
class Camembert(tr.TFRobertaPreTrainedModel, TFSequenceClassificationLoss):
    """
    Classic classifier w/ transformer layer: Camembert
    Using CLS token representation
    Output: array of (batch_size, num_labels)
    """
    config_class = tr.CamembertConfig
    _keys_to_ignore_on_load_missing = [r"pooler", r"lm_head"]
    
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.camembert = tr.TFRobertaMainLayer(
            config,
            name = "ro_camembert")
        self.stride = layers.Lambda(lambda x: x[:, 0, :], name = "stride")
        self.classifier = layers.Dense(
            4,
            activation = tf.keras.activations.softmax,
            name = "classifier")
    
    def call(self, inputs = None, **kwargs):
        outputs = self.camembert(inputs, **kwargs)
        sequences = outputs[0]
        cls_token = self.stride(sequences)
        return self.classifier(cls_token)

In [7]:
class Save(Callback):
  def __init__(self, path = "./", monitor = 'loss'):
    super(Save, self).__init__()
    self.path = path
    self.monitor = monitor

  def on_epoch_end(self, epoch, logs = None):
    path = f"{self.path}{epoch}-{logs[self.monitor]}"
    self.model.save_pretrained(path)

# Modèle sans prétraitement
Uniquement les URLs

In [49]:
train = pd.read_pickle('datasets/train_basic.pkl')
val = pd.read_pickle('datasets/val_basic.pkl')
test = pd.read_pickle('datasets/test_basic.pkl')

In [50]:
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16*strategy.num_replicas_in_sync
BATCH_D = BATCH_SIZE
MAX_LEN = 26
N_LABELS = 4
BUFFER = 300000
SEED = 42069
MODEL = "cam_base"
NTRAIN = train.shape[0]
NVAL = val.shape[0]
STEPS = int(np.ceil(NTRAIN/BATCH_D))
VAL_STEPS = int(np.ceil(NVAL/BATCH_D))

print("Total Steps:", STEPS)
print("Total Validation Steps:", VAL_STEPS)

Total Steps: 40
Total Validation Steps: 10


In [10]:
tr.set_seed(SEED)

In [11]:
with strategy.scope():
  model = Camembert.from_pretrained(f"weights/{MODEL}")
  tokenizer = AutoTokenizer.from_pretrained(f"weights/{MODEL}")

  save = Save(path = f"weights/{MODEL}/epochs/", monitor = "val_loss")
  early = tf.keras.callbacks.EarlyStopping(
      monitor = 'val_loss',
      patience = 2,
      restore_best_weights = True
  )
  callbacks = [save, early]
  

All model checkpoint layers were used when initializing Camembert.

All the layers of Camembert were initialized from the model checkpoint at weights/cam_base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Camembert for predictions without further training.


In [12]:
with strategy.scope():
  model.layers[0].trainable = False
  optimizer = tf.keras.optimizers.Adam(lr=1e-5)
  model.compile(optimizer = optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
encoder = LabelEncoder()

In [51]:
x_train = tokenizer.batch_encode_plus(
  train.tweet.to_list(), truncation=True, 
  return_tensors='tf', max_length=MAX_LEN,
  return_attention_mask = False,
  padding = "max_length")['input_ids']

x_val = tokenizer.batch_encode_plus(
  val.tweet.to_list(), truncation=True, 
  return_tensors='tf', max_length=MAX_LEN,
  return_attention_mask = False,
  padding = "max_length")['input_ids']

x_test = tokenizer.batch_encode_plus(
  test.tweet.to_list(), truncation=True, 
  return_tensors='tf', max_length=MAX_LEN,
  return_attention_mask = False,
  padding = "max_length")['input_ids']

y_train = encoder.fit_transform(train.label)
y_val = encoder.transform(val.label)
y_test = encoder.transform(test.label)

In [15]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(BUFFER)
    .batch(BATCH_D)
    .prefetch(AUTO)
)

val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_val, y_val))
    .batch(BATCH_D)
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_D)
)

## Entraînement du modèle

In [16]:
epochs_done = 0
history = model.fit(
  train_dataset,
  epochs = 200,
  steps_per_epoch = STEPS,
  callbacks = [early],
  validation_data = val_dataset,
  initial_epoch = epochs_done,
  workers = 8,
  use_multiprocessing = True
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200


In [17]:
with strategy.scope():
  model.layers[0].trainable = True
  optimizer = tf.keras.optimizers.Adam(lr=1e-6)
  model.compile(optimizer = optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
epochs_done = 42
history = model.fit(
  train_dataset,
  epochs = 200,
  steps_per_epoch = STEPS,
  callbacks = [early],
  validation_data = val_dataset,
  initial_epoch = epochs_done,
  workers = 8,
  use_multiprocessing = True
)

Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200


In [33]:
model.save_pretrained(f"weights/{MODEL}/nopreproc")

## Evaluation du modèle

In [34]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((
        tf.concat([x_train, x_val], axis = 0),
        np.concatenate([y_train, y_val])
      ))
    .repeat()
    .shuffle(BUFFER)
    .batch(BATCH_D)
    .prefetch(AUTO)
)

In [35]:
with strategy.scope():
  final_model_1 = Camembert.from_pretrained(f"weights/{MODEL}")
  final_model_1.layers[0].trainable = False
  optimizer = tf.keras.optimizers.Adam(lr=1e-5)
  final_model_1.compile(optimizer = optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

epochs_done = 0
history = final_model_1.fit(
  train_dataset,
  epochs = 42,
  steps_per_epoch = STEPS,
  initial_epoch = epochs_done,
  workers = 8, verbose = 0,
  use_multiprocessing = True
)

with strategy.scope():
  final_model_1.layers[0].trainable = True
  optimizer = tf.keras.optimizers.Adam(lr=1e-6)
  final_model_1.compile(optimizer = optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

epochs_done = 42
history = final_model_1.fit(
  train_dataset,
  epochs = 4,
  steps_per_epoch = STEPS,
  initial_epoch = epochs_done,
  workers = 8, verbose = 0,
  use_multiprocessing = True
)

All model checkpoint layers were used when initializing Camembert.

All the layers of Camembert were initialized from the model checkpoint at weights/cam_base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Camembert for predictions without further training.


In [36]:
pred_1 = final_model_1.predict(x_test)
y_1 = encoder.inverse_transform(np.argmax(pred_1, axis = 1))

In [44]:
binarizer = LabelBinarizer()

In [54]:
print(classification_report(test.label, y_1))

print("F1-score: {}\nAUC: {}\nAccuracy: {}".format(
    f1_score(test.label, y_1, average = "macro"),
    roc_auc_score(binarizer.fit_transform(y_test), pred_1, average = "macro", multi_class = "ovr"),
    accuracy_score(test.label, y_1)
))

              precision    recall  f1-score   support

     EMOTION       0.00      0.00      0.00       337
 INFORMATION       0.45      1.00      0.62      1473
     OPINION       0.18      0.00      0.00       950
   SENTIMENT       0.00      0.00      0.00       523

    accuracy                           0.45      3283
   macro avg       0.16      0.25      0.16      3283
weighted avg       0.25      0.45      0.28      3283

F1-score: 0.15583510327427194
AUC: 0.5295628692569901
Accuracy: 0.4480657934815717


  _warn_prf(average, modifier, msg_start, len(result))


# Modèle avec prétraitement mais sans Lem-Stem
Données classic_nos_nol