In [6]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/66/f8/38298237d18d4b6a8ee5dfe390e97bed5adb8e01ec6f9680c0ddf3066728/datasets-2.14.4-py3-none-any.whl.metadata
  Downloading datasets-2.14.4-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Obtaining dependency information for dill<0.3.8,>=0.3.0 from https://files.pythonhosted.org/packages/f5/3a/74a29b11cf2cdfcd6ba89c0cecd70b37cd1ba7b77978ce611eb7a146a832/dill-0.3.7-py3-none-any.whl.metadata
  Using cached dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets)
  Obtaining dependency information for xxhash from https://files.pythonhosted.org/packages/10/5c/fa0c8a5f903bfd899ecc20543d84d8765d457a7a05bd9319e70e571a0dc4/xxhash-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached xxhash-3.3.0-cp311-cp311-manylinux

In [None]:
# Preprocess imports
import re
# Minibatch imports
import os
import numpy as np
import pandas as pdp
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification,AutoModelForMaskedLM
from datasets import Dataset
#Trainer imports
import subprocess
from datetime import datetime
import random


#callback imports
from collections import defaultdict
import sys
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score, accuracy_score
import bisect
import json
#WarmUp imports
from typing import Callable
#Additional Imports
import torch
import gc
import pandas as pd

### Preprocessor

In [2]:
class DefaultENPreprocessor():
    """Default English tweet clean-up (the procedure labelled 'default').

    Reads the ``raw_text`` column of a dataframe and writes a ``text`` column
    in which URLs and @mentions are swapped for placeholders, runs of newlines
    become a single space, surrounding spaces are trimmed and ``&amp;`` is
    turned back into ``&``.
    """

    def __init__(self):
        # Regular expressions for preprocessing, taken from mb_generator.
        ignore_case = re.IGNORECASE
        self.url_group = r"(\bhttps?:\/\/\S+)"
        self.mention_group = r"(\B@\S+)"
        self.urls_mentions_re = re.compile(
            self.url_group + r"|" + self.mention_group, ignore_case
        )
        self.url_re = re.compile(self.url_group, ignore_case)
        self.mention_re = re.compile(self.mention_group, ignore_case)
        self.newline_re = re.compile(r"\n+", ignore_case)
        self.and_re = re.compile(r"&\s?amp\s?;", ignore_case)

    def _normalise(self, tweet):
        """Run the four substitutions on one tweet, in their fixed order."""
        without_urls = self.url_re.sub("URL", tweet)
        without_mentions = self.mention_re.sub("MENTION", without_urls)
        single_line = self.newline_re.sub(" ", without_mentions).strip(" ")
        return self.and_re.sub("&", single_line)

    def preprocess(self, adhoc_df):
        """Fill ``adhoc_df["text"]`` from ``adhoc_df.raw_text``.

        The dataframe is modified in place and the same object is returned.
        """
        print(
            ".... removing \\n and replacing @mentions and URLs by placeholders. "
            "Emoji filtering is not done."
        )

        adhoc_df["text"] = [self._normalise(tweet) for tweet in adhoc_df.raw_text.values]

        return adhoc_df

    def __call__(self, df, *args, **kwargs):
        """Alias for :meth:`preprocess`; extra arguments are accepted and ignored."""
        return self.preprocess(df)

### MinibatchLoader

In [3]:
# Defaults read by BalancedMiniBatchLoader below. A later cell in this notebook
# re-assigns several of these names (e.g. MAX_SEQ_LENGTH = 100); because they are
# looked up at call time, whichever assignment ran last in the kernel wins.

# Not referenced in the code visible here; presumably the share of training
# steps used for learning-rate warm-up -- TODO confirm against the LR schedule code.
WARM_UP_PERC = 0.1
# Default number of outer stratified folds when n_outer_splits is not given.
OUTER_CV = 5
# Default number of inner stratified folds when n_inner_splits is not given.
INNER_CV = 5
# max_length handed to the tokenizer in tokenize_function.
MAX_SEQ_LENGTH = 256
# Lower bound on the positive-example count used by get_steps_per_epoch.
TARGET_POS_PER_EPOCH = 5000
# Argument of .prefetch() on the batched balanced dataset.
NUM_PREFETCH = 5
# num_parallel_calls passed to .batch() on the balanced dataset.
NUM_WORKERS = 10
# Directory created by _load_tokenizer.
LOCAL_DIR = "./LOKAL_DIR"

class BalancedMiniBatchLoader(object):
    """Produce class-balanced ``tf.data`` minibatch streams over stratified folds.

    A labelled dataframe is split with stratified K-fold. For the training
    part, one repeating dataset is built per integer label and the datasets
    are sampled with fixed weights so that label 0 gets ``1 - perc_training_tox``
    of the stream and the remaining label(s) share ``perc_training_tox``.
    """

    def __init__(
        self,
        fold,
        mb_size,
        seed,
        perc_training_tox,
        scope="TOX",
        # project=...,
        dual_head=None,
        n_outer_splits=None,
        n_inner_splits=None,
        sample_weights=None,
        huggingface=False,
    ):
        """Validate the configuration and pick the outer-split strategy.

        Args:
            fold: index of the outer fold to use, or ``"time"`` when
                ``n_outer_splits == "time"``.
            mb_size: minibatch size.
            seed: seed for shuffling, K-fold splitting and dataset sampling.
            perc_training_tox: share of non-zero labels in the balanced stream,
                in ]0; 0.5].
            scope: accepted for interface compatibility; not used by this class.
            dual_head: optional iterable of head names; ``f"{name}_label"``
                columns are then used as targets in the pure-TensorFlow path.
            n_outer_splits: number of outer folds (falsy -> ``OUTER_CV``) or
                ``"time"``.
            n_inner_splits: number of inner folds (``None`` -> ``INNER_CV``).
            sample_weights: optional name of a dataframe column holding
                per-example weights.
            huggingface: when true, a BERTweet tokenizer is loaded and datasets
                are built through HuggingFace ``datasets``.

        Raises:
            ValueError: on an out-of-range ``perc_training_tox``, an invalid
                ``fold`` or an unsupported ``n_outer_splits``.
        """
        # Balancing of toxic examples in dataset
        if 0 >= perc_training_tox or perc_training_tox > 0.5:
            raise ValueError("Perc_training_tox should be in ]0; 0.5]")

        self.perc_training_tox = perc_training_tox

        # A falsy n_outer_splits (None or 0) falls back to the module default.
        if not n_outer_splits:
            n_outer_splits = OUTER_CV

        if isinstance(n_outer_splits, int):
            self.n_outer_splits = n_outer_splits
            self.get_outer_fold = self._get_outer_cv_fold
            if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold:
                raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.")

        elif n_outer_splits == "time":
            self.n_outer_splits = n_outer_splits
            self.get_outer_fold = self._get_time_fold
            if fold != "time":
                raise ValueError(
                "To avoid repeating the same run many times, the external fold"
                "should be time when test outputs is split according to dates."
                )

        else:
            # This branch used to hang off the inner ``if fold != "time"``,
            # which rejected the valid "time"/"time" setup and let unsupported
            # values of n_outer_splits through unnoticed.
            raise ValueError(
                f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}"
            )

        self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV

        self.seed = seed
        self.mb_size = mb_size
        self.fold = fold

        self.sample_weights = sample_weights
        self.dual_head = dual_head
        self.huggingface = huggingface
        if self.huggingface:
            self._load_tokenizer()

    def _load_tokenizer(self):
        """Create ``LOCAL_DIR`` if needed and load the BERTweet tokenizer."""
        print("Making a local copy of Bertweet-base model")
        local_model_dir = os.path.join(LOCAL_DIR)
        # makedirs is idempotent and avoids spawning a shell for ``mkdir``.
        os.makedirs(local_model_dir, exist_ok=True)

        self.tokenizer = AutoTokenizer.from_pretrained(
            "vinai/bertweet-base", normalization=True
        )

    def tokenize_function(self, el):
        """Tokenize ``el["text"]``, padded/truncated to ``MAX_SEQ_LENGTH``.

        Token type ids and the attention mask are not returned.
        """
        return self.tokenizer(
            el["text"],
            max_length=MAX_SEQ_LENGTH,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=False,
        )

    def _get_stratified_kfold(self, n_splits):
        """Return a shuffling ``StratifiedKFold`` seeded with ``self.seed``."""
        return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed)

    def _get_time_fold(self, df):
        """Time-based outer split; not implemented in this notebook.

        Raises:
            NotImplementedError: always. Returning ``None`` here would only
                fail later, when the caller unpacks ``(train, test)``.
        """
        raise NotImplementedError("Time-based outer folds are not implemented.")

    def _get_outer_cv_fold(self, df):
        """Return copies of the train/test parts of outer fold ``self.fold``.

        The split is stratified on ``df.int_label``.
        """
        labels = df.int_label
        stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits)

        # Advance through the splits until the requested fold is reached.
        for k, (train_index, test_index) in enumerate(
            stratifier.split(np.zeros(len(labels)), labels)
        ):
            if k == self.fold:
                break

        train_data = df.iloc[train_index].copy()
        test_data = df.iloc[test_index].copy()

        return train_data, test_data

    def get_steps_per_epoch(self, nb_pos_examples):
        """Number of minibatches per epoch.

        Uses at least ``TARGET_POS_PER_EPOCH`` positives, divided by the
        minibatch size and by the positive share of each batch.
        """
        return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox)

    def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True):
        """Tokenize ``group`` and expose it as a batched TensorFlow dataset.

        ``group`` must provide the ``text`` column read by the tokenizer and a
        ``labels`` column used as target. When ``shuffle`` is true the dataset
        repeats indefinitely.
        """
        huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf")
        tensorflow_ds = huggingface_ds.to_tf_dataset(
            columns=["input_ids"],
            label_cols=["labels"],
            shuffle=shuffle,
            batch_size=self.mb_size if mb_size is None else mb_size,
            collate_fn=data_collator,
        )

        if shuffle:
            return tensorflow_ds.repeat()
        return tensorflow_ds

    def make_pure_tensorflow_ds(self, df, nb_samples):
        """Build a shuffled, repeating dataset of raw text and targets.

        Depending on the configuration the elements are
        ``(text, label, weight)``, ``(text, {head_output: labels})`` or
        ``(text, label)``.

        Raises:
            ValueError: when ``self.sample_weights`` names a missing column.
        """
        buffer_size = nb_samples * 2

        if self.sample_weights is not None:
            if self.sample_weights not in df.columns:
                raise ValueError(
                    f"Sample weight column {self.sample_weights!r} not found in dataframe."
                )

            ds = tf.data.Dataset.from_tensor_slices(
                (df.text.values, df.label.values, df[self.sample_weights].values)
            )

        elif self.dual_head:
            label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head}
            # The content head is trained against one-hot targets over 3 classes.
            label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3)
            ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d))

        else:
            ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values))
        ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat()
        return ds

    def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True):
        """Resample per-label datasets into one class-balanced stream.

        Args:
            training_data: dataframe with an ``int_label`` column whose values
                are 0..num_classes-1 (2 or 3 classes).
            size_limit: optional cap; the shuffled data is truncated to
                ``size_limit * num_classes`` rows.
            return_as_batch: batch and prefetch the result (pure-TensorFlow
                path only; the HuggingFace datasets are already batched).

        Raises:
            ValueError: when the data does not hold exactly 2 or 3 classes.
        """
        training_data = training_data.sample(frac=1, random_state=self.seed)
        nb_samples = training_data.shape[0] if not size_limit else size_limit

        num_classes = training_data.int_label.nunique()
        toxic_class = training_data.int_label.max()
        if size_limit:
            training_data = training_data[: size_limit * num_classes]

        print(
            ".... {} examples, incl. {:.2f}% tox in train, {} classes".format(
                nb_samples,
                100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples,
                num_classes,
            )
        )

        # Validate the class count before building any dataset, so that bad
        # input fails with this message rather than a KeyError further down.
        if num_classes == 2:
            weights = [1 - self.perc_training_tox, self.perc_training_tox]
        elif num_classes == 3:
            weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2]
        else:
            raise ValueError("Currently it should not be possible to get other than 2 or 3 classes")

        label_groups = training_data.groupby("int_label")
        if self.huggingface:
            label_datasets = {
                label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups
            }

        else:
            label_datasets = {
                label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2)
                for label, group in label_groups
            }

        datasets = [label_datasets[label] for label in range(num_classes)]
        resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed)

        if return_as_batch and not self.huggingface:
            return resampled_ds.batch(
                self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True
            ).prefetch(NUM_PREFETCH)

        return resampled_ds

    @staticmethod
    def _compute_int_labels(full_df):
        """Ensure ``full_df`` has an ``int_label`` column and return it.

        Integer ``label`` columns are copied as they are. Otherwise, unless
        ``int_label`` already exists, labels are binarised at 0.5.

        Raises:
            ValueError: when non-integer labels exceed 1.
        """
        # ``dtype == int`` only matched the platform default integer, so int32
        # or unsigned labels were wrongly sent to the binarisation branch.
        if pd.api.types.is_integer_dtype(full_df.label.dtype):
            full_df["int_label"] = full_df.label

        elif "int_label" not in full_df.columns:
            if full_df.label.max() > 1:
                raise ValueError("Binarizing labels that should not be.")
            full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0)

        return full_df

    def __call__(self, full_df, *args, **kwargs):
        """Yield ``(mini_batches, steps_per_epoch, val_data, test_data)`` per inner fold.

        The outer fold fixes the test set; every inner stratified split of the
        remaining data provides one train/validation pair.
        """
        full_df = self._compute_int_labels(full_df)

        train_data, test_data = self.get_outer_fold(df=full_df)

        stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits)
        for train_index, val_index in stratifier.split(np.zeros(train_data.shape[0]), train_data.int_label):
            curr_train_data = train_data.iloc[train_index]

            mini_batches = self.get_balanced_dataset(curr_train_data)

            steps_per_epoch = self.get_steps_per_epoch(
                nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0]
            )

            val_data = train_data.iloc[val_index].copy()

            yield mini_batches, steps_per_epoch, val_data, test_data

    def simple_cv_load(self, full_df):
        """Single outer split; the first 10% of the test fold becomes validation data.

        Returns:
            ``(mini_batches, test_data, steps_per_epoch, val_data)`` -- note
            that the order differs from the other loading methods.
        """
        full_df = self._compute_int_labels(full_df)

        train_data, test_data = self.get_outer_fold(df=full_df)
        # insert manual validation split
        val_split = int(test_data.shape[0] * 0.10)
        val_data, test_data = test_data.iloc[:val_split, :], test_data.iloc[val_split:, :]
        if test_data.shape[0] == 0:
            # Fall back to a slice of the training data so evaluation can run.
            test_data = train_data.iloc[:500]

        mini_batches = self.get_balanced_dataset(train_data)
        steps_per_epoch = self.get_steps_per_epoch(
            nb_pos_examples=train_data[train_data.int_label != 0].shape[0]
        )

        return mini_batches, test_data, steps_per_epoch, val_data

    def no_cv_load(self, full_df):
        """Train on every row whose ``origin`` is not ``"precision"``.

        The ``"precision"`` rows are split with the outer-fold strategy into
        validation and test data.

        Returns:
            ``(mini_batches, steps_per_epoch, val_data, test_data)``.

        Raises:
            ValueError: when the training rows contain a single label.
        """
        full_df = self._compute_int_labels(full_df)

        val_test = full_df[full_df.origin == "precision"].copy(deep=True)
        val_data, test_data = self.get_outer_fold(df=val_test)

        train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0)
        if test_data.shape[0] == 0:
            test_data = train_data.iloc[:500]

        # Checked before building datasets: with a single label the balanced
        # dataset construction would fail first and this message was never shown.
        if train_data.int_label.nunique() == 1:
            raise ValueError('Should be at least two labels')

        mini_batches = self.get_balanced_dataset(train_data)

        num_examples = train_data[train_data.int_label == 1].shape[0]
        if train_data.int_label.nunique() > 2:
            second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0]
            num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2
        steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples)

        return mini_batches, steps_per_epoch, val_data, test_data


### Callbacks

In [4]:
class NothingCallback(tf.keras.callbacks.Callback):
    """Debugging callback that only prints a marker at each training hook."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print a marker with the index of the epoch that is starting."""
        print("ici, ", epoch)

    def on_train_batch_end(self, batch, logs=None):
        """Print a marker with the index of the batch that just finished."""
        print("fin de batch ", batch)

    def on_epoch_end(self, epoch, logs=None):
        """Print a marker with the index of the epoch that just finished."""
        print("fin ", epoch)


class ControlledStoppingCheckpointCallback(tf.keras.callbacks.ModelCheckpoint):
    """ModelCheckpoint that also requests a stop after a chosen epoch.

    ``stopping_epoch`` is compared with the epoch index Keras passes to
    ``on_epoch_end``; all other arguments go to ``ModelCheckpoint``.
    """

    def __init__(self, stopping_epoch, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stopping_epoch = stopping_epoch

    def on_epoch_end(self, epoch, logs=None):
        """Run the regular checkpoint logic, then flag the model to stop if due."""
        super().on_epoch_end(epoch, logs)
        reached_last_epoch = epoch == self.stopping_epoch
        if reached_last_epoch:
            self.model.stop_training = True


class SyncingTensorBoard(tf.keras.callbacks.TensorBoard):
    """TensorBoard callback that remembers a remote log directory.

    The actual synchronisation command is commented out in this notebook, so
    the class currently behaves like the plain TensorBoard callback.
    """

    def __init__(self, remote_logdir=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fall back to the module-level REMOTE_LOGDIR when no directory is given.
        if remote_logdir is None:
            remote_logdir = REMOTE_LOGDIR
        self.remote_logdir = remote_logdir

    def on_epoch_end(self, epoch, logs=None):
        """Delegate to TensorBoard; the sync call is disabled."""
        super().on_epoch_end(epoch, logs=logs)
        # self.synchronize()

    def synchronize(self):
        """Compute the parent of the log directory; the rsync command is disabled."""
        base_dir = os.path.dirname(self.log_dir)
        # cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}"
        # execute_command(cmd)


class GradientLoggingTensorBoard(SyncingTensorBoard):
    """TensorBoard callback that also logs the global gradient norm.

    One fixed batch is drawn from a balanced dataset built on ``val_data`` at
    construction time. Every ``freq`` training batches, the loss on that batch
    is differentiated with respect to the trainable weights and the global
    norm of the gradients is written as the ``gradient_norm`` scalar.
    """

    def __init__(self, loader, val_data, freq, *args, **kwargs):
        """
        Args:
            loader: object exposing ``get_balanced_dataset`` (see
                BalancedMiniBatchLoader).
            val_data: dataframe the reference batch is sampled from.
            freq: log gradients whenever ``batch % freq == 0``.
            *args, **kwargs: forwarded to ``SyncingTensorBoard``.
        """
        super().__init__(*args, **kwargs)
        val_dataset = loader.get_balanced_dataset(
            training_data=val_data, size_limit=50, return_as_batch=False
        )
        # Keep a single batch of 32 elements as the fixed probe for gradients.
        data_args = list(val_dataset.batch(32).take(1))[0]
        self.x_batch, self.y_batch = data_args[0], data_args[1]
        self.freq = freq
        self.counter = 0

    def _log_gradients(self):
        """Write the gradient global norm on the probe batch to the train writer."""
        writer = self._train_writer

        # Only the forward pass needs recording; the gradient is taken after
        # leaving the tape context so that computation is not recorded too.
        with tf.GradientTape() as tape:
            y_pred = self.model(self.x_batch)
            loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred)
        print(f"gradient loss : {loss}")
        gradient_norm = tf.linalg.global_norm(
            tape.gradient(loss, self.model.trainable_weights)
        )

        with writer.as_default():
            tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter)
        writer.flush()

    def on_train_batch_end(self, batch, logs=None):
        """Run TensorBoard's batch-end logic, then log gradients every ``freq`` batches."""
        # ``on_batch_end`` is only the base-class compatibility alias; calling
        # it skipped TensorBoard's own ``on_train_batch_end`` handling.
        super().on_train_batch_end(batch, logs=logs)
        self.counter += 1
        if batch % self.freq == 0:
            self._log_gradients()


class AdditionalResultLogger(tf.keras.callbacks.Callback):
    """Keras callback computing extra metrics on a held-out set.

    At the end of every epoch, and only when ``set_ == 'test'``, accuracy,
    binary cross-entropy, average precision and ROC AUC are printed and
    appended to ``metric_df.csv``. Every 2000 training batches
    ``additional_evaluations`` is run and its result passed to ``log_metrics``.
    """

    def __init__(
        self,
        data,
        set_,
        fixed_recall=0.85,
        from_logits=False,
        dataset_transform_func=None,
        batch_size=16,
        dual_head=None,
        counter=0,
        *args,
        **kwargs,
    ):
        """
        Args:
            data: dataframe with ``int_label`` and ``text`` columns, another
                dataset object handled by the fallback branch, or ``None`` to
                leave the callback uninitialised.
            set_: name of the evaluated split (``'test'`` enables epoch logging).
            fixed_recall: recall level, given in ]0; 1] or as a percentage
                in ]1; 100].
            from_logits: when true, predictions expose ``.logits`` and are
                passed through a sigmoid before scoring.
            dataset_transform_func: optional callable
                ``(data, mb_size=..., shuffle=False)`` producing model input.
            batch_size: prediction batch size.
            dual_head: optional iterable of head names for two-output models.
            counter: initial value of the training-batch counter.

        Raises:
            ValueError: when ``fixed_recall`` is outside both accepted ranges.
        """
        super().__init__(*args, **kwargs)
        self.counter = counter
        self.set_ = set_
        self.data = data
        self.batch_size = batch_size
        if data is None:
            return

        self.single_head = True
        try:
            self.labels = data.int_label.values
        except AttributeError:
            # ``data`` is not a dataframe with ``int_label``.
            # NOTE(review): LABEL_NAMES and parse_labeled_data are not defined
            # in the visible part of this notebook -- confirm they exist
            # before relying on this branch.
            self.labels = data.to_dataframe()[LABEL_NAMES].values.astype("int")
            self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size)
            self.label_names = LABEL_NAMES
        else:
            self.label_names = [""]
            if dual_head:
                self.label_names = [f"{e}_label" for e in dual_head]
                self.labels = {
                    f"{e}_output": data[f"{e}_label"].values for e in dual_head
                }
                self.single_head = False
            if dataset_transform_func is None:
                self.data = data.text.values
            else:
                self.data = dataset_transform_func(
                    data, mb_size=batch_size, shuffle=False
                )

        # The rest used to live in a ``finally`` block, which also ran after a
        # failure above and replaced the real error with an AttributeError.
        if len(self.label_names) == 1:
            self.metric_kw = {}
        else:
            self.metric_kw = {"average": None}

        self.best_metrics = defaultdict(float)
        self.from_logits = from_logits
        print(
            f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}"
        )

        if 1 < fixed_recall <= 100:
            fixed_recall = fixed_recall / 100
        elif not (0 < fixed_recall <= 100):
            raise ValueError("Threshold should be between 0 and 1, or 0 and 100")
        self.fixed_recall = fixed_recall

    def compute_precision_fixed_recall(self, labels, preds):
        """Return ``(precision, threshold)`` at the last point whose recall is above ``fixed_recall``.

        When the target recall is 1 the first point of the curve is used.
        """
        # Positional arguments: the ``probas_pred`` keyword is deprecated in
        # recent scikit-learn releases.
        precision_values, recall_values, thresholds = precision_recall_curve(labels, preds)
        # Recall decreases along the curve, so search on its negation.
        index_recall = bisect.bisect_left(-recall_values, -1 * self.fixed_recall)
        # Clamp: an index of 0 would otherwise wrap to -1 and pick the
        # (precision=1, recall=0) end point of the curve.
        position = max(index_recall - 1, 0)
        result = precision_values[position]
        print(f"Precision at {recall_values[position]} recall: {result}")

        return result, thresholds[position]

    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch metrics for the test set only."""
        if self.set_=='test':
            self.custom_logging()

    def on_train_batch_end(self, batch, logs=None):
        """Count training batches and run the extra evaluation every 2000 of them."""
        self.counter += 1
        if self.counter % 2000 == 0:
            self.additional_evaluations(step=self.counter, eval_time="batch")

    @staticmethod
    def _probability_metrics(labels, preds):
        """Score probability predictions against binary labels.

        Returns a dict with accuracy (under the historical, misspelt key
        ``'accuraccy'``, kept so existing consumers still find it), binary
        cross-entropy, average precision and ROC AUC.
        """
        accuracy_pred = np.array(preds).round()

        bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        loss = bce(labels.reshape(labels.shape[0],) , preds.reshape(preds.shape[0],)).numpy()
        return {
            'accuraccy': accuracy_score(y_true=labels, y_pred=accuracy_pred),
            'loss': loss,
            "average_precision": average_precision_score(y_true=labels, y_score=preds),
            "roc_auc": roc_auc_score(y_true=labels, y_score=preds),
        }

    def custom_logging(self):
        """Predict on the stored data, print the metrics and append them to ``metric_df.csv``."""
        preds = self.model.predict(x=self.data, batch_size=self.batch_size)
        if self.from_logits:
            preds = tf.keras.activations.sigmoid(preds.logits).numpy()

        evaluation = self._probability_metrics(self.labels, preds)
        print(evaluation)
        metric_df=pd.DataFrame([evaluation]).reset_index(drop=True)
        if os.path.isfile('metric_df.csv'):
            # Existing file: append the row under the header already written.
            metric_df.to_csv('metric_df.csv', mode='a', index=False, header=False)
        else:
            metric_df.columns = ['accuracy', 'loss', 'average_precision', 'roc_auc']
            metric_df.to_csv('metric_df.csv', index=False)

    def _binary_evaluations(self, preds, label_name=None, class_index=None):
        """Score one binary target, ignoring examples labelled ``-1``.

        Args:
            preds: probability predictions for the target.
            label_name: key into ``self.labels`` for dual-head models.
            class_index: when given, labels are turned into a one-vs-rest
                indicator for that class.
        """
        curr_labels = self.labels
        if label_name is not None:
            curr_labels = self.labels[label_name]
            if class_index is not None:
                curr_labels = (curr_labels == class_index).astype(int)

        if -1 in curr_labels:
            mask = curr_labels != -1
            curr_labels = curr_labels[mask]
            preds = preds[mask]

        # Callers pass probabilities (a sigmoid is applied upstream when the
        # model emits logits), so the loss must not treat them as logits.
        return self._probability_metrics(curr_labels, preds)

    def _multiclass_evaluations(self, preds):
        """Return per-label PR AUC and ROC AUC for multi-label predictions."""
        pr_auc_l = average_precision_score(
            y_true=self.labels, y_score=preds, **self.metric_kw
        )
        roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw)
        metrics = {}
        for i, label in enumerate(self.label_names):
            metrics[f"pr_auc_{label}"] = pr_auc_l[i]
            metrics[f"roc_auc_{label}"] = roc_auc_l[i]

        return metrics

    def additional_evaluations(self, step, eval_time):
        """Predict on the stored data, compute the metrics and hand them to ``log_metrics``.

        Returns:
            The metrics dictionary that was computed.
        """
        print("Evaluating ", self.set_, eval_time, step)

        preds = self.model.predict(x=self.data, batch_size=self.batch_size)
        if self.from_logits:
            preds = tf.keras.activations.sigmoid(preds.logits).numpy()

        if self.single_head:
            if len(self.label_names) == 1:
                metrics = self._binary_evaluations(preds)
            else:
                metrics = self._multiclass_evaluations(preds)
        else:
            # Two outputs: the one with a single column is the binary target head.
            if preds[0].shape[1] == 1:
                binary_preds = preds[0]
                multic_preds = preds[1]
            else:
                binary_preds = preds[1]
                multic_preds = preds[0]

            binary_metrics = self._binary_evaluations(
                binary_preds, label_name="target_output"
            )
            metrics = {f"{k}_target": v for k, v in binary_metrics.items()}
            num_classes = multic_preds.shape[1]
            for class_ in range(num_classes):
                binary_metrics = self._binary_evaluations(
                    multic_preds[:, class_],
                    label_name="content_output",
                    class_index=class_,
                )
                metrics.update(
                    {f"{k}_content_{class_}": v for k, v in binary_metrics.items()}
                )

        # The computed metrics used to be dropped; pass them to the logging hook.
        self.log_metrics(metrics, step=step, eval_time=eval_time)
        return metrics

    def log_metrics(self, metrics_d, step, eval_time):
        """Hook for persisting metrics; intentionally a no-op in this notebook."""
        pass

In [5]:
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a polynomial warm-up in front of another schedule.

    While ``step < warmup_steps`` the rate is
    ``initial_learning_rate * (step / warmup_steps) ** power``; afterwards
    ``decay_schedule_fn(step - warmup_steps)`` is used.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: str = "",
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        """Return the learning rate for ``step``."""
        scope_name = self.name or "WarmUp"
        with tf.name_scope(scope_name) as name:
            step_f = tf.cast(step, tf.float32)
            warmup_f = tf.cast(self.warmup_steps, tf.float32)
            fraction_done = step_f / warmup_f
            ramp_lr = self.initial_learning_rate * tf.math.pow(fraction_done, self.power)

            def _after_warmup():
                # The wrapped schedule counts steps from the end of warm-up.
                return self.decay_schedule_fn(step - self.warmup_steps)

            return tf.cond(
                step_f < warmup_f,
                lambda: ramp_lr,
                _after_warmup,
                name=name,
            )

    def get_config(self):
        """Return the constructor arguments as a dictionary."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_schedule_fn=self.decay_schedule_fn,
            warmup_steps=self.warmup_steps,
            power=self.power,
            name=self.name,
        )


NameError: name 'Callable' is not defined

### Train Function

In [None]:
!ls LOKAL_DIR/models/bertweet-base

ls: cannot access 'LOKAL_DIR/models/bertweet-base': No such file or directory


In [None]:

# Configuration for the training cells. Several names below re-assign values
# first set in the minibatch-loader cell (LOCAL_DIR, TARGET_POS_PER_EPOCH,
# MAX_SEQ_LENGTH, WARM_UP_PERC, OUTER_CV, INNER_CV, NUM_PREFETCH, NUM_WORKERS);
# note that MAX_SEQ_LENGTH changes from 256 to 100 once this cell has run.
LOCAL_DIR = f"./LOKAL_DIR"
# Default remote_logdir of SyncingTensorBoard; also stored by TrainClass.
REMOTE_LOGDIR = f"LOKAL_DIR/logs"
# MODEL_DIR and LOCAL_MODEL_DIR are not referenced in the code visible here.
MODEL_DIR = f"LOKAL_DIR/models"
LOCAL_MODEL_DIR = "LOKAL_DIR/models/bertweet-base"
# Default ``seed`` argument of TrainClass.
RANDOM_SEED = 1337
# TRAIN_EPOCHS and MINI_BATCH_SIZE are not referenced in the code visible here;
# presumably meant as the train_epochs / mb_size arguments -- TODO confirm.
TRAIN_EPOCHS = 10
MINI_BATCH_SIZE = 8
TARGET_POS_PER_EPOCH = 5000
MAX_SEQ_LENGTH = 100

WARM_UP_PERC = 0.1
OUTER_CV = 5
INNER_CV = 5
NUM_PREFETCH = 5


NUM_WORKERS = 10

In [None]:
class TrainClass(object):
    OPTIMIZERS = ["Adam", "AdamW"]

    def __init__(
        self,
        optimizer_name,
        weight_decay,
        learning_rate,
        mb_size,
        train_epochs,
        ### Manual Insertion
        dual_head=False,
        content_loss_weight=1,
        sample_weights="class_weight",
        model_type="bertweet-base",
        perc_training_tox=0.25,
        preprocessing="default",
        label_column="label",
        trainable=True,
        loss_names=["bce", "cce", "scce", "focal_bce", "masked_bce", "inv_kl_loss"],
        num_classes=2,
        additional_layer=False,
        content_num_classes=None,
        # content_loss_name randomly set so that model loads
        content_loss_name="mce",
        # If model_reload =True --> some google cloud upload_stuff
        model_reload=False,
        filter_low_agreements=False,
        ### END
        language="en",
        scope="TOX",
        project=...,
        experiment_id="default",
        gradient_clipping=None,
        fold=2,
        seed=RANDOM_SEED,
        log_gradients=False,
        kw="",
        stopping_epoch=None,
        test=False,
    ):
        """Store the hyper-parameters, build the minibatch loader and name the run.

        Args:
            optimizer_name: one of ``TrainClass.OPTIMIZERS``.
            weight_decay, learning_rate, mb_size, train_epochs: optimisation
                settings, stored as given.
            dual_head, content_loss_weight, content_num_classes,
                content_loss_name: settings for a second (content) head;
                ``content_loss_weight`` is kept only when ``dual_head`` is truthy.
            sample_weights: forwarded to the minibatch loader.
            model_type: a value containing ``"bertweet"`` switches the loader
                to the HuggingFace path.
            perc_training_tox: share of non-zero labels per batch; used by the
                loader, the experiment name and the bias initialisation.
            loss_names: candidate loss names; the first one is used.
            fold, seed: forwarded to the minibatch loader.
            kw, experiment_id: parts of the experiment name.
            test: when true the experiment keyword is forced to ``"test"``.
            Remaining arguments are stored on the instance unchanged.

        Raises:
            ValueError: when ``optimizer_name`` is not supported.
        """
        self.seed = seed
        self.weight_decay = weight_decay
        self.linear_lr_decay = True
        self.smart_bias_init = True
        self.learning_rate = learning_rate
        self.mb_size = mb_size
        self.train_epochs = train_epochs
        # The following values are not pre-set by Twitter
        self.dual_head = dual_head
        # Only meaningful with a second head; None otherwise.
        self.content_loss_weight = content_loss_weight if self.dual_head else None
        self.sample_weights = sample_weights
        self.model_type = model_type
        self.perc_training_tox = perc_training_tox
        self.model_dir = LOCAL_DIR
        self.remote_logdir = REMOTE_LOGDIR
        self.preprocessing = preprocessing
        self.label_column = label_column
        self.trainable = trainable
        # Copy so the shared default list can never be mutated via an instance.
        self.loss_names = list(loss_names)
        self.loss_name = self.loss_names[0]
        self.num_classes = num_classes
        self.additional_layer = additional_layer
        self.content_num_classes = content_num_classes
        self.content_loss_name = content_loss_name
        self.model_reload = model_reload
        # Honour the argument; it used to be overwritten with False.
        self.filter_low_agreements = filter_low_agreements
        ##END
        self.gradient_clipping = gradient_clipping
        # NOTE(review): the ``project`` argument is ignored and a fixed value
        # stored instead -- confirm whether this is intentional.
        self.project = 435

        if optimizer_name not in self.OPTIMIZERS:
            raise ValueError(
                f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}."
            )
        self.optimizer_name = optimizer_name
        self.log_gradients = log_gradients
        self.test = test
        self.fold = fold
        self.stopping_epoch = stopping_epoch
        self.language = language
        self.mb_loader = BalancedMiniBatchLoader(
            fold=self.fold,
            seed=self.seed,
            # Must satisfy 0.0 < perc_training_tox <= 0.5. The configured
            # value is used so that batch balance, experiment name and bias
            # initialisation agree (a hard-coded 0.48 was passed before).
            perc_training_tox=self.perc_training_tox,
            mb_size=self.mb_size,
            n_outer_splits=5,
            scope=scope,
            # project=None,
            dual_head=self.dual_head,
            sample_weights=self.sample_weights,
            huggingface=("bertweet" in self.model_type),
        )
        self.init_dirnames(kw=kw, experiment_id=experiment_id)
        print("------- Checking there is a GPU")
        self.check_gpu()

    def execute_command(self, cmd, print_=True):
        """Run ``cmd`` through the shell, optionally echoing its output.

        With ``print_`` true, output is captured and stderr is printed before
        stdout. A non-zero exit status raises ``subprocess.CalledProcessError``.
        """
        completed = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
        if not print_:
            return
        for stream in (completed.stderr, completed.stdout):
            print(stream.decode("utf-8"))

    def check_gpu(self):
        """Report GPU availability without aborting when none is found.

        Runs ``nvidia-smi`` (a failure is only reported by a message) and
        prints the GPU devices TensorFlow can see.
        """
        try:
            self.execute_command("nvidia-smi")
        except subprocess.CalledProcessError:
            print("There is no GPU when there should be one.")
            # raise AttributeError

        visible_gpus = tf.config.list_physical_devices("GPU")
        # An empty list is tolerated; the strict check is disabled:
        # if len(visible_gpus) == 0:
        # raise ModuleNotFoundError("Tensorflow has not found the GPU. Check your installation")
        print(visible_gpus)

    def init_dirnames(self, kw, experiment_id):
        kw = "test" if self.test else kw
        hyper_param_kw = ""
        if self.optimizer_name == "AdamW":
            hyper_param_kw += f"{self.weight_decay}_"
        if self.gradient_clipping:
            hyper_param_kw += f"{self.gradient_clipping}_"
        if self.content_loss_weight:
            hyper_param_kw += f"{self.content_loss_weight}_"
        experiment_name = (
            f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_"
            f"{self.optimizer_name}_"
            f"{self.learning_rate}_"
            f"{hyper_param_kw}"
            f"{self.mb_size}_"
            f"{self.perc_training_tox}_"
            f"{self.train_epochs}_seed{self.seed}"
        )
        print("------- Experiment name: ", experiment_name)
        self.logdir = "logs"
        self.checkpoint_path = f"{self.model_dir}/{experiment_name}"

    def set_seeds(self, seed):
        """Seed NumPy, Python's ``random`` module and TensorFlow with ``seed``."""
        for seeder in (np.random.seed, random.seed, tf.random.set_seed):
            seeder(seed)

    def preprocess(self, df):
        """Run the default English preprocessor on ``df`` and return its result.

        Label column, class weight, agreement filter and class count are
        passed along as keyword arguments; the class weight is
        ``perc_training_tox`` only when ``sample_weights == "class_weight"``.
        """
        data_prepro = DefaultENPreprocessor()

        if self.sample_weights == "class_weight":
            class_weight = self.perc_training_tox
        else:
            class_weight = None

        options = {
            "label_column": self.label_column,
            "class_weight": class_weight,
            "filter_low_agreements": self.filter_low_agreements,
            "num_classes": self.num_classes,
        }
        return data_prepro(df=df, **options)

    def get_training_actors(self, steps_per_epoch, val_data, test_data, fold):
        callbacks = self.get_callbacks(
            fold=fold, val_data=val_data, test_data=test_data
        )
        schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch)

        optimizer = self.get_optimizer(schedule)

        return optimizer, callbacks

    def load_bertweet(self, **kwargs):
        """Load BERTweet with a single-logit classification head.

        Args:
            **kwargs: only ``num_classes`` is inspected.

        Returns:
            ``(model, True)``; the flag is consumed by the caller as
            ``from_logits``.

        Raises:
            NotImplementedError: when ``num_classes`` is given and exceeds 2.
        """
        # Validate before loading: the check used to run only after the
        # expensive from_pretrained call had completed.
        if "num_classes" in kwargs and kwargs["num_classes"] > 2:
            raise NotImplementedError

        bert = TFAutoModelForSequenceClassification.from_pretrained(
            'vinai/bertweet-base',
            num_labels=1,
            classifier_dropout=0.1,
            hidden_size=768,
            from_pt=True,
        )

        return bert, True

    def get_loss(self, loss_name, from_logits, **kwargs):
        """Map a case-insensitive loss name to a Keras loss object.

        Args:
            loss_name: ``bce``, ``cce``, ``scce`` or ``focal_bce``.
            from_logits: forwarded to the loss constructor.
            **kwargs: ``gamma`` (default 2) is used by the focal loss.

        Raises:
            NotImplementedError: for ``inv_kl_loss``.
            ValueError: for any other name.
        """
        loss_name = loss_name.lower()

        if loss_name == "bce":
            print("Binary CE loss")
            chosen = tf.keras.losses.BinaryCrossentropy(from_logits=from_logits)
        elif loss_name == "cce":
            print("Categorical cross-entropy loss")
            chosen = tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits)
        elif loss_name == "scce":
            print("Sparse categorical cross-entropy loss")
            chosen = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=from_logits
            )
        elif loss_name == "focal_bce":
            gamma = kwargs.get("gamma", 2)
            print("Focal binary CE loss", gamma)
            chosen = tf.keras.losses.BinaryFocalCrossentropy(
                gamma=gamma, from_logits=from_logits
            )
        elif loss_name == "inv_kl_loss":
            raise NotImplementedError
        else:
            raise ValueError(
                f"This loss name is not valid: {loss_name}. Accepted loss names: BCE, masked BCE, CCE, sCCE, "
                f"Focal_BCE, inv_KL_loss"
            )

        return chosen

    def load(
        self,
        optimizer,
        seed,
        model_type,
        loss_name,
        trainable,
        **kwargs,
    ):
        """Load BERTweet and compile it with the requested loss and AUC metrics.

        Args:
            optimizer: Optimizer handed to ``model.compile``.
            seed: Accepted for interface compatibility; not used in this body.
            model_type: Accepted for interface compatibility; not used here.
            loss_name: Name of the main loss, resolved through ``get_loss``.
            trainable: Accepted for interface compatibility; not used here.
            **kwargs: Forwarded to ``get_loss`` for the main loss. When
                ``content_num_classes`` is truthy, ``content_loss_name`` and
                ``content_loss_weight`` are also read to compile a two-output
                model ("content_output" / "target_output").

        Returns:
            The compiled model.
        """
        model, from_logits = self.load_bertweet()

        pr_auc = tf.keras.metrics.AUC(
            curve="PR", name="pr_auc", from_logits=from_logits
        )
        roc_auc = tf.keras.metrics.AUC(
            curve="ROC", name="roc_auc", from_logits=from_logits
        )

        loss = self.get_loss(loss_name, from_logits, **kwargs)
        if kwargs.get("content_num_classes", None):
            # Dual-head setup: a second loss for the content output, weighted
            # relative to the target output (weight 1).
            second_loss = self.get_loss(
                loss_name=kwargs["content_loss_name"], from_logits=from_logits
            )
            loss_weights = {
                "content_output": kwargs["content_loss_weight"],
                "target_output": 1,
            }
            model.compile(
                optimizer=optimizer,
                loss={"content_output": second_loss, "target_output": loss},
                loss_weights=loss_weights,
                metrics=[pr_auc, roc_auc],
            )

        else:
            model.compile(
                optimizer=optimizer,
                loss=loss,
                metrics=[pr_auc, roc_auc],
            )

        # summary() prints the table itself and returns None, so wrapping it in
        # print() would emit a stray "None" in front of the logits flag.
        model.summary()
        print("logits: ", from_logits)

        return model

    def load_model(self, optimizer):
        """Build the compiled model from this instance's configuration.

        When ``self.smart_bias_init`` is truthy the initial bias is the
        log-odds of ``self.perc_training_tox``; otherwise it is 0. All
        settings are forwarded to ``self.load`` as keyword arguments.

        Args:
            optimizer: Optimizer to compile the model with.

        Returns:
            Whatever ``self.load`` returns (the compiled model).
        """
        if self.smart_bias_init:
            positive_rate = self.perc_training_tox
            initial_bias = np.log(positive_rate / (1 - positive_rate))
        else:
            initial_bias = 0

        load_kwargs = dict(
            optimizer=optimizer,
            seed=self.seed,
            trainable=self.trainable,
            model_type=self.model_type,
            loss_name=self.loss_name,
            num_classes=self.num_classes,
            additional_layer=self.additional_layer,
            smart_bias_value=initial_bias,
            content_num_classes=self.content_num_classes,
            content_loss_name=self.content_loss_name,
            content_loss_weight=self.content_loss_weight,
        )
        return self.load(**load_kwargs)

    def _train_single_fold(
        self, mb_generator, test_data, steps_per_epoch, fold, val_data
    ):
        steps_per_epoch = 100 if self.test else steps_per_epoch

        optimizer, callbacks = self.get_training_actors(
            steps_per_epoch=steps_per_epoch,
            val_data=val_data,
            test_data=test_data,
            fold=fold,
        )
        print("Loading model")
        model = self.load_model(optimizer)
        print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training")

        training_args = {
            "epochs": self.train_epochs,
            "steps_per_epoch": steps_per_epoch,
            "batch_size": self.mb_size,
            "callbacks": callbacks,
            "verbose": 2,

        }
        mb_generator = mb_generator
        model.fit(mb_generator, **training_args)
        return model

    def train_full_model(self, df):
        """Seed, preprocess, train on the full split and save the model.

        The data-loading query step (``self.load_data``) is disabled here; the
        caller supplies ``df`` directly.

        Args:
            df: Input frame passed to ``self.preprocess``.

        Returns:
            The trained model, after it has been saved to the "model"
            directory via ``save_pretrained``.
        """
        print("Setting up random seed.")
        self.set_seeds(self.seed)

        print(f"Loading {self.language} data")
        prepared = self.preprocess(df=df)

        print("Going to train on everything but the test dataset")

        batches, held_out, n_steps, validation = self.mb_loader.simple_cv_load(prepared)
        print(f"steps per epoch: {n_steps}")

        model = self._train_single_fold(
            mb_generator=batches,
            test_data=held_out,
            steps_per_epoch=n_steps,
            fold="full",
            val_data=validation,
        )
        print(type(model))
        model.save_pretrained("model")
        return model

    def get_callbacks(self, fold, val_data, test_data):
        """Build the Keras callbacks for one training run.

        The returned list is ordered: TensorBoard callback, validation result
        logger, test result logger and, only when ``self.stopping_epoch`` is
        truthy, a checkpoint callback.

        Args:
            fold: Fold identifier; embedded in the checkpoint path.
            val_data: Data for the "validation" result logger (and for the
                gradient-logging TensorBoard callback when enabled).
            test_data: Data for the "test" result logger.

        Returns:
            list: Callback instances in the order described above.
        """
        tb_args = {
            "log_dir": './logs',
            "histogram_freq": 0,
            "update_freq": 500,
            "embeddings_freq": 0,
            # Test runs log to a separate remote directory.
            "remote_logdir": (
                "./remote_logs" if not self.test else f"{self.remote_logdir}_test"
            ),
        }
        if self.log_gradients:
            tensorboard_callback = GradientLoggingTensorBoard(
                loader=self.mb_loader, val_data=val_data, freq=10, **tb_args
            )
        else:
            tensorboard_callback = SyncingTensorBoard(**tb_args)
        callbacks = [tensorboard_callback]

        # BERTweet models emit logits and need their data converted by the
        # loader; every other model type is treated as already producing
        # probabilities.
        if "bertweet" in self.model_type:
            from_logits = True
            dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds
        else:
            from_logits = False
            dataset_transform_func = None

        fixed_recall = 0.85 if not self.dual_head else 0.5

        # One result logger per evaluation set, validation first.
        for set_name, eval_data in (("validation", val_data), ("test", test_data)):
            callbacks.append(
                AdditionalResultLogger(
                    data=eval_data,
                    set_=set_name,
                    from_logits=from_logits,
                    dataset_transform_func=dataset_transform_func,
                    dual_head=self.dual_head,
                    fixed_recall=fixed_recall,
                )
            )

        if self.stopping_epoch:
            # "{epoch:02d}" is left as a literal placeholder in the path.
            fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}"
            callbacks.append(
                ControlledStoppingCheckpointCallback(
                    filepath=fold_checkpoint_path,
                    verbose=0,
                    monitor="val_pr_auc",
                    save_weights_only=True,
                    mode="max",
                    save_freq="epoch",
                    stopping_epoch=self.stopping_epoch,
                    save_best_only=False,
                )
            )

        return callbacks
    def get_lr_schedule(self, steps_per_epoch):
        """Return the learning-rate schedule (or constant rate) for training.

        Warm-up is only enabled when ``self.learning_rate`` is at least 1e-3,
        using the module-level ``WARM_UP_PERC`` fraction of the total steps.
        With ``self.linear_lr_decay`` the base schedule decays linearly to 0
        over the steps that remain after warm-up; otherwise the base is the
        plain learning-rate value.

        NOTE(review): when warm-up is active without linear decay, ``WarmUp``
        receives a float as ``decay_schedule_fn`` -- confirm it accepts that.

        Args:
            steps_per_epoch: Number of optimizer steps per epoch.

        Returns:
            A ``WarmUp`` schedule, a ``PolynomialDecay`` schedule, or the
            constant learning rate.
        """
        n_steps_total = steps_per_epoch * self.train_epochs

        if self.learning_rate >= 1e-3:
            warmup_fraction = WARM_UP_PERC
        else:
            warmup_fraction = 0
        n_warmup_steps = int(n_steps_total * warmup_fraction)

        if not self.linear_lr_decay:
            print('Constant learning rate')
            base_schedule = self.learning_rate
        else:
            base_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=self.learning_rate,
                decay_steps=n_steps_total - n_warmup_steps,
                end_learning_rate=0.0,
                power=1.0,
                cycle=False,
            )

        if warmup_fraction > 0:
            print(f".... using warm-up for {n_warmup_steps} steps")
            return WarmUp(
                initial_learning_rate=self.learning_rate,
                decay_schedule_fn=base_schedule,
                warmup_steps=n_warmup_steps,
            )
        return base_schedule


    def get_optimizer(self, schedule):
        """Instantiate the optimizer named by ``self.optimizer_name``.

        Args:
            schedule: Learning rate or learning-rate schedule for the
                optimizer.

        Returns:
            A ``tf.keras.optimizers.Adam`` or ``AdamW`` instance. Global
            gradient-norm clipping is added when ``self.gradient_clipping`` is
            truthy; ``self.weight_decay`` is only applied for "AdamW".

        Raises:
            NotImplementedError: For any other optimizer name.
        """
        optimizer_kwargs = dict(
            learning_rate=schedule,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            amsgrad=False,
        )
        clip_norm = self.gradient_clipping
        if clip_norm:
            optimizer_kwargs["global_clipnorm"] = clip_norm

        print(f".... {self.optimizer_name} w global clipnorm {self.gradient_clipping}")

        chosen = self.optimizer_name
        if chosen == "AdamW":
            optimizer_kwargs["weight_decay"] = self.weight_decay
            return tf.keras.optimizers.AdamW(**optimizer_kwargs)
        if chosen == "Adam":
            return tf.keras.optimizers.Adam(**optimizer_kwargs)
        raise NotImplementedError

    def get_training_actors(self, steps_per_epoch, val_data, test_data, fold):
        """Return the ``(optimizer, callbacks)`` pair for one training run.

        NOTE(review): this class defines ``get_training_actors`` twice with the
        same logic; being the later definition, this is the one Python binds.

        Args:
            steps_per_epoch: Steps per epoch, used to build the LR schedule.
            val_data: Validation data handed to the callbacks.
            test_data: Test data handed to the callbacks.
            fold: Fold identifier handed to the callbacks.

        Returns:
            tuple: ``(optimizer, callbacks)``.
        """
        # Callbacks are created before the schedule and the optimizer.
        callback_list = self.get_callbacks(
            fold=fold, test_data=test_data, val_data=val_data
        )
        optimizer = self.get_optimizer(
            self.get_lr_schedule(steps_per_epoch=steps_per_epoch)
        )
        return optimizer, callback_list

In [None]:
# List LOKAL_DIR/models/bertweet-base via the shell.
# NOTE(review): the recorded output reports this directory as missing; a later
# cell saves the model under LOKAL_DIR/models, not .../bertweet-base.
!ls -la LOKAL_DIR/models/bertweet-base

ls: cannot access 'LOKAL_DIR/models/bertweet-base': No such file or directory


In [None]:
# Show the processes reported for the current GPU (see the recorded output below).
print(torch.cuda.list_gpu_processes())
# Collect unreachable Python objects first, then empty PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

GPU:0
no processes are running


In [None]:
# Clear Python garbage and PyTorch's CUDA cache before loading the model
# (fixes oom error when initializing Trainer). `gc` and `torch` come from the
# notebook's import cell, so they are not imported again here.
print(torch.cuda.list_gpu_processes())
gc.collect()
torch.cuda.empty_cache()

# Load BERTweet (masked-LM head) and its tokenizer from the
# "vinai/bertweet-base" checkpoint and store both in one local directory.
LOCAL_MODEL_DIR = 'LOKAL_DIR/models'
model = AutoModelForMaskedLM.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
model.save_pretrained(LOCAL_MODEL_DIR)
tokenizer.save_pretrained(LOCAL_MODEL_DIR)

In [None]:
# Remove a leftover metrics file, if there is one (presumably written by an
# earlier training run -- verify). A plain `rm` reports an error on a fresh
# run where the file does not exist; `os` comes from the import cell.
if os.path.exists("metric_df.csv"):
    os.remove("metric_df.csv")

data = pd.read_csv("data/FinalBalancedDataset.csv", index_col=0)
data.columns = ["label", "raw_text"]

# weight_decay is only consumed by the "AdamW" branch of get_optimizer, so it
# has no effect with optimizer_name='Adam'.
test = TrainClass(
    optimizer_name='Adam',
    weight_decay=0.01,
    learning_rate=0.00002,
    mb_size=16,
    train_epochs=3,
    perc_training_tox=0.4256,
)
trained_model = test.train_full_model(data)

# train_full_model already saved the model under "model"; this writes a second
# copy into the current directory.
trained_model.save_pretrained('.')