In [None]:
!pip install transformers
!pip install datasets

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

from transformers import RobertaTokenizer, TFRobertaModel
from transformers import BertTokenizer, TFBertModel
from transformers import XLNetTokenizer, TFXLNetModel
from datasets import load_dataset
from huggingface_hub import login
from sklearn.utils import shuffle
from tensorflow.keras.optimizers import Adam
import os
import pandas as pd
import regex as re
import string

In [None]:
# Load the transcript dataset by its Hugging Face repository id.
dataset = load_dataset("nevikw39/ADReSSo_whisper-large-v3_transcript")

# Shuffle the training split with a fixed seed so the sample order is
# reproducible across runs.
dataset["train"] = dataset["train"].shuffle(seed=42)

# Training texts, read after the shuffle so they share the order of the
# labels taken from the same split.
transcript = dataset["train"]["transcript_no-chunked"]

In [None]:
# Registry of supported backbones. Each entry is a 3-tuple laid out as
# (tokenizer class, pretrained checkpoint name, TF model class).
models = dict(
    roberta=(RobertaTokenizer, 'roberta-large', TFRobertaModel),
    bert=(BertTokenizer, 'bert-base-uncased', TFBertModel),
    xlnet=(XLNetTokenizer, 'xlnet-large-cased', TFXLNetModel),
)

In [None]:
# Pick the backbone to fine-tune. Note the naming: `tokenizer` is the tokenizer
# class, `model_type` is the checkpoint string and `model_name` is the model
# class (not a string).
(tokenizer, model_type, model_name) = models["roberta"]

In [None]:
def make_inputs(tokenizer, model_type, serie, max_len = 256):
    """Tokenize every text in ``serie`` and return id / mask arrays.

    Parameters
    ----------
    tokenizer : class exposing ``from_pretrained`` (a tokenizer class, not an
        instance).
    model_type : checkpoint identifier forwarded to ``from_pretrained``.
    serie : iterable of texts to encode.
    max_len : value passed as ``max_length``; together with
        ``padding='max_length'`` and ``truncation=True`` it requests
        fixed-length sequences.

    Returns
    -------
    tuple ``(input_ids, attention_mask)`` of numpy arrays holding one row per
    text, in the order of ``serie``.
    """
    # NOTE(review): `lowercase` is reproduced verbatim from the original call;
    # Hugging Face tokenizers usually spell this option `do_lower_case` --
    # confirm whether this flag has any effect.
    encoder = tokenizer.from_pretrained(model_type, lowercase=True)

    id_rows = []
    mask_rows = []
    for text in serie:
        encoded = encoder.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )
        id_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])

    return np.array(id_rows), np.array(mask_rows)

In [None]:
import numpy as np

# Encode the training transcripts into the two arrays the model consumes.
input_ids_train, attention_mask_train = make_inputs(
    tokenizer,
    model_type,
    transcript,
    max_len=256,
)

In [None]:
##### Model construction, with or without a TPU
def _build_classifier(model_name, model_type, num_labels, max_len):
    """Assemble and compile the pretrained backbone plus softmax head."""
    backbone = model_name.from_pretrained(model_type)

    # Two int32 inputs of length `max_len`: token ids and attention mask.
    input_ids = tf.keras.Input(shape = (max_len, ), dtype = 'int32')
    attention_masks = tf.keras.Input(shape = (max_len,), dtype = 'int32')

    outputs = backbone([input_ids, attention_masks])

    if 'xlnet' in model_type:
        # cls is the last token in xlnet tokenization, so take the last
        # position of the first output and drop the length-1 axis.
        sequence_output = outputs[0]
        cls_output = tf.squeeze(sequence_output[:, -1:, :], axis=1)
    else:
        # Second output of the backbone -- presumably the pooled sentence
        # representation; verify against the model class in use.
        cls_output = outputs[1]

    final_output = tf.keras.layers.Dense(num_labels, activation = 'softmax')(cls_output)
    model = tf.keras.Model(inputs = [input_ids, attention_masks], outputs = final_output)

    # The head is a softmax over `num_labels` classes, so the matching loss is
    # categorical cross-entropy on one-hot targets. The non-TPU path previously
    # used 'binary_crossentropy', which disagreed with the TPU path and makes
    # Keras resolve 'accuracy' to binary accuracy.
    model.compile(optimizer = Adam(learning_rate = 5e-6), loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])
    return model


def init_model(model_name, model_type, num_labels, Tpu = 'on', max_len = 256):
    """Build and compile a classifier on top of a pretrained transformer.

    Parameters
    ----------
    model_name : class exposing ``from_pretrained`` (e.g. ``TFRobertaModel``).
        Despite the name this is the model class, not a string.
    model_type : str
        Checkpoint identifier passed to ``from_pretrained``; if it contains
        ``'xlnet'`` the last sequence position is used as the sentence vector,
        otherwise the backbone's second output is used.
    num_labels : int
        Number of units of the softmax output layer.
    Tpu : str
        ``'on'`` connects to the Colab TPU and builds the model inside a
        ``tf.distribute.TPUStrategy`` scope; any other value builds the model
        without a distribution strategy.
    max_len : int
        Length of the two int32 input tensors.

    Returns
    -------
    tf.keras.Model
        Model taking ``[input_ids, attention_masks]`` and compiled with
        Adam (learning rate 5e-6), categorical cross-entropy and accuracy.
        Targets are expected one-hot encoded with ``num_labels`` columns.

    Raises
    ------
    KeyError
        If ``Tpu == 'on'`` and the ``COLAB_TPU_ADDR`` environment variable is
        not set.
    """
    if Tpu != 'on':
        return _build_classifier(model_name, model_type, num_labels, max_len)

    # Connect to the TPU and initialise it before creating any variables.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)

    strategy = tf.distribute.TPUStrategy(resolver)
    # Model creation and compilation both happen inside the strategy scope.
    with strategy.scope():
        return _build_classifier(model_name, model_type, num_labels, max_len)

In [None]:
# Build the two-class classifier on the Colab TPU.
model = init_model(
    model_name,
    model_type,
    num_labels=2,
    Tpu='on',
    max_len=256,
)

In [None]:
# One-hot encode the training labels into two columns to match the
# two-unit softmax output of the model.
train_y = tf.keras.utils.to_categorical(
    dataset["train"]["label"],
    num_classes=2,
)

In [None]:
# Fine-tune with a quarter of the training samples held out for validation.
history = model.fit(
    [input_ids_train, attention_mask_train],
    train_y,
    # Keras documents `callbacks` as a list of callback instances.
    # `restore_best_weights=True` makes the model that is evaluated afterwards
    # use the weights of the best monitored epoch instead of the weights
    # reached after 10 further epochs without improvement.
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=10,
                                         restore_best_weights=True),
    ],
    validation_split=0.25,
    epochs=50,
    batch_size=2,
    shuffle=True,
)

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch. The curves are labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training loss')
ax.plot(history.history['val_loss'], label='validation loss')
ax.set(xlabel='epoch', ylabel='loss', title='Loss per epoch')
ax.legend()
plt.show()

In [None]:
# Encode the test transcripts with the same tokenizer settings as training.
input_ids_test, attention_mask_test = make_inputs(
    tokenizer,
    model_type,
    dataset["test"]["transcript_no-chunked"],
    max_len=256,
)

# Evaluate on the test split against one-hot labels; the call is the last
# expression of the cell so its result is displayed.
model.evaluate(
    x=[input_ids_test, attention_mask_test],
    y=tf.keras.utils.to_categorical(dataset["test"]["label"], num_classes=2),
    batch_size=4,
)