In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
!pip install transformers
!pip install tensorflow_recommenders
!pip install nltk emoji==0.6.0

In [None]:
import os
import json
import random
import math
from glob import glob
from urllib.parse import urlparse

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from tqdm.auto import tqdm
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint 
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Embedding, LSTM, MaxPooling1D, Conv1D
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_recommenders as tfrs
import gensim
from nltk.tokenize import wordpunct_tokenize, word_tokenize
import pickle

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# NOTE: PYTHONHASHSEED only influences interpreters started after this point
# (e.g. subprocesses); the running kernel's hash seed is already fixed.
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_fn in (random.seed, tf.random.set_seed, np.random.seed):
    seed_fn(SEED)

# Register tqdm's `progress_apply` helpers on pandas objects.
tqdm.pandas()

# Train model

In [None]:
def build_model(model_name='vinai/bertweet-base', max_length=128, 
                hidden_size=768, n_annotators=35804, n_groups=331,
                classifier_dropout=0.2, embed_dim=32, n_cross_layers=3,
                n_dense_layers=3):
    """Build the rating-prediction network: text encoder + annotator/group embeddings.

    The model concatenates three feature vectors -- element ``[1]`` of the
    transformer encoder output (the pooled sentence vector for Hugging Face
    RoBERTa-style models), an annotator embedding and a group embedding -- and
    passes them through a stack of TFRS ``Cross`` layers followed by a stack
    of ReLU ``Dense`` layers, ending in a single linear output unit.

    Args:
        model_name: Hugging Face checkpoint loaded with ``TFAutoModel``. The
            loaded model must expose its main layer as ``.roberta``.
        max_length: Token sequence length of ``input_ids`` / ``attention_mask``.
        hidden_size: Projection dimension of every ``Cross`` layer and number
            of units of every hidden ``Dense`` layer.
        n_annotators: Number of rows in the annotator embedding table.
        n_groups: Number of rows in the group embedding table.
        classifier_dropout: Dropout rate applied after every cross and dense layer.
        embed_dim: Width of the annotator and group embeddings (default 32,
            the previously hard-coded value).
        n_cross_layers: Number of ``Cross`` layers (default 3, as before).
        n_dense_layers: Number of hidden ``Dense`` layers (default 3, as before).

    Returns:
        An uncompiled ``tf.keras.Model`` taking a dict with keys ``input_ids``,
        ``attention_mask``, ``annotator_id`` and ``group_id`` and producing a
        tensor of shape ``(batch, 1)``.
    """
    bert = TFAutoModel.from_pretrained(model_name)
    input_ids = Input(shape=(max_length,), name='input_ids', dtype="int32")
    attention_mask = Input(shape=(max_length,), name='attention_mask', dtype="int32")
    # Other cells look the encoder up with model.get_layer('roberta'), so make
    # sure the main layer carries exactly that name.
    bert.layers[0]._name = 'roberta'
    text_features = bert.roberta(input_ids, attention_mask=attention_mask)[1]

    def _embed_id(input_name, table_size, embed_name):
        """Return (id input, flat embedding) for one categorical id feature."""
        id_input = Input(name=input_name, shape=(1, ))
        embedded = Embedding(table_size, embed_dim, name=embed_name)(id_input)
        # (batch, 1, embed_dim) -> (batch, embed_dim) so it can be concatenated.
        return id_input, tf.keras.layers.Reshape((embed_dim, ))(embedded)

    annotator_id, annotator_features = _embed_id('annotator_id', n_annotators, 'annotator_embed')
    group_id, group_features = _embed_id('group_id', n_groups, 'group_embed')

    x = tf.concat([text_features, annotator_features, group_features], axis=1)

    # Explicit feature crossing between text, annotator and group features.
    for i in range(n_cross_layers):
        x = tfrs.layers.dcn.Cross(projection_dim=hidden_size, 
                                  kernel_initializer="glorot_uniform", 
                                  name=f'cross_{i}')(x)
        x = Dropout(classifier_dropout)(x)

    # Deep part of the head.
    for i in range(n_dense_layers):
        x = Dense(hidden_size, activation="relu", name=f'dense_{i}')(x)
        x = Dropout(classifier_dropout)(x)

    # Single linear unit: the model is trained as a regressor.
    out = Dense(1, name="out")(x)
    model = Model(inputs={'input_ids': input_ids, 'attention_mask': attention_mask, 
                          'annotator_id': annotator_id, 'group_id': group_id}, outputs=out)
    return model

In [None]:
# Ratings table plus the pre-tokenised comments it indexes into via `comment_index`.
df_merged = pd.read_parquet('toxicity_ratings.parquet')
input_ids = np.load('input_ids_toxicity.npy')
attention_mask = np.load('attention_mask_toxicity.npy')

model = build_model()

# Sanity checks. Without them a bad index fails late or not at all: negative
# comment indices wrap around in NumPy, and out-of-range embedding ids are not
# reliably reported when running on GPU.
assert input_ids.shape == attention_mask.shape, \
    f'token arrays disagree: {input_ids.shape} vs {attention_mask.shape}'
assert df_merged['comment_index'].min() >= 0 and df_merged['comment_index'].max() < len(input_ids), \
    'comment_index points outside the tokenised comment arrays'
for id_column, embed_layer_name in (('annotator_id', 'annotator_embed'),
                                    ('group_id', 'group_embed')):
    table_size = model.get_layer(embed_layer_name).input_dim
    assert df_merged[id_column].min() >= 0 and df_merged[id_column].max() < table_size, \
        f'{id_column} values do not fit the {embed_layer_name} table ({table_size} rows)'

In [None]:
# Fine-tune the transformer encoder together with the head in the first training stage.
encoder_layer = model.get_layer('roberta')
encoder_layer.trainable = True

In [None]:
# Hold out 10% of the ratings for validation.
train, val = train_test_split(df_merged, test_size=1/10, random_state=42)


def _as_model_inputs(frame):
    """Assemble the model's input dict for the rating rows in `frame`."""
    comment_rows = frame['comment_index']
    return {
        'input_ids': input_ids[comment_rows, :],
        'attention_mask': attention_mask[comment_rows, :],
        'annotator_id': np.array(frame['annotator_id']),
        'group_id': np.array(frame['group_id']),
    }


inputs = _as_model_inputs(train)
inputs_val = _as_model_inputs(val)

In [None]:
# Two-stage training:
#   stage 1 -- fine-tune the whole network (encoder trainable),
#   stage 2 -- freeze the encoder and keep training the head only.
STAGE1_EPOCHS = 2
STAGE2_EPOCHS = 6


def _compile_regressor(net):
    """(Re)compile `net` with Adam (lr=2e-5) and MAE as both loss and metric."""
    net.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), 
                loss=tf.keras.losses.MeanAbsoluteError(), 
                metrics=[tf.metrics.MeanAbsoluteError()])


train_labels = np.array(train.labels)
val_labels = np.array(val.labels)

# Saves a checkpoint named after the epoch whenever val_loss improves.
checkpoint = ModelCheckpoint('jury-{epoch:03d}', verbose=1, monitor='val_loss', 
                             save_best_only=True, mode='auto')  

# Stage 1. Re-enable the encoder here so that re-running this cell after a
# previous run (which leaves the encoder frozen) trains the same way.
model.get_layer('roberta').trainable = True
_compile_regressor(model)
model.fit(inputs, train_labels, 
          validation_data=(inputs_val, val_labels), 
          batch_size=512, epochs=STAGE1_EPOCHS, callbacks=[checkpoint])

# Stage 2. Keras only picks up a changed `trainable` flag when the model is
# compiled, so recompile after freezing; otherwise the encoder may keep
# training. Recompiling also starts from a fresh optimizer state.
model.get_layer('roberta').trainable = False
_compile_regressor(model)
# Continue the epoch count from stage 1 so stage-2 checkpoints
# (jury-003 ...) do not overwrite the stage-1 ones (jury-001, jury-002).
model.fit(inputs, train_labels, 
          validation_data=(inputs_val, val_labels), 
          batch_size=512, initial_epoch=STAGE1_EPOCHS,
          epochs=STAGE1_EPOCHS + STAGE2_EPOCHS, callbacks=[checkpoint])

# Final (last-epoch) weights; the best-val_loss models are in the jury-XXX checkpoints.
model.save_weights('jury')