In [1]:
from google.colab import drive

# Attach the user's Google Drive to the Colab VM's filesystem so that the
# dataset and the artefacts written by later cells persist across sessions.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
!pip install transformers
!pip install tensorflow_recommenders
!pip install nltk emoji==0.6.0

In [4]:
import os
import json
import random
import math
from glob import glob
from urllib.parse import urlparse

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from tqdm.auto import tqdm
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint 
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Embedding, LSTM, MaxPooling1D, Conv1D
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_recommenders as tfrs
import gensim
from nltk.tokenize import wordpunct_tokenize, word_tokenize
import pickle

In [5]:
# Use one seed for every random number generator the notebook touches.
SEED = 42

# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here presumably only affects subprocesses spawned later -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

random.seed(SEED)         # Python's built-in `random` module
np.random.seed(SEED)      # NumPy's legacy global generator
tf.random.set_seed(SEED)  # TensorFlow's global seed

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

# Preprocess the dataset

Download `toxicity_ratings.json` from https://data.esrg.stanford.edu/study/toxicity-perspectives and place it in the current working directory before running the next cell.

In [None]:
# Parse the JSON-lines file into one flat dict per comment. Every field of the
# k-th rating of a comment is stored under "<field><k>" (e.g. "toxic_score0"),
# which is the wide layout that `pd.wide_to_long` expects further down.
data = []
rating_keys = {}  # insertion-ordered set of every rating field name seen
# JSON text is UTF-8; be explicit so that reading does not depend on the
# platform's default encoding.
with open('toxicity_ratings.json', 'r', encoding='utf-8') as f:
    # `total` is the expected number of lines; it only sizes the progress bar.
    for line in tqdm(f, total=107620):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        record = json.loads(line)
        row = {
            'comment': record['comment'],
            'comment_id': record['comment_id'],
            'perspective_score': record['perspective_score'],
        }
        for rating_pos, rating in enumerate(record['ratings']):
            for key, value in rating.items():
                row[key + str(rating_pos)] = value
                rating_keys.setdefault(key, None)
        data.append(row)

if not data:
    raise ValueError('toxicity_ratings.json contains no records')

# Extract group and text information from comment_id (each group of annotators was assigned the same set of comments)
df = pd.DataFrame(data)
comment_id_parts = df['comment_id'].str.split('_')
df['comment_group_id'] = comment_id_parts.str[0]
df['comment_text_id'] = comment_id_parts.str[1]

# Assign annotator id
# Reshape to one row per (comment, rater); `rater_order` is the position of the
# rating inside the comment's `ratings` list.
df_long = pd.wide_to_long(df, stubnames=list(rating_keys), i='comment_id', j='rater_order').reset_index()
# group * 10 + rater_order is only collision-free while rater_order is a single digit.
assert df_long['rater_order'].astype(int).max() < 10, 'annotator key assumes fewer than 10 raters per comment'
df_long['annotator'] = df_long['comment_group_id'].astype(int)*10 + df_long['rater_order'].astype(int)
df_merged = df_long.rename({'comment': 'inputs', 'toxic_score': 'labels'}, axis=1)
le = LabelEncoder()
df_merged['annotator_id'] = le.fit_transform(df_merged['annotator'])

# Assign comment index
le = LabelEncoder()
df_merged['comment_index'] = le.fit_transform(df_merged['comment_id'])

# Assign group id
# A group is the concatenation of the rater's race, gender and political
# affiliation ('political_affilation' is the column name as it appears in the data).
df_merged['group_merged'] = df_merged['race'] + df_merged['gender'] + df_merged['political_affilation']
le = LabelEncoder()
df_merged['group_id'] = le.fit_transform(df_merged['group_merged'])

  0%|          | 0/107620 [00:00<?, ?it/s]

In [None]:
# The ids are produced by LabelEncoder, so the largest id plus one is the
# number of distinct values.
n_annotators = df_merged['annotator_id'].max() + 1
n_groups = df_merged['group_id'].max() + 1
print('# of annotators:', n_annotators)
print('# of groups:', n_groups)

# of annotators: 26905
# of groups: 330


In [None]:
# Persist the long-format ratings table so that later cells can reload it
# without re-parsing the raw JSON file.
df_merged.to_parquet(path='toxicity_ratings.parquet')

# Tokenize comments

In [6]:
def tokenize(df, model_name='vinai/bertweet-base', max_length=128):
    """Tokenize every distinct comment of ``df`` with a pretrained tokenizer.

    The frame is reduced to its distinct (``comment_index``, ``inputs``) pairs
    and sorted by ``comment_index`` before encoding, so the output rows follow
    ascending ``comment_index``.
    NOTE(review): row ``i`` corresponds to ``comment_index == i`` only if the
    indices are 0..n-1 and each index maps to exactly one text -- confirm
    against the caller.

    Args:
        df: DataFrame with at least the columns ``comment_index`` and ``inputs``.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every comment is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of two NumPy arrays, each of
        shape ``(number_of_distinct_comments, max_length)``. Both arrays have
        zero rows when ``df`` has no rows.
    """
    df_comment = df[['comment_index', 'inputs']].drop_duplicates().sort_values('comment_index')
    if df_comment.empty:
        # np.vstack([]) raises, so return correctly shaped empty arrays instead
        # (and skip loading the tokenizer, which would have nothing to encode).
        return (np.empty((0, max_length), dtype=np.int64),
                np.empty((0, max_length), dtype=np.int64))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = []
    attention_mask = []
    for record in tqdm(df_comment['inputs']):
        # Pad/truncate to a fixed length so every row has the same width and
        # the rows can be stacked into a rectangular array.
        encoded_text = tokenizer.encode_plus(text=record,
                                             add_special_tokens=True,
                                             return_attention_mask=True,
                                             max_length=max_length,
                                             padding='max_length',
                                             truncation=True)
        input_ids.append(encoded_text.get("input_ids"))
        attention_mask.append(encoded_text.get("attention_mask"))
    return np.vstack(input_ids), np.vstack(attention_mask)

In [7]:
# Reload the preprocessed ratings, tokenize each distinct comment once, and
# save both arrays for later cells.
df_merged = pd.read_parquet('toxicity_ratings.parquet')
input_ids, attention_mask = tokenize(df_merged)
for npy_path, token_array in (('input_ids_toxicity.npy', input_ids),
                              ('attention_mask_toxicity.npy', attention_mask)):
    np.save(npy_path, token_array)

Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/107620 [00:00<?, ?it/s]