In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
import tensorflow_hub as hub
import tensorflow_text as text
from transformers import *

np.set_printoptions(suppress=True)
import shutil
tf.get_logger().setLevel('ERROR')

PATH = '/Users/farahmasood/Desktop/scedev/'
dataset = pd.read_csv(PATH+'labeled_data.csv')

nRowsRead = None
dataset0 = pd.read_csv('/Users/farahmasood/Desktop/scedev/labeled_data.csv', delimiter=',', nrows = nRowsRead)
dataset0.name = 'labeled_data.csv'
nRow, nCol = dataset0.shape

c=dataset0['class']
dataset0.rename(columns={'tweet' : 'text',
                   'class' : 'category'}, 
                    inplace=True)
a=dataset0['text']
b=dataset0['category'].map({0: 'hate_speech', 1: 'offensive_language',2: 'neither'})

dataset= pd.concat([a,b,c], axis=1)
dataset.rename(columns={'class' : 'label'}, 
                    inplace=True)


hate, ofensive, neither = np.bincount(dataset['label'])
total = hate + ofensive + neither

In [2]:
from tensorflow.keras.utils import to_categorical
categorical_labels = to_categorical(np.array(dataset['label']),num_classes=3)

X_train_, X_test, y_train_, y_test = train_test_split(
    dataset.index.values,
    categorical_labels,
    test_size=0.10,
    random_state=42,
    stratify=categorical_labels,    
)

In [3]:
print(y_train_)

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [4]:
X_train_, X_vals, y_train, y_vals = train_test_split(
    X_train_,
    y_train_,
    test_size=0.10,
    random_state=42,
    stratify= y_train_,  
)

In [5]:
dataset['data_type'] = ['not_set']*dataset.shape[0]
dataset.loc[X_train_, 'data_type'] = 'train'
dataset.loc[X_vals, 'data_type'] = 'vals'
dataset.loc[X_test, 'data_type'] = 'test'

dataset.groupby(['category', 'label', 'data_type']).count()

dataset_train = dataset.loc[dataset["data_type"]=="train"]
dataset_vals = dataset.loc[dataset["data_type"]=="vals"]
dataset_test = dataset.loc[dataset["data_type"]=="test"]

train_dataset = tf.data.Dataset.from_tensor_slices((dataset_train.text.values, dataset_train.label.values))
vals_dataset = tf.data.Dataset.from_tensor_slices((dataset_vals.text.values, dataset_vals.label.values))
test_dataset = tf.data.Dataset.from_tensor_slices((dataset_test.text.values, dataset_test.label.values))

train_dataset = train_dataset.shuffle(len(dataset_train)).batch(32, drop_remainder=False)
vals_dataset = vals_dataset.shuffle(len(dataset_vals)).batch(32, drop_remainder=False)
test_dataset = test_dataset.shuffle(len(dataset_test)).batch(32, drop_remainder=False)

In [6]:
tf_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'

bert_pre = hub.KerasLayer(preprocess)

for text_batch, label_batch in train_dataset.take(1):
  for i in range(1):
    tweet = text_batch.numpy()[i]
    label = label_batch.numpy()[i]

text_test = ['this is such an amazing movie!']
text_test = [tweet]

preprocessed = bert_pre(text_test)
bert_model = hub.KerasLayer(tf_encoder)
results = bert_model(preprocessed)

weight0 = (1 / hate)*(total)/3.0 
weight1 = (1 / ofensive)*(total)/3.0
weight2 = (1 / neither)*(total)/3.0
weights = {0: weight0, 1: weight1, 2: weight2}
bias = np.array([3.938462, 15, 5.])

INFO:absl:Using /var/folders/sd/9h__nn_90fq1rv5jz40nz6180000gn/T/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1, Total size: 3.22MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1, Total size: 115.55MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'.


In [None]:
def RNNmodel():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tf_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = sequence_output = outputs["sequence_output"] # [batch_size, seq_length, 768]
    
    net = tf.keras.layers.Dense(512, activation="relu")(net)
    net = tf.keras.layers.LSTM(32)(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(3, activation="softmax", name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [8]:
rnn = RNNmodel()
bertresult = rnn(tf.constant(text_test))
print(tf.sigmoid(bertresult))

tf.Tensor([[0.5508838 0.6257106 0.5700116]], shape=(1, 3), dtype=float32)


In [9]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [10]:
from official.nlp import optimization
epochs = 80
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

rnn.compile(optimizer=optimizer,
                          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          metrics=tf.keras.metrics.SparseCategoricalAccuracy('accuracy'))

INFO:absl:using Adamw optimizer
INFO:absl:gradient_clip_norm=1.000000


In [11]:
loss, accuracy = rnn.evaluate(test_dataset)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.8041883111000061
Accuracy: 0.7732957005500793
