In [18]:
from transformers import BertTokenizer
from transformers import TFBertModel
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf

# Load the dataset; the code below reads its 'tweet' (text) and 'Toxicity'
# (class index) columns.
df = pd.read_csv("FinalBalancedDataset.csv")

# Pretrained cased BERT tokenizer: converts raw text into the integer token
# ids and attention mask that the model takes as input.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize the first tweet on its own, with the same settings that are used
# for the whole dataset further down, as a quick sanity check.
token = tokenizer.encode_plus(
    df['tweet'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf',
)

# One 256-wide row per tweet. Token ids and masks are integers, so allocate
# int32 (the dtype of the model's Input layers) instead of np.zeros' default
# float64, which would double the memory footprint for no benefit.
xInputIds = np.zeros((len(df), 256), dtype=np.int32)
xAttnMasks = np.zeros((len(df), 256), dtype=np.int32)

def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every tweet in ``df`` and write the result into ``ids`` / ``masks``.

    Args:
        df: DataFrame with a 'tweet' column holding the text to tokenize.
        ids: 2-D NumPy array with one row per tweet; filled in place with the
            token ids. Its second dimension sets the padded sequence length.
        masks: Array shaped like ``ids``; filled in place with the attention
            masks.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes (a Hugging Face
            tokenizer at the call site).

    Returns:
        The same ``ids`` and ``masks`` objects, now populated.
    """
    # Pad/truncate to the width of the destination buffer instead of a
    # hard-coded 256, so the function follows whatever the caller allocated.
    max_length = ids.shape[1]

    # Wrap the column itself (which has a length) rather than enumerate()
    # (which does not), so the progress bar can show a total and an ETA.
    for i, text in enumerate(tqdm(df['tweet'], total=len(df))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            # The destination is a NumPy array, so ask for NumPy output and
            # skip building a TensorFlow tensor for every single row.
            return_tensors='np',
        )
        # The tokenizer returns shape (1, max_length); NumPy broadcasts it
        # into the 1-D row slice.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Tokenize the whole 'tweet' column into the preallocated id / mask buffers.
xInputIds, xAttnMasks = generate_training_data(
    df=df,
    ids=xInputIds,
    masks=xAttnMasks,
    tokenizer=tokenizer,
)

# One-hot encode 'Toxicity': picking row k of the 2x2 identity matrix for each
# class index k yields a (len(df), 2) float array with a single 1 per row.
labels = np.eye(2)[df['Toxicity'].values]

# Slice the three aligned arrays row by row into (ids, mask, label) elements.
dataset = tf.data.Dataset.from_tensor_slices(
    (xInputIds, xAttnMasks, labels)
)

def SentimentDatasetMapFunction(inputIds, attnMasks, labels):
    """Repackage one dataset element as a ``(features, labels)`` pair.

    The features are returned as a dict keyed by 'inputIds' and
    'attentionMask'; ``labels`` is passed through unchanged.
    """
    features = dict(inputIds=inputIds, attentionMask=attnMasks)
    return features, labels

# Re-key every (ids, mask, label) element as ({'inputIds', 'attentionMask'}, label)
# so the feature dict lines up with the names of the model's Input layers.
dataset = dataset.map(SentimentDatasetMapFunction)

# Single source of truth for the batch size (used for batching and for the
# train/validation split arithmetic below).
batchSize = 20

# Shuffle BEFORE splitting, with two safeguards:
#  * buffer_size = len(df): a buffer smaller than the dataset only shuffles
#    locally, which is not enough if the CSV is ordered (e.g. by class).
#  * reshuffle_each_iteration=False: by default the order changes every time
#    the dataset is iterated, so take()/skip() would pick different examples
#    on each pass and validation batches would contain training examples.
dataset = dataset.shuffle(
    len(df), reshuffle_each_iteration=False
).batch(batchSize, drop_remainder=True)

# Fraction of the (full) batches that goes to training; the rest is validation.
p = 0.8
trainSize = int((len(df) // batchSize) * p)

trainDataset = dataset.take(trainSize)
valDataset = dataset.skip(trainSize)

# Pretrained BERT encoder that the classification head is stacked on.
bertModel = TFBertModel.from_pretrained('bert-base-cased')

# Two 256-long int32 inputs; their names match the keys of the feature dict
# produced by SentimentDatasetMapFunction.
inputIds = tf.keras.layers.Input(
    name='inputIds', dtype='int32', shape=(256,)
)
attentionMasks = tf.keras.layers.Input(
    name='attentionMask', dtype='int32', shape=(256,)
)

# Element [1] of the BERT main layer's output feeds the classifier head
# (presumably the pooled sentence representation -- confirm against the
# installed transformers version).
bertOutputs = bertModel.bert(inputIds, attention_mask=attentionMasks)
bertEmbds = bertOutputs[1]

# Classification head: 512-unit ReLU layer followed by a 2-way softmax.
hiddenDense = tf.keras.layers.Dense(
    512, activation='relu', name='intermediateLayer'
)
intermediateLayer = hiddenDense(bertEmbds)
outputDense = tf.keras.layers.Dense(
    2, activation='softmax', name='outputLayer'
)
outputLayer = outputDense(intermediateLayer)

model = tf.keras.Model(
    inputs=[inputIds, attentionMasks],
    outputs=outputLayer,
)

# NOTE(review): `decay` is a legacy Keras optimizer argument; confirm it is
# still accepted by the TensorFlow version in use.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy')],
)

# Train for a single pass over the training split, validating on the rest.
model.fit(trainDataset, validation_data=valDataset, epochs=1)

# Persist the trained model in HDF5 format.
model.save('model.h5')

# If I could change something, I would break up the data and train in smaller batches instead of all at once: a single epoch took a very long time, although that one epoch already reached an accuracy of about 95%.

0it [00:00, ?it/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
2022-06-04 14:16:53.780386: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


  15/2269 [..............................] - ETA: 1:00:43 - loss: 0.4393 - accuracy: 0.8200

KeyboardInterrupt: 