In [1]:
import pandas as pd
from transformers import BertTokenizer
from transformers import TFBertModel
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf

# Twitter dataset of opinions and facts.
df = pd.read_csv("opinionfactdata.csv")

# all of this is extremely similar to OK2Say
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Width of every row in the token buffers (sentences are padded/truncated
# to this many tokens when the buffers are filled).
MAX_SEQ_LEN = 256

# Preallocate one row per sentence. Token ids and attention masks are
# integers and the model's Input layers are declared int32, so allocate the
# buffers as int32 instead of NumPy's float64 default: half the memory and
# no float -> int cast when batches are fed to the model.
xInputIds = np.zeros((len(df), MAX_SEQ_LEN), dtype = np.int32)
xAttnMasks = np.zeros((len(df), MAX_SEQ_LEN), dtype = np.int32)

def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize ``df['sentence']`` row by row into preallocated arrays.

    Args:
        df: DataFrame with a ``'sentence'`` column of input texts.
        ids: 2-D array, one row per sentence; filled in place with token ids.
        masks: 2-D array with the same layout as ``ids``; filled in place
            with the attention mask.
        tokenizer: tokenizer object exposing ``encode_plus``.

    Returns:
        The same ``ids`` and ``masks`` objects that were passed in.
    """
    # Take the sequence length from the buffer itself so the padding length
    # can never disagree with the row width being assigned to.
    max_length = ids.shape[1]

    # Wrap the column rather than the enumerate object: enumerate has no
    # len(), so tqdm could only print a running count with no bar or ETA.
    for i, text in enumerate(tqdm(df['sentence'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length = max_length,
            truncation = True,
            padding = 'max_length',
            add_special_tokens = True,
            return_tensors = 'tf'
        )
        # The encoded tensors carry a leading batch axis of size 1; the
        # row assignment broadcasts it away.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Fill the preallocated buffers with the tokenized sentences.
xInputIds, xAttnMasks = generate_training_data(df, xInputIds, xAttnMasks, tokenizer)

# One-hot encode the integer class column: row k of the 2x2 identity matrix
# is the one-hot vector for class k.
labels = np.eye(2)[df['label'].values]

# Each dataset element is one (input ids, attention mask, one-hot label) triple.
dataset = tf.data.Dataset.from_tensor_slices(
    (xInputIds, xAttnMasks, labels)
)

def SentimentDatasetMapFunction(inputIds, attnMasks, labels):
    """Repackage one dataset element as a ``(features, labels)`` pair.

    The feature dict is keyed by ``'inputIds'`` and ``'attentionMask'``;
    ``labels`` is passed through unchanged.
    """
    features = dict(inputIds=inputIds, attentionMask=attnMasks)
    return features, labels

# Convert (ids, mask, label) triples into ({feature dict}, label) pairs.
dataset = dataset.map(SentimentDatasetMapFunction)

# Examples per batch; reused below to convert the row count to a batch count.
BATCH_SIZE = 20

# reshuffle_each_iteration = False freezes the shuffled order. With the
# default (True) the dataset is reshuffled every time it is iterated, so the
# take()/skip() split below would be redrawn on each pass and validation
# batches would contain examples the model had just trained on.
# NOTE(review): a buffer of 10000 only gives a full shuffle while the dataset
# has at most 10000 rows - confirm against the CSV size.
dataset = dataset.shuffle(10000, reshuffle_each_iteration = False).batch(BATCH_SIZE, drop_remainder = True)

# Fraction of the batches used for training; the rest is validation.
p = 0.8
trainSize = int((len(df)//BATCH_SIZE)*p)

trainDataset = dataset.take(trainSize)
valDataset = dataset.skip(trainSize)

# Pretrained encoder whose main layer is reused inside the Keras model below.
bertModel = TFBertModel.from_pretrained('bert-base-cased')

# Symbolic inputs; their names match the keys of the feature dict produced
# by SentimentDatasetMapFunction.
inputIds = tf.keras.layers.Input(
    name = 'inputIds', shape = (256,), dtype = 'int32'
)
attentionMasks = tf.keras.layers.Input(
    name = 'attentionMask', shape = (256,), dtype = 'int32'
)

# Element 1 of the BERT main layer's output is used as the sentence
# representation (presumably the pooled output - confirm against the
# transformers output layout).
bertOutputs = bertModel.bert(inputIds, attention_mask = attentionMasks)
bertEmbds = bertOutputs[1]

# Classification head: one hidden ReLU layer, then a 2-way softmax.
hiddenDense = tf.keras.layers.Dense(512, activation = 'relu', name = 'intermediateLayer')
intermediateLayer = hiddenDense(bertEmbds)
softmaxDense = tf.keras.layers.Dense(2, activation = 'softmax', name = 'outputLayer')
outputLayer = softmaxDense(intermediateLayer)

model = tf.keras.Model(
    inputs = [inputIds, attentionMasks],
    outputs = outputLayer,
)

# Categorical loss and accuracy, because the labels are one-hot vectors.
model.compile(
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 1e-5, decay = 1e-6),
    loss = tf.keras.losses.CategoricalCrossentropy(),
    metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy')],
)

# Single pass over the training batches, validating on the held-out batches.
model.fit(trainDataset, validation_data = valDataset, epochs = 1)

# Persist the trained model in HDF5 format.
model.save('opinionfactmodel.h5')

  from .autonotebook import tqdm as notebook_tqdm
8092it [00:02, 3439.82it/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
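If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.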



  saving_api.save_model(
