In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.metrics import BinaryAccuracy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and preprocess the data
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin', engine='python', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df = df.sample(n=1000)

In [33]:
X = df['text']
labels = df['target']
labels[labels == 4] = 1

print(labels)

1105507    1
1062593    1
1547440    1
1409509    1
1324396    1
          ..
1240471    1
1001524    1
1014063    1
1416810    1
1337048    1
Name: target, Length: 1000, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[labels == 4] = 1


In [4]:
# Tokenize and pad the sequences
'''tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)'''

'tokenizer = Tokenizer()\ntokenizer.fit_on_texts(X)\nsequences = tokenizer.texts_to_sequences(X)'

In [5]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# Tokenize the input data
'''input_ids = []
for example in X:
    tokens = tokenizer.encode(example, add_special_tokens=True, max_length=50, padding='max_length', truncation=True)
    input_ids.append(tokens)'''

max_len = 50
def tokenize(sentence):
    tokens = tokenizer.encode_plus(sentence, max_length=max_len,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

# initialize two arrays for input tensors
Xids = np.zeros((len(df), max_len))
Xmask = np.zeros((len(df), max_len))

for i, sentence in enumerate(df['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

In [6]:
#labels = tf.keras.utils.to_categorical(labels)

In [7]:
# Create the attention mask matrix
'''attention_mask = np.ones_like(input_ids)
padded_positions = np.equal(input_ids, 0)
attention_mask[padded_positions] = 0'''

'attention_mask = np.ones_like(input_ids)\npadded_positions = np.equal(input_ids, 0)\nattention_mask[padded_positions] = 0'

In [8]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(sequence, attention, labels):
    return {'input_ids': sequence, 'attention_mask': attention}, labels

dataset = dataset.map(map_func)

for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([  101.,   137.,   173.,  6622.,  7880., 12355.,   157.,  1324.,
        1775.,  1111.,  1103.,   108.,   189., 10073.,  8634., 26032.,
        1116.,  6294.,   119.,  4095.,  1122.,  1209.,  1782.,   117.,
        1133.,  1191.,  1177.,   146.,   112.,  1325.,  3325.,  1103.,
         109.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.])>, 'attention_mask': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>}, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [9]:
dataset = dataset.batch(64)

In [10]:
for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(64, 50), dtype=float64, numpy=
array([[  101.,   137.,   173., ...,     0.,     0.,     0.],
       [  101.,   146.,  1306., ...,     0.,     0.,     0.],
       [  101.,   137., 14477., ...,     0.,     0.,     0.],
       ...,
       [  101.,   137.,   179., ...,     0.,     0.,     0.],
       [  101.,  1199.,  2196., ...,     0.,     0.,     0.],
       [  101.,   146.,  1341., ...,     0.,     0.,     0.]])>, 'attention_mask': <tf.Tensor: shape=(64, 50), dtype=float64, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])>}, <tf.Tensor: shape=(64,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

In [11]:
DS_LEN = len(df)

train_len = 0.8  # 90-10 split
val_len = 0.5

train = dataset.take(round(DS_LEN*train_len))  # get first 90% of batches
test = dataset.skip(round(DS_LEN*train_len))  # skip first 90% and keep final 10%

test = dataset.take(round(DS_LEN*val_len))  # get first 90% of batches
val = dataset.skip(round(DS_LEN*val_len))  # skip first 90% and keep final 10%



del dataset  # optionally, delete dataset to free up disk-space

In [12]:
# Define the BERT model
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
input_ids = tf.keras.layers.Input(shape=(50,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(50,), name='attention_mask', dtype='int32')

# we consume the last_hidden_state tensor from bert (discarding pooled_outputs)
embeddings = bert(input_ids, attention_mask=mask)[0]

X = tf.keras.layers.LSTM(64)(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
y = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(X)

# define input and output layers of our model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# freeze the BERT layer - otherwise we will be training 100M+ parameters...
model.layers[2].trainable = False

print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                            

In [14]:
# Define a checkpoint callback
'''checkpoint_path = "checkpoint/model.ckpt"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_best_only=True,  # Save only the best model based on a monitored metric
    monitor='val_acc',  # Metric to monitor for saving the best model
    verbose=1
)'''

'checkpoint_path = "checkpoint/model.ckpt"\ncheckpoint_callback = tf.keras.callbacks.ModelCheckpoint(\n    filepath=checkpoint_path,\n    save_weights_only=True,\n    save_best_only=True,  # Save only the best model based on a monitored metric\n    monitor=\'val_acc\',  # Metric to monitor for saving the best model\n    verbose=1\n)'

In [15]:
# Load the saved weights from the checkpoint
'''checkpoint_path = "checkpoint/model.ckpt"
model.load_weights(checkpoint_path)'''

'checkpoint_path = "checkpoint/model.ckpt"\nmodel.load_weights(checkpoint_path)'

In [16]:
print(tokenizer.vocab_size)

28996


In [17]:

optimizer = tf.keras.optimizers.Adam(0.01)
#loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
loss = tf.keras.losses.binary_crossentropy
binary_accuracy = BinaryAccuracy(name='binary_accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[binary_accuracy])

history = model.fit(train, validation_data=val, epochs=40)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [19]:
print(model.evaluate(test))

[0.5270599722862244, 0.8159999847412109]


In [36]:
pred = model.predict(test)



In [37]:
results = []

for p in pred:
    if p >= 0.5:
        results.append(1)
    else:
        results.append(0)

'''for a in labels:
    if a >= :
        answers.append(0)
    else:
        answers.append(1)'''

print(results)
print(labels.tolist())

[1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 

In [45]:
for i in test.take(1):
    print(i[1])

tf.Tensor(
[1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 0 1 0 0
 0 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0], shape=(64,), dtype=int64)


In [None]:
model.save('sentimentTransformerV3')



INFO:tensorflow:Assets written to: sentimentTransformerV2\assets


INFO:tensorflow:Assets written to: sentimentTransformerV2\assets
