In [1]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import transformers

In [3]:
import pandas as pd
import numpy as np
import pickle
use_saved_embeddings = True

In [4]:
train_set = pd.read_csv('./data/Corona_NLP_train_clean_no_hash.csv')
test_set = pd.read_csv('./data/Corona_NLP_test_clean_no_hash.csv')

In [5]:
from transformers import TFRobertaForSequenceClassification, RobertaConfig, RobertaTokenizer, glue_convert_examples_to_features

# Initializing a BERT bert-base-uncased style configuration
configuration = RobertaConfig()
configuration.update({'num_labels': 3})
# Initializing a model from the bert-base-uncased style configuration
model = TFRobertaForSequenceClassification(configuration)

# Accessing the model configuration
configuration = model.config
model

<transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification at 0x223fe765f70>

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [7]:
class_token = "[CLS]"
eos_token = "[SEP]"

In [8]:
tokenizer.cls_token = class_token
tokenizer.eos_token = eos_token

In [9]:
X_train = list(train_set['OriginalTweet'].apply(lambda x: class_token + x + eos_token).values)
y_train = list(train_set['SentimentCode'].values)
X_test = list(test_set['OriginalTweet'].apply(lambda x: class_token + x + eos_token).values)
y_test = list(test_set['SentimentCode'].values)

In [10]:
X_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='tf', max_length=100)
X_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='tf', max_length=100)

In [11]:
y_train_onehot = np.zeros((len(y_train), 3))
for i, hot in enumerate(y_train):
    y_train_onehot[i ,hot] = 1
y_test_onehot = np.zeros((len(y_test), 3))
for i, hot in enumerate(y_test):
    y_test_onehot[i ,hot] = 1
y_test_onehot

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [12]:
X_train['input_ids']

<tf.Tensor: shape=(41106, 100), dtype=int32, numpy=
array([[    3, 10975,  7454, ...,     1,     1,     1],
       [    3, 10975,  7454, ...,     1,     1,     1],
       [    3, 10975,  7454, ...,     1,     1,     1],
       ...,
       [    3, 10975,  7454, ...,     1,     1,     1],
       [    3, 10975,  7454, ...,     1,     1,     1],
       [    3, 10975,  7454, ...,     1,     1,     1]])>

In [13]:
train_set['OriginalTweet'].apply(lambda x: class_token + x + eos_token).values

array(['[CLS]advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order[SEP]',
       '[CLS]Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak [SEP]',
       "[CLS]My food stock is not the only one which is empty... PLEASE, don't panic, THERE WILL BE ENOUGH FOOD FOR EVERYONE if you do not take more than you need. Stay calm, stay safe. [SEP]",
       ...,
       '[CLS]You know it\x92s getting tough when is rationing toilet paper martinsville, help us out!![SEP]',
       '[CLS]Is it wrong that the smell of hand sanitizer is starting to turn me on? [SEP]',
       "[CLS] Well new/used Rift S are going for $700.00 on Amazon rn although the normal market price is usually $400.00 . Prices are really crazy right now for vr headsets since HL Alex was announced an

In [14]:
callbacks = list()

In [15]:
callbacks.append(tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.001, patience=4, verbose=1,
    mode='auto', baseline=None, restore_best_weights=False
))

In [16]:
filepath ='./models'
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
    filepath, monitor='val_loss', verbose=1, save_best_only=False,
    save_weights_only=False, mode='auto', save_freq='epoch',
    options=None
))

fit(
    x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None,
    validation_split=0.0, validation_data=None, shuffle=True, class_weight=None,
    sample_weight=None, initial_epoch=0, steps_per_epoch=None,
    validation_steps=None, validation_batch_size=None, validation_freq=1,
    max_queue_size=10, workers=1, use_multiprocessing=False
)


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)
model.fit(x=X_train['input_ids'], y=tf.constant(y_train_onehot), epochs=30, steps_per_epoch=1000, batch_size=8,
         validation_data=(X_test['input_ids'] , tf.constant(y_test_onehot)), validation_batch_size=8, callbacks=callbacks)

Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method

Epoch 00001: saving model to .\models
Epoch 2/30

Epoch 00002: saving model to .\models
Epoch 3/30

Epoch 00003: saving model to .\models
Epoch 4/30

Epoch 00004: saving model to .\models
Epoch 5/30

In [None]:
model.save('roberta_classifier_early_stop.tfm')

In [None]:
y_pred = model.predict(X_test['input_ids'])

In [None]:
y_pred.logits[:32,:]

In [None]:
tf.nn.softmax(y_pred.logits, axis=1).numpy().argmax(axis=1)

In [None]:
if

In [None]:
if
# stuff = model(X_tensors['input_ids'][:10], X_tensors['attention_mask'][:10], output_hidden_states=True)
if use_saved_embeddings:
    with open('embeddings_train.pkl', 'rb') as f:
        train_embeddings = pickle.load(f)
    with open('embeddings_test.pkl', 'rb') as f:
        test_embeddings = pickle.load(f)
else:
    train_embeddings = get_embeddings_batchwise(X_train_tensors, 128, model)
    test_embeddings = get_embeddings_batchwise(X_test_tensors, 128, model) 
    with open('embeddings_train.pkl', 'wb') as f:
        pickle.dump(train_embeddings, f)
    with open('embeddings_test.pkl', 'wb') as f:
        pickle.dump(test_embeddings, f)

In [None]:
len(X_test['input_ids'])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import f1_score, plot_confusion_matrix

In [None]:
f1_score(y_test, tf.nn.softmax(y_pred.logits, axis=1).numpy().argmax(axis=1), average=None)