In [1]:
import tensorflow as tf

# Report whether TensorFlow can see at least one GPU.
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning asks for
# `tf.config.list_physical_devices('GPU')` instead. Wrapping the returned list
# in bool() keeps the cell output a plain True/False like before.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [2]:
import pandas as pd
import numpy as np
import pickle
# Switch for the embedding cache used further down: when True, the notebook
# reads previously pickled embeddings from disk instead of running the model.
use_saved_embeddings = True

In [3]:
# Locations of the cleaned train / test splits, relative to the notebook.
train_csv_path = './data/Corona_NLP_train_clean.csv'
test_csv_path = './data/Corona_NLP_test_clean.csv'

# Read both splits into DataFrames.
train_set = pd.read_csv(train_csv_path)
test_set = pd.read_csv(test_csv_path)

In [4]:
from transformers import TFRobertaModel, RobertaConfig, RobertaTokenizer

# Load RoBERTa together with its pretrained "roberta-base" weights.
# Constructing the model as TFRobertaModel(RobertaConfig()) only builds the
# architecture with randomly initialised weights, so the embeddings extracted
# later would not reflect any pretraining. The checkpoint name matches the one
# the tokenizer is loaded from.
# NOTE: embeddings pickled by an earlier run came from the previous model; set
# use_saved_embeddings = False once to regenerate them.
model = TFRobertaModel.from_pretrained("roberta-base")

# Keep a handle on the configuration the loaded model actually uses.
configuration = model.config

In [5]:
# Load the tokenizer published under the "roberta-base" checkpoint name; it is
# applied to the raw tweet strings in a later cell.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [6]:
# Display the training DataFrame to inspect its columns; the tweet text lives
# in 'OriginalTweet' and the integer class label in 'SentimentCode'.
train_set

Unnamed: 0.1,Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,SentimentCode
0,1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,2
1,2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,2
2,3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,2
3,4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the COVI...",Extremely Negative,0
4,5,3804,48756,"ÜT: 36.319708,-82.363649",16-03-2020,As news of the regions first confirmed COVID-...,Positive,2
...,...,...,...,...,...,...,...,...
41117,41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral,1
41118,41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative,0
41119,41154,44953,89905,,14-04-2020,You know its getting tough when is rationing ...,Positive,2
41120,41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral,1


In [7]:
# Raw tweet texts and their integer sentiment labels as plain Python lists.
#
# The texts are passed to the tokenizer unchanged: "[CLS]" / "[SEP]" are BERT
# markers, not RoBERTa special tokens, so concatenating them onto each tweet
# only injects extra ordinary sub-word tokens. The RoBERTa tokenizer adds its
# own sequence start/end tokens when it is called.
X_train = train_set['OriginalTweet'].tolist()
y_train = train_set['SentimentCode'].tolist()
X_test = test_set['OriginalTweet'].tolist()
y_test = test_set['SentimentCode'].tolist()

In [8]:
# Tokenize the raw strings. No padding or tensor-conversion arguments are
# passed here; make_padded_tensor (defined below) pads/truncates the sequences
# and converts them to tensors.
# NOTE(review): X_train / X_test are rebound from lists of strings to tokenizer
# output, so re-running this cell without first re-running the previous one
# feeds already-tokenized data back into the tokenizer.
X_train = tokenizer(X_train)
X_test = tokenizer(X_test)

In [9]:
def make_padded_tensor(ids, limit, pad_token_id=0):
    """Force every tokenized sequence to exactly ``limit`` tokens and tensorize.

    Sequences longer than ``limit`` are cut to their first ``limit`` tokens;
    shorter ones are right-padded. Attention masks are cut the same way and
    padded with 0 so padded positions are marked as "ignore".

    Args:
        ids: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries,
            each a list of per-example integer lists (as produced by calling
            the tokenizer without tensor conversion).
        limit: Target length of every sequence.
        pad_token_id: Token id written into padded positions of
            ``'input_ids'``. Defaults to 0 to preserve the previous behaviour.
            NOTE(review): the tokenizer's own padding id
            (``tokenizer.pad_token_id``) is probably the right value to pass;
            confirm and update the callers.

    Returns:
        The same ``ids`` object, with ``'input_ids'`` and ``'attention_mask'``
        replaced by ``tf.int32`` tensors built from the fixed-length rows.
    """
    padded_ids = []
    padded_masks = []
    for seq, mask in zip(ids['input_ids'], ids['attention_mask']):
        # Negative when the sequence is too long; `[x] * negative` is an empty
        # list, so a single expression covers both truncation and padding.
        missing = limit - len(seq)
        if missing < 0:
            # NOTE(review): truncation keeps only the leading tokens, so any
            # end-of-sequence marker on an over-long sequence is dropped.
            padded_ids.append(seq[:limit])
            padded_masks.append(mask[:limit])
        else:
            padded_ids.append(seq + [pad_token_id] * missing)
            padded_masks.append(mask + [0] * missing)

    # Build both tensors before touching `ids`, so a failure during conversion
    # does not leave the caller's data half-modified.
    input_ids_tensor = tf.constant(padded_ids, dtype=tf.int32)
    attention_mask_tensor = tf.constant(padded_masks, dtype=tf.int32)
    ids['input_ids'] = input_ids_tensor
    ids['attention_mask'] = attention_mask_tensor
    return ids

In [10]:
# Every tweet is padded or truncated to this many tokens.
max_tokens = 100

# Fixed-length int32 tensors for both splits.
X_train_tensors = make_padded_tensor(X_train, max_tokens)
X_test_tensors = make_padded_tensor(X_test, max_tokens)

In [11]:
def get_embeddings_batchwise(X_tensors, batch_size, model, embedding_size=768, layer=1):
    """Run ``model`` over the inputs in batches and collect one vector per example.

    For every batch the model is called with the input ids, the matching
    attention mask and ``output_hidden_states=True``. The vector at sequence
    position 0 of the selected hidden-state layer is stored for each example.

    Args:
        X_tensors: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries that support ``len()`` and row slicing.
        batch_size: Number of examples per model call; must be positive.
        model: Callable returning an indexable result whose element ``[2]`` is
            a sequence of per-layer hidden states.
            # assumes the (sequence output, pooled output, hidden states)
            # ordering of the installed transformers version; TODO confirm
        embedding_size: Width of each stored vector.
        layer: Index into the hidden-states sequence. Defaults to 1, the value
            that was previously hard-coded.
            NOTE(review): in Hugging Face models index 0 is usually the
            embedding output, which would make 1 the first transformer layer
            rather than the last; confirm this is intended (``-1`` would
            select the final layer).

    Returns:
        ``numpy.ndarray`` of shape ``(n_examples, embedding_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')

    n_examples = len(X_tensors['input_ids'])
    # Ceiling division: `1 + n // batch_size` produced an extra, empty batch
    # whenever n_examples was an exact multiple of batch_size.
    number_of_batches = -(-n_examples // batch_size)
    embeddings = np.zeros((n_examples, embedding_size))

    for i in range(number_of_batches):
        start = i * batch_size
        stop = start + batch_size
        print(f'batch {i} of {number_of_batches}. {start} of {n_examples} Examples')
        # The mask is passed by keyword so it cannot be bound to a different
        # positional parameter of the model's call signature.
        results = model(X_tensors['input_ids'][start:stop],
                        attention_mask=X_tensors['attention_mask'][start:stop],
                        output_hidden_states=True)
        hidden_dims = results[2][layer]
        # Keep only the vector at sequence position 0 of every example.
        embeddings[start:stop] = hidden_dims[:, 0]
    return embeddings

In [12]:
def _load_or_compute_embeddings(cache_path, X_tensors, use_cache, batch_size=128):
    """Return embeddings for ``X_tensors``, using ``cache_path`` as a pickle cache.

    When ``use_cache`` is true and the cache file exists, its contents are
    returned. Otherwise (cache disabled, or file missing) the embeddings are
    computed with ``get_embeddings_batchwise`` and the notebook-level ``model``,
    written to ``cache_path``, and returned.
    """
    if use_cache:
        try:
            with open(cache_path, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code; only load
                # cache files produced by this notebook.
                return pickle.load(f)
        except FileNotFoundError:
            # A missing cache is not an error: fall through and rebuild it.
            print(f'{cache_path} not found; computing embeddings instead')

    embeddings = get_embeddings_batchwise(X_tensors, batch_size, model)
    with open(cache_path, 'wb') as f:
        pickle.dump(embeddings, f)
    return embeddings


# Embeddings for both splits, loaded from disk when allowed and available.
train_embeddings = _load_or_compute_embeddings(
    'embeddings_train.pkl', X_train_tensors, use_saved_embeddings)
test_embeddings = _load_or_compute_embeddings(
    'embeddings_test.pkl', X_test_tensors, use_saved_embeddings)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import f1_score, plot_confusion_matrix

In [14]:
# Baseline 1: k-nearest-neighbours classifier (default settings) on the embeddings.
knn_class = KNeighborsClassifier()
knn_class = knn_class.fit(train_embeddings, y_train)

# Per-class F1 on the train and test splits. f1_score is documented as
# f1_score(y_true, y_pred); the ground-truth labels therefore go first.
print(f1_score(y_train, knn_class.predict(train_embeddings), average=None))
print(f1_score(y_test, knn_class.predict(test_embeddings), average=None))

[0.67318444 0.56462197 0.70023291]
[0.         0.         0.57880943]


In [None]:
# Baseline 2: support-vector classifier with a one-vs-one decision function.
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(train_embeddings, y_train)

# Per-class F1 on the train and test splits. f1_score is documented as
# f1_score(y_true, y_pred); the ground-truth labels therefore go first.
print(f1_score(y_train, clf.predict(train_embeddings), average=None))
print(f1_score(y_test, clf.predict(test_embeddings), average=None))