In [None]:
import tensorflow as tf
# tf.test.is_gpu_available() is deprecated; query the physical device list
# instead and reduce it to the same True/False answer.
bool(tf.config.list_physical_devices('GPU'))

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the cleaned train and test splits from the local data directory.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/Corona_NLP_train_clean.csv',
        './data/Corona_NLP_test_clean.csv',
    )
)

In [None]:
from transformers import TFRobertaModel, RobertaConfig, RobertaTokenizer

# Load the pretrained RoBERTa encoder. Constructing the model from a bare
# RobertaConfig() leaves every weight randomly initialised, so the vectors
# it produces would carry no learned signal.
model = TFRobertaModel.from_pretrained("roberta-base")

# Keep a handle on the configuration used by the loaded model
configuration = model.config

In [None]:
# Load the tokenizer that ships with the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Preview only the first rows instead of dumping the whole frame.
train_set.head()

In [None]:
# Pass the raw tweets through unchanged: the RoBERTa tokenizer inserts its own
# special tokens (<s> ... </s>) while encoding. The literal "[CLS]" / "[SEP]"
# strings are BERT markers that RoBERTa does not treat as special tokens, so
# concatenating them only added extra ordinary tokens to every tweet.
X_train = list(train_set['OriginalTweet'].values)
y_train = list(train_set['SentimentCode'].values)

In [None]:
# Encode every tweet with the tokenizer.
# NOTE: this rebinds X_train from the list of raw strings to the tokenizer
# output, so re-running this cell on its own feeds already-encoded data back in.
X_train = tokenizer(X_train)

In [None]:
# List the fields present in the tokenizer output.
X_train.keys()

In [None]:
# Number of token ids in the first encoded example.
len(X_train['input_ids'][0])

In [None]:
# Number of attention-mask entries in the first encoded example.
len(X_train['attention_mask'][0])

In [None]:
# Record the token count of every encoded example, then take the largest
# (0 when there are no sequences at all).
lengths = [len(token_ids) for token_ids in X_train['input_ids']]
max_len = max(lengths, default=0)

In [None]:
# Longest token sequence found among the encoded examples.
max_len

In [None]:
def make_padded_tensor(ids, limit, pad_token_id=0):
    """Truncate or right-pad every encoded sequence to ``limit`` entries and build tensors.

    Args:
        ids: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries, each a
            sequence of per-example integer sequences.
        limit: Exact length every sequence has after processing. Longer
            sequences are cut on the right, shorter ones are padded on the right.
        pad_token_id: Value appended to ``input_ids`` shorter than ``limit``.
            Defaults to 0, which is what this helper has always used.
            NOTE(review): the tokenizer exposes its own padding id as
            ``tokenizer.pad_token_id`` -- consider passing that explicitly.

    Returns:
        A shallow copy of ``ids`` in which ``'input_ids'`` and
        ``'attention_mask'`` are ``tf.int32`` tensors with ``limit`` columns.
        The mapping passed in is left untouched, so the call can be repeated
        on the same input.
    """
    import copy  # local import: only this helper needs it

    def _fit(values, fill):
        # Cut to `limit`, then top up with `fill` until exactly `limit` long.
        clipped = list(values)[:limit]
        return clipped + [fill] * (limit - len(clipped))

    # Work on a copy: assigning into `ids` directly would replace the caller's
    # Python lists with tensors and make a second call on the same object fail.
    padded = copy.copy(ids)
    padded['input_ids'] = tf.constant(
        [_fit(seq, pad_token_id) for seq in ids['input_ids']], dtype=tf.int32)
    # Padded positions get mask value 0.
    padded['attention_mask'] = tf.constant(
        [_fit(mask, 0) for mask in ids['attention_mask']], dtype=tf.int32)
    return padded

In [None]:
# Fixed sequence length fed to the model; longer tweets are truncated and
# shorter ones padded by make_padded_tensor.
MAX_SEQ_LEN = 100
X_tensors = make_padded_tensor(X_train, MAX_SEQ_LEN)

In [None]:
def get_embeddings_batchwise(X_tensors, batch_size, model, embedding_size=768, layer=1):
    """Run ``model`` over the inputs in mini-batches and collect one vector per example.

    For each batch the model is asked for its hidden states, and the vector at
    sequence position 0 of the selected hidden-state entry is stored.

    Args:
        X_tensors: Mapping with ``'input_ids'`` and ``'attention_mask'``, both
            sliceable by example along the first axis.
        batch_size: Number of examples per forward pass; must be at least 1.
        model: Callable invoked as
            ``model(input_ids, attention_mask=..., output_hidden_states=True)``
            whose result holds the sequence of hidden states at index 2.
        embedding_size: Width of each stored vector (second dimension of the
            returned array).
        layer: Index into the hidden-state sequence. The default of 1 keeps
            the entry this function has always read.
            NOTE(review): confirm which layer is actually wanted; -1 would
            select the last entry.

    Returns:
        ``numpy.ndarray`` of shape ``(n_examples, embedding_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    n_examples = len(X_tensors['input_ids'])
    # Ceiling division: `1 + n // batch_size` scheduled an extra, empty batch
    # whenever n_examples was an exact multiple of batch_size.
    number_of_batches = -(-n_examples // batch_size)
    embeddings = np.zeros((n_examples, embedding_size))
    for i in range(number_of_batches):
        start = i * batch_size
        stop = start + batch_size
        print(f'batch {i} of {number_of_batches}. {start} of {n_examples} Examples')
        results = model(X_tensors['input_ids'][start:stop],
                        attention_mask=X_tensors['attention_mask'][start:stop],
                        output_hidden_states=True)
        hidden_states = results[2][layer]
        # Keep only the vector at sequence position 0 for every example.
        embeddings[start:stop] = np.asarray(hidden_states[:, 0])
    return embeddings

In [None]:
# Embed the whole padded training set, 128 examples per forward pass.
embeddings = get_embeddings_batchwise(X_tensors, 128, model)

In [None]:
import pickle
# Serialize the embedding matrix to disk with pickle.
with open('embeddings_train.pkl', 'wb') as pickle_file:
    pickle.dump(embeddings, pickle_file)

In [None]:
# Sanity check: expected to be (n_examples, embedding_size).
embeddings.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier