#### code modified from: https://towardsdatascience.com/discover-the-sentiment-of-reddit-subgroup-using-roberta-model-10ab9a8271b8

## Train on Covid-CA/Election-16

In [1]:
# Delete every path matching phase2-models* in the working directory; the
# training callback further down creates directories with this prefix.
!rm -rf phase2-models*

In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import TFRobertaForSequenceClassification
from transformers import RobertaTokenizer, RobertaConfig, AutoTokenizer
import os


# Load your Dataset
# Rows containing any missing value are dropped before the split.
train_tweets = pd.read_csv('data/Covid_CA_new.csv').dropna()
# train_tweets = pd.read_csv('data/Election16_new.csv').dropna()

# Fixed seed so the 80/20 train/validation split is identical on every
# Restart & Run All; without it each run trains and validates on different rows.
split_seed = 42
training_sentences, testing_sentences = train_test_split(train_tweets[['text', 'target']],
                                                         test_size=0.2,
                                                         random_state=split_seed)
# model initialization
# The classification head is created with 5 output classes, so the 'target'
# column is presumably an integer label in 0..4 -- TODO confirm against the CSV.
model = TFRobertaForSequenceClassification.from_pretrained("vinai/bertweet-covid19-base-uncased", num_labels=5)
# model = TFRobertaForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=5)
# The tokenizer is loaded from the same checkpoint as the model.
roberta_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-covid19-base-uncased", use_fast=False)
# roberta_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# Every encoded example is padded/truncated to this many tokens.
max_length = 128

# Batch size used for training, validation and prediction datasets.
batch_size = 64

def convert_example_to_feature(review):
    """Tokenize one text into fixed-length model inputs.

    Args:
        review: the raw text (a ``str``) to encode.

    Returns:
        The tokenizer's encoding for ``review``; callers read its
        ``'input_ids'`` and ``'attention_mask'`` entries, each of length
        ``max_length``.
    """
    return roberta_tokenizer.encode_plus(review,
                                 add_special_tokens=True,  # add the tokenizer's special tokens
                                 max_length=max_length,  # fixed sequence length fed to the model
                                 # `padding='max_length'` replaces the deprecated
                                 # `pad_to_max_length=True`: pad short texts up to max_length.
                                 padding='max_length',
                                 # Truncate long texts explicitly instead of relying on the
                                 # implicit default (which emits a warning on every run).
                                 truncation=True,
                                 return_attention_mask=True,  # mask so padding tokens are ignored
                                 )

# Shape one encoded example into the (features, label) pair that is fed to
# TFRobertaForSequenceClassification.
def map_example_to_dict(input_ids, attention_masks, label):
    """Return ``({'input_ids': ..., 'attention_mask': ...}, label)``.

    The three arguments are passed through untouched; only their packaging
    changes.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_masks)
    return features, label

def encode_examples(ds, limit=-1):
    """Tokenize a dataset of (text, label) pairs into model-ready features.

    Args:
        ds: a ``tf.data.Dataset`` yielding ``(text, label)`` pairs, where the
            text arrives as bytes once converted to numpy.
        limit: when positive, only the first ``limit`` examples are encoded;
            the default (-1) encodes everything.

    Returns:
        An unbatched ``tf.data.Dataset`` of
        ``({'input_ids', 'attention_mask'}, label)`` elements.
    """
    if limit > 0:
        ds = ds.take(limit)

    # Collect the encoded columns in plain Python lists first.
    ids_rows, mask_rows, label_rows = [], [], []
    for raw_text, target in tfds.as_numpy(ds):
        encoded = convert_example_to_feature(raw_text.decode())
        ids_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])
        # Each label is wrapped in a one-element list.
        label_rows.append([target])

    tensor_ds = tf.data.Dataset.from_tensor_slices((ids_rows, mask_rows, label_rows))
    return tensor_ds.map(map_example_to_dict)

# Wrap the 'text' and 'target' columns of each split as tf.data datasets of
# (text, label) pairs, the input format encode_examples iterates over.
training_sentences_modified = tf.data.Dataset.from_tensor_slices((training_sentences['text'],
                                                                  training_sentences['target']))

testing_sentences_modified = tf.data.Dataset.from_tensor_slices((testing_sentences['text'],
                                                                 testing_sentences['target']))

# Training pipeline: the encoded examples are repeated twice, shuffled with a
# 10000-element buffer, then batched -- so one pass over ds_train_encoded
# covers the training split two times.
ds_train_encoded = encode_examples(training_sentences_modified).repeat(2).shuffle(10000).batch(batch_size)
# Held-out split: encoded and batched only (no repeat, no shuffle).
ds_test_encoded = encode_examples(testing_sentences_modified).batch(batch_size)



# Optimizer step size and number of passes handed to model.fit below.
learning_rate = 7e-5
number_of_epochs = 20

class ModelMetrics(tf.keras.callbacks.Callback):
    """Keras callback that saves the model after every epoch.

    Checkpoints are written with ``save_pretrained`` to directories named
    ``phase2-models<N>`` in the current working directory, where ``N`` starts
    at 1 for each call to ``fit`` and increases by one per epoch.
    """

    def on_train_begin(self, logs=None):
        # Restart checkpoint numbering whenever training starts.
        # `logs=None` replaces the shared mutable default `logs={}`.
        self.count_n = 1

    def on_epoch_end(self, batch, logs=None):
        # The first parameter keeps its original name for compatibility; Keras
        # passes the epoch index in this position. It is not used here because
        # the checkpoint number comes from self.count_n.
        checkpoint_dir = 'phase2-models' + str(self.count_n)

        # exist_ok=True: re-running training without first deleting old
        # checkpoints overwrites them instead of raising FileExistsError.
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.model.save_pretrained(checkpoint_dir)

        self.count_n += 1

# Callback instance that saves a checkpoint directory at the end of each epoch.
metrics = ModelMetrics()

# Disabled experiment: uncommenting the next line would freeze the model's first layer.
# model.layers[0].trainable = False
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# from_logits=True: the loss is computed from the model's unnormalised scores;
# labels are integer class ids (sparse), not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# Train for number_of_epochs epochs, evaluating on ds_test_encoded as the
# validation set and running the checkpoint callback.
model.fit(ds_train_encoded, epochs=number_of_epochs,
          validation_data=ds_test_encoded, callbacks=[metrics])

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some layers from the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoin

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f44f2d317d0>

## Test on 2020 tweets

In [9]:
import preprocessor as p

def predict(test_tweets, model, output_name):
    """Clean the tweets, classify them with ``model`` and save the result as CSV.

    Args:
        test_tweets: DataFrame with a ``'text'`` column. It is modified in
            place: a ``'processed_text'`` column is added and ``'target'`` is
            created/overwritten.
        model: the fine-tuned classifier; its ``predict`` method is called on
            the batched dataset built by ``encode_examples``.
        output_name: path of the CSV file to write (without the index column).

    Returns:
        The same DataFrame object, with ``'target'`` holding the predicted
        class index for every row.
    """
    # Clean all texts in one pass and assign the column positionally. The
    # previous row-by-row `.loc[i, ...]` write with an enumerate counter was
    # only correct when the index was exactly 0..n-1, and was slow.
    test_tweets['processed_text'] = [p.clean(text) for text in test_tweets['text']]

    # Placeholder labels: encode_examples iterates over (text, label) pairs.
    test_tweets['target'] = 0
    # prepare data as per RoBERTa model input
    submission_sentences_modified = tf.data.Dataset.from_tensor_slices((test_tweets['processed_text'],
                                                              test_tweets['target']))
    ds_submission_encoded = encode_examples(submission_sentences_modified).batch(batch_size)

    # predict the class of each tweet
    outputs = model.predict(ds_submission_encoded)
    # Pull the logits out before the softmax. The original applied softmax to
    # the whole return value and indexed [0] afterwards, which only works for
    # a tuple; a dict-like output (presumably what other library versions
    # return -- verify) is handled by key.
    logits = outputs['logits'] if isinstance(outputs, dict) else outputs[0]
    submission_pre = tf.nn.softmax(logits)
    submission_pre_argmax = tf.math.argmax(submission_pre, axis=1)
    # Store a plain numpy array in the frame rather than a TF tensor.
    test_tweets['target'] = submission_pre_argmax.numpy()
    test_tweets.to_csv(output_name, index=False) # save to file
    return test_tweets

In [4]:
# Load the two 2020 tweet files and tag every row with the file it came from.
data1 = pd.read_csv('data/Election20.csv')
data2 = pd.read_csv('data/Covid_US.csv')
data1['label'] = 'election'
data2['label'] = 'covid'
# Stack both frames into one; no ignore_index is passed, so each frame's own
# row labels are carried over.
test_data = pd.concat([data1, data2])
test_data = test_data.sample(frac=1) # shuffle all rows; no random_state, so the order differs between runs


In [5]:
# Renumber the rows 0..n-1 after the concat and shuffle, discarding the old index.
test_data = test_data.reset_index(drop=True)
# Display summary statistics of the combined frame.
test_data.describe()

Unnamed: 0,time,text,label
count,2317,2317,2317
unique,2268,2164,2
top,Thu Aug 20 18:35:07 +0000 2020,RT @realDonaldTrump: Many more people would ha...,election
freq,3,8,1298


In [6]:
# Classify the combined 2020 tweets with the model trained above and write the
# labelled frame to CSV. predict() also adds its columns to test_data itself.
predict(test_data, model, 'predict-data/Covid_model_predict.csv')



Unnamed: 0,time,text,label,processed_text,target
0,Thu Oct 22 18:23:59 +0000 2020,RT @BillOReilly: Lesley Stahl denies the econo...,covid,: Lesley Stahl denies the economy before the p...,1
1,Wed Oct 07 18:17:08 +0000 2020,RT @CaslerNoel: The first time I heard a story...,election,: The first time I heard a story about Trump r...,1
2,Wed Oct 21 18:20:29 +0000 2020,RT @seanhannity: BREAKING: Jim Jordan Says Sta...,election,: BREAKING: Jim Jordan Says Staff Has Independ...,1
3,Tue Oct 06 18:22:43 +0000 2020,RT @AntillanaSoy_: nobody recovers from COVID-...,covid,: nobody recovers from COVID-19 in days ... sp...,1
4,Tue Aug 18 18:01:22 +0000 2020,RT @phatpussymo: Like cool... but she’s dead a...,election,: Like cool... but shes dead and theres thousa...,1
...,...,...,...,...,...
2312,Tue Oct 06 18:29:05 +0000 2020,RT @Christo29932651: @kimKBaltimore 60 years o...,election,: years of incremental feminization &amp; atta...,1
2313,Tue Sep 29 18:09:36 +0000 2020,RT @kylegriffin1: Inbox: Biden for President a...,covid,: Inbox: Biden for President announced Joe Bid...,2
2314,Wed Oct 07 18:48:34 +0000 2020,RT @NikkoGuy: my mental health during coronavi...,covid,: my mental health during coronavirus,2
2315,Wed Sep 30 18:32:19 +0000 2020,@BillCorbett Was freshman in highschool at age...,election,"Was freshman in highschool at age , quite diff...",1
