## Setup

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np

In [None]:
# Colab-only step: mount Google Drive so the spreadsheet paths used below resolve under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Folder on the mounted Drive that holds the labelled spreadsheets.
data_file_location = '/content/drive/My Drive/SYSC 4907 COVID & Deep Learning/Model Training/'
training_data_file = 'FINAL_COMBINED_3_TRAIN.xlsx'

# The folder string already ends with '/', so joining the two pieces directly gives the full path.
training_data_path = f"{data_file_location}{training_data_file}"
training_data = pd.read_excel(io=training_data_path)

# Show the first ten rows as the cell output.
training_data.head(n=10)

Unnamed: 0,DATA,LABEL,Unnamed: 2,Unnamed: 3
0,.... literally stuck in the 4th dimension...,0,"02/18/2021, 18:44:59",1362473201490681856
1,Looking for a map of #COVID19 vaccine provider...,1,"02/24/2021, 19:35:52",1364660332308889602
2,We need bold action -- not only to end the pan...,2,"02/18/2021, 18:42:04",1362472468401840129
3,Literally looks like a cult...,0,"02/24/2021, 19:37:01",1364660621850136582
4,This is what “school reopening” looks like. Re...,0,"02/18/2021, 18:45:10",1362473247829336071
5,"voidedOrc36584 allofDuty Brother, chances are ...",1,"02/18/2021, 18:43:46",1362472893901377545
6,"harliekirk11 Hey you dumb fuck, do a little re...",0,"03/07/2021, 22:21:34",1368688300958056448
7,Would you really make this appeal if you had p...,2,"02/24/2021, 19:44:57",1364662620893175808
8,Spin Sensation ujeeb_R88 has joined Peshawar Z...,1,"02/18/2021, 18:44:30",1362473080719826953
9,"Early on in the pandemic, covid-19 outbreaks d...",0,"02/18/2021, 18:40:19",1362472027353980931


In [None]:
# Held-out spreadsheet, read from the same Drive folder as the training file.
test_data_file = 'FINAL_COMBINED_3_TEST.xlsx'
test_data_path = f"{data_file_location}{test_data_file}"
test_data = pd.read_excel(io=test_data_path)

# Show the first ten rows as the cell output.
test_data.head(n=10)

Unnamed: 0,DATA,LABEL,Unnamed: 2,Unnamed: 3
0,Why should anyone have to hack for this techno...,1,"02/18/2021, 18:40:36",1362472099185590273
1,Did they test positive for COVID?,1,"03/07/2021, 17:52:06",1368620488063324166
2,Our EY COVID-19 Response Group supporting #Thu...,2,"02/18/2021, 18:45:07",1362473236781498370
3,Joe Biden's Coronavirus Coordinator just said ...,0,"03/07/2021, 17:52:30",1368620587027927042
4,#Tanzania: Vice President uluhuSamia with no f...,1,"02/18/2021, 18:45:07",1362473237414821890
5,"COVID-19 hospitalizations dropped to 1,137 Thu...",1,"02/18/2021, 18:41:21",1362472286624751618
6,Look like it nuh sink ina your cerebrum yet......,0,"03/07/2021, 17:52:26",1368620569386758149
7,"*spring 2023* PRINCETON, N.J. — The Ivy League...",1,"02/18/2021, 18:45:05",1362473227163996164
8,"It’s not just the industry trifecta"" of chicke...",0,"02/24/2021, 19:36:47",1364660566824992769
9,"Fuck Johnathan Corona, all the homies hate Joh...",0,"02/18/2021, 18:45:25",1362473312505356289


In [None]:
# Number of training rows per LABEL value (class balance check).
training_data['LABEL'].value_counts()

0    1672
1    1425
2     895
Name: LABEL, dtype: int64

In [None]:
# Number of test rows per LABEL value (class balance check).
test_data['LABEL'].value_counts()

0    229
1    208
2     76
Name: LABEL, dtype: int64

In [None]:
# Pull the text (DATA) and class id (LABEL) columns out of each frame as NumPy arrays.
subset = training_data[['DATA', 'LABEL']]
x_train, y_train = subset['DATA'].to_numpy(), subset['LABEL'].to_numpy()

# Note: the arrays taken from the test spreadsheet are stored under the *_val names.
subset = test_data[['DATA', 'LABEL']]
x_val, y_val = subset['DATA'].to_numpy(), subset['LABEL'].to_numpy()

# Sanity check: a few raw samples, their labels, and the size of each split.
print(x_train[:5])
print(len(x_train), "Training sequences")
print(y_train[:5])
print(len(x_val), "Testing sequences")

['.... literally stuck in the 4th dimension...'
 'Looking for a map of #COVID19 vaccine providers? Find our interactive map here: #InThisTogetherOhio'
 'We need bold action -- not only to end the pandemic, but to rebuild our health care system.'
 'Literally looks like a cult...'
 'This is what “school reopening” looks like. Reopening schools before every teacher is vaccinated is sanctioning mass death.']
3992 Training sequences
[0 1 2 0 0]
513 Testing sequences


## Implement a Transformer block as a layer

## Implement embedding layer

Two separate embedding layers: one for the tokens and one for the token indices (positions).

In [None]:
def tokenize(inp, vocab_size=25000):
  """Encode every text in `inp` as a list of integer word indices.

  Each entry is passed to `tf.keras.preprocessing.text.one_hot` with
  `lower=True`. The encoding is hash based, so two different words can
  share an index.

  Unlike the earlier in-place version, `inp` is left untouched. The array
  handed in may share memory with a DataFrame column, so overwriting it
  could silently replace the raw text there as well.

  Args:
    inp: Sized, iterable collection of strings (e.g. a 1-D object array).
    vocab_size: Forwarded to `one_hot` as `n`. Defaults to 25000, the value
      that used to be hard-coded.

  Returns:
    A new 1-D NumPy object array with one list of word indices per entry
    of `inp`, in the same order.
  """
  # An object array keeps the ragged per-text lists as individual elements.
  encoded = np.empty(len(inp), dtype=object)
  for idx, text in enumerate(inp):
    encoded[idx] = tf.keras.preprocessing.text.one_hot(text, n=vocab_size, lower=True)
  return encoded

In [None]:
# Replace the raw text arrays with their integer encodings (training split first, then test split).
# NOTE: the names are rebound to the encoded data, so re-running this cell alone would feed
# already-encoded sequences back into tokenize(); re-run the cell that builds x_train/x_val first.
x_train, x_val = map(tokenize, (x_train, x_val))

In [None]:
# Every encoded text is padded or cut to exactly `maxlen` token ids so the model sees a fixed width.
maxlen = 40
x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

In [None]:

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position table, i.e. the longest
            sequence (in tokens) this layer can embed.
        vocab_size: Number of rows in the token table; token ids fed to the
            layer must be smaller than this.
        embed_dim: Width of both embedding vectors.
        **kwargs: Standard `keras.layers.Layer` keyword arguments such as
            `name`; also needed so the layer can be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the integer ids in `x` and add one position vector per index of the last axis."""
        # Positions are derived from the runtime length of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(x)
        # `positions` is 1-D, so its embedding has no batch axis and is broadcast onto every sequence.
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so Keras can save and re-create the layer."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [None]:

class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention, then a two-layer feed-forward net.

    Each sub-block is followed by dropout, a residual addition and layer
    normalization (normalization is applied after the residual sum).

    Args:
        embed_dim: Width of the incoming vectors; also the output width of the
            feed-forward net. It is passed to the attention layer as
            `key_dim`, so the per-head size is not divided by `num_heads`.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward net.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Standard `keras.layers.Layer` keyword arguments such as
            `name`; also needed so the layer can be rebuilt from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.2, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor whose last axis has width `embed_dim`.
            training: Forwarded to both dropout layers. The default `None`
                lets the layer be called as `block(x)` and leaves the
                train/inference decision to Keras.

        Returns:
            A tensor with the same shape as `inputs`.
        """
        # Self-attention: the same tensor is used as query and as value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so Keras can save and re-create the layer."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


In [None]:
# Fix TensorFlow's global seed before any layer is created, so that weight initialisation and
# dropout are repeatable between runs (as far as the global seed governs them).
SEED = 42
tf.random.set_seed(SEED)

vocab_size = 25000  # Size of the token-id space; must equal the `n` that tokenize() passes to one_hot
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Pipeline: token ids -> token+position embedding -> one transformer block -> mean over the
# sequence axis -> small dense head -> softmax over the 3 label classes.
INP = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x1 = embedding_layer(INP)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x2 = transformer_block(x1)
# Collapse the sequence axis by averaging; padded positions are included in the mean.
x3 = layers.GlobalAveragePooling1D()(x2)
x4 = layers.Dropout(0.1)(x3)
layer_dense = layers.Dense(20, activation="relu")(x4)
x_drop = layers.Dropout(0.2)(layer_dense)
OUT = layers.Dense(3, activation="softmax")(x_drop)

model = keras.Model(inputs=INP, outputs=OUT)


In [None]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 40)]              0         
_________________________________________________________________
token_and_position_embedding (None, 40, 32)            801280    
_________________________________________________________________
transformer_block_1 (Transfo (None, 40, 32)            10656     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 32)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                660       
_________________________________________________________________
dropout_7 (Dropout)          (None, 20)                0   

## Train and Evaluate

In [None]:
# Labels are plain integer class ids, hence the "sparse" variants of the loss and the metric.
opt = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=opt,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

In [None]:
# x_val / y_val were built from the test spreadsheet, so the per-epoch "validation" figures
# reported by fit() are scores on that test split.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
