## Installing & importing needed libraries

In [1]:
# ! pip install transformers
# ! pip install nltk
# ! pip install --upgrade gdown
# ! pip install pytorch-crf

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re
import os
import numpy as np
import pandas as pd
from IPython import display
import time

import torch
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

from transformers import BertTokenizer, TFBertForSequenceClassification, BertModel
from transformers import InputExample, InputFeatures

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score

import matplotlib.pyplot as plt
import gdown
from tqdm import tqdm, trange
import time

In [3]:
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [4]:
# nltk.download('all-corpora')

Downloading dataset

In [5]:
# uri = "https://drive.google.com/uc?id=1fJujYj3rBuh34FukQa2bG7lhefcYNNwX"
# output = "dataset/dataset_steam_review.csv"
# if not os.path.exists("dataset/"):
#   os.makedirs("dataset/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

## Preprocessing Data

In [6]:
# df_steam_reviews = pd.read_csv(output)
df_steam_reviews = pd.read_csv("D:/Training/Machine Learning/Datasets/dataset_steam_review/dataset.csv")
df_steam_reviews = df_steam_reviews.sample(frac=1).reset_index(drop=True) #shuffle the data
df_steam_reviews.shape

(6417106, 5)

Remove early access reviews

In [7]:
# remove Early Access Reviews
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != 'Early Access Review']
# size of dataframe
df_steam_reviews.shape

(5392419, 5)

In [8]:
# Preprocessing

def remove_links(x):
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r"https\S+", "", x)
    x = re.sub(r"www.\S+", "", x)
    x = re.sub(".*\..*\..*", "", x)
    return x

def remove_hashtag(x):
    x = re.sub("@[A-Za-z0-9_]+","", x)
    x = re.sub("#[A-Za-z0-9_]+","", x)
    return x

def remove_punct(x):
    x = re.sub(r"[()!?:;,.'-]","", x)
    return x

def remove_emoji(x):
    x = x.replace(":)", "")
    x = x.replace(":-)", "")
    x = x.replace(":(", "")
    x = x.replace("-_-", "")
    x = x.replace(";)", "")
    x = x.replace(";-)", "")
    # REFERENCE FOR EMOJI_PATTERN: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    x = emoji_pattern.sub(r'', x)
    return x

def remove_titiktitik(x):
    x = x.replace("..", "")
    x = x.replace("...", "")
    x = x.replace("....", "")
    x = x.replace(".....", "")
    x = x.replace("...................", "")
    return x

def remove_money(x):
    x = re.sub("€", "", x)
    x = re.sub("$", "", x)
    x = x.replace("usd", "")
    return x

def fix_typo(x):
    x = x.replace("veru", "very")
    x = x.replace("gud", "good")
    x = x.replace("gut", "good")
    x = x.replace("withouth", "without")
    x = x.replace("noob", "newbie")
    x = x.replace("dis", "this")
    x = x.replace("noobs", "newbie")
    x = x.replace("nice1", "nice")
    x = x.replace("4ever", "forever")
    x = x.replace("w0n", "won")
    x = re.sub("&lt;3", "", x)
    x = x.replace("graficks", "graphics")
    x = x.replace("dissapeared", "disappeared")
    x = x.replace("yr", "year")
    x = x.replace("yrs", "years")
    x = x.replace("dosent", "doesnt")
    x = x.replace("awsume","awesome")
    x = re.sub("&lt3","",x)
    x = x.replace("compatative", "competitive")
    x = x.replace("cyyounterstrikesyyource", "counter strike source")
    x = x.replace("&amp","and")
    x = x.replace("yyoure","your")
    x = x.replace("cyyounter","counter")
    x = x.replace("child hood", "childhood")
    return x

In [9]:
# convert review text to string
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].astype(str)
# convert to lowercase
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(str.lower)

# drop the reviews with null score
df_steam_reviews = df_steam_reviews[df_steam_reviews["review_score"].notnull()]
df_steam_reviews["review_score"] = np.where(df_steam_reviews["review_score"]==-1, 0, df_steam_reviews["review_score"])

# remove links
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_links)

# remove hashtag
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_hashtag)

# removing punctuation
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_punct)

# removing dots
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_titiktitik)

# removing emoji
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_emoji)

# remove money symbols
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_money)

# fix any typo
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(fix_typo)

# remove any stopwords
stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# lemmatize
lemmatizer = WordNetLemmatizer()
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split()))

#remove empty text
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != '']

# distribution of negative and positive reviews
df_steam_reviews["review_score"].value_counts()

1    1965881
0     300832
Name: review_score, dtype: int64

In [10]:
# Most reviewed game
df2 = df_steam_reviews.groupby(['app_id'])['app_id'].count().reset_index(name='count').sort_values(['count'], ascending=False)
df3 = df2['app_id'].values.tolist()[0:10]

print(df2.iloc[:10])
# print(df3)

for id in df3:
    df_gn = df_steam_reviews.loc[df_steam_reviews['app_id'].isin([id])]['app_name'].unique()
    print(df_gn)

      app_id  count
22       570  48575
1565  218620  47112
1181  105600  46821
21       550  31270
2156  252950  30497
1739  230410  25308
5872  391540  21905
2582  271590  21651
23       620  21038
132     4000  17596
['Dota 2']
['PAYDAY 2']
['Terraria']
['Left 4 Dead 2']
['Rocket League']
['Warframe']
['Undertale']
['Grand Theft Auto V']
['Portal 2']
["Garry's Mod"]


In [11]:
reviewed_steam = df_steam_reviews.loc[df_steam_reviews["app_id"].isin(df3)] # take 10 most reviewed game only

# sampling the data
p = 0.05
reviewed_steam = reviewed_steam.sample(frac = p).reset_index(drop=True) # take 5% of data and shuffle it

Balancing data

In [12]:
# # take the positive as many as the negative ones
total_data = len(reviewed_steam["review_score"])
total_data_positive = len(reviewed_steam[reviewed_steam["review_score"] == 1])
total_data_negative = len(reviewed_steam[reviewed_steam["review_score"] == 0])

print("Total data cleaned:", total_data)
print("Total data positive:", total_data_positive)
print("Total data negative:", total_data_negative)

# df_steam_reviews_balanced_positive = reviewed_steam[reviewed_steam["review_score"] == 1].sample(n = total_data_negative) 
# df_steam_reviews_balanced_negative = reviewed_steam[reviewed_steam["review_score"] == 0] 
# df_steam_reviews_balanced = pd.concat([df_steam_reviews_balanced_positive, df_steam_reviews_balanced_negative])
# df_steam_reviews_balanced = df_steam_reviews_balanced.sample(frac = 1).reset_index(drop=True) #shuffle the data again
# df_steam_reviews_balanced.head()

# print("Data Balanced with ratio 1:1...")
# print(f"Jumlah data positive : jumlah data negative = {len(df_steam_reviews_balanced_positive)} : {total_data_negative}")

Total data cleaned: 15589
Total data positive: 14073
Total data negative: 1516


In [13]:
# reviews = df_steam_reviews_balanced["review_text"].values.tolist()
# labels = df_steam_reviews_balanced["review_score"].tolist()
reviews = reviewed_steam["review_text"].values.tolist()
labels = reviewed_steam["review_score"].tolist()

In [14]:
print(len(reviews))

15589


Preparing data into train/valid/test split

In [15]:
# split the dataset into train, validation and holdout sets (60-20-20)
training_sentences, test_sentences, training_labels, test_labels = train_test_split(reviews, labels, test_size=.4)

validation_sentences, holdout_sentences, validation_labels, holdout_labels = train_test_split(test_sentences, test_labels, test_size=.5)

## Vector Extraction using BERT

In [16]:
MAX_LEN = 128
NUM_LABELS = 2
BATCH = 64
DEVICE_USED = "cuda:0"
LEARNING_RATE = 1e-6
LAMBDA_L2 = 2e-6
EPOCHS = 250
MODEL_PATH = "D:/Training/Machine Learning/NLP/Sentiment Analysis/proposed_model/bilstm_crf/"

In [17]:
device = torch.device(DEVICE_USED if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [18]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('D:/Training/Machine Learning/Datasets/bert-base-uncased', model_max_length=MAX_LEN)
# tokenizer = BertTokenizer.from_pretrained('/home/jupyter-23521059/bert-base-uncased', model_max_length=MAX_LEN)

In [19]:
def TokenizeDataset(x_data, y_data):
    if len(x_data) != len(y_data):
        raise Exception("x_data and y_data size are different!")
    
    t = trange(len(x_data), colour="green", position=0, leave=True)
    
    out_padded_token_list = []
    out_att_mask = []
    out_tok_type_id = []
    out_target = []
    
    for sentence_idx in t:
        t.set_description(f"Tokenizing data [{sentence_idx + 1}/{len(x_data)}]...")
        encoded_sentence = tokenizer.encode_plus(
            x_data[sentence_idx],
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation = 'longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        padded_token_list = encoded_sentence['input_ids']
        att_mask = encoded_sentence['attention_mask']
        tok_type_id = encoded_sentence['token_type_ids']
        target = torch.tensor(y_data[sentence_idx])
        
        out_padded_token_list.append(padded_token_list)
        out_att_mask.append(att_mask)
        out_tok_type_id.append(tok_type_id)
        out_target.append(target)
        
    output_data = {
        "input_ids": out_padded_token_list,
        "attention_mask": out_att_mask,
        "token_type_ids": out_tok_type_id,
        "label": out_target
    }

    return output_data

In [20]:
tokenized_training = TokenizeDataset(training_sentences, training_labels)

Tokenizing data [9353/9353]...: 100%|[32m█████████████████████████████████████████████[0m| 9353/9353 [00:10<00:00, 888.89it/s][0m


In [21]:
tokenized_validation = TokenizeDataset(validation_sentences, validation_labels)

Tokenizing data [3118/3118]...: 100%|[32m█████████████████████████████████████████████[0m| 3118/3118 [00:03<00:00, 898.66it/s][0m


In [22]:
tokenized_holdout = TokenizeDataset(holdout_sentences, holdout_labels)

Tokenizing data [3118/3118]...: 100%|[32m█████████████████████████████████████████████[0m| 3118/3118 [00:03<00:00, 907.80it/s][0m


In [23]:
model = BertModel.from_pretrained(
    'D:/Training/Machine Learning/Datasets/bert-base-uncased', 
#     '/home/jupyter-23521059/bert-base-uncased',
    num_labels=NUM_LABELS,
    output_hidden_states = True, # Whether the model returns all hidden-states,
    )

Some weights of the model checkpoint at D:/Training/Machine Learning/Datasets/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# # Put model to GPU
# model.to(device)

Extract the embedding weight

In [25]:
class TokenizedData(Dataset):
    def __init__(self, input_ids, attention_mask, token_type_ids, labels):
        self.input_ids = input_ids
        self.att_mask = attention_mask
        self.tti = token_type_ids
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.att_mask[idx],
            "token_type_ids": self.tti[idx],
            "label": self.labels[idx]
        }

In [26]:
datas_train_tok = TokenizedData(tokenized_training['input_ids'], tokenized_training['attention_mask'], tokenized_training['token_type_ids'], tokenized_training['label'])

In [27]:
datas_valid_tok = TokenizedData(tokenized_validation['input_ids'], tokenized_validation['attention_mask'], tokenized_validation['token_type_ids'], tokenized_validation['label'])

In [28]:
datas_holdout_tok = TokenizedData(tokenized_holdout['input_ids'], tokenized_holdout['attention_mask'], tokenized_holdout['token_type_ids'], tokenized_holdout['label'])

In [29]:
dataloader_training_tok = DataLoader(
    datas_train_tok,
    batch_size = BATCH
)
dataloader_valid_tok = DataLoader(
    datas_valid_tok,
    batch_size = BATCH
)
dataloader_holdout_tok = DataLoader(
    datas_holdout_tok,
    batch_size = BATCH
)

In [30]:
# model.eval()
# with torch.no_grad():
#     outputs = model(
#         input_ids = tokenized_training['input_ids'][0],
#         attention_mask = tokenized_training['attention_mask'][0],
#         token_type_ids = tokenized_training['token_type_ids'][0]
#     )
    
#     last_hidden_state = outputs[0]
#     hidden_states = outputs[2]
#     initial_embedding = hidden_states[0] # initial embedding
#     word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
    
#     print("last hidden state:", last_hidden_state.size())
#     print("hidden states:", len(hidden_states))
#     print("initial embedding:", initial_embedding.size())
#     print("last 4 layers:", word_embed_4_last_layers.size())

In [31]:
def ExtractEmbedding(the_model, datas, total_data):
    the_model.eval() # put model to evaluation mode
    
    bert_embedding_sv = []
    bert_embedding_label = []
    
    with torch.no_grad():
        t = tqdm(enumerate(datas), colour="green", position=0, leave=True, total=len(datas))
        i = 0
        for batch, data in t:
            for idx in range(len(data["input_ids"])):
                in_ids = data["input_ids"][idx]
                att_mask = data["attention_mask"][idx]
                tok_type = data["token_type_ids"][idx]

                output = the_model(
                    input_ids = in_ids,
                    attention_mask = att_mask,
                    token_type_ids = tok_type
                )
                
                hidden_states = output[2]
                word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
#                 print("last 4 layers:", word_embed_4_last_layers.size())
#                 print("label:", data['label'][idx])
                
                bert_embedding_sv.append(word_embed_4_last_layers)
                bert_embedding_label.append(data['label'][idx])
                
                t.set_description(f"Extracting embedding weight [{i+1}/{total_data}] ")
                t.refresh()
                
                i += 1
        return bert_embedding_sv, bert_embedding_label

In [32]:
training_embeddings, training_embd_labels = ExtractEmbedding(model, dataloader_training_tok, len(tokenized_training['input_ids']))

Extracting embedding weight [9353/9353] : 100%|[32m██████████████████████████████████████[0m| 147/147 [35:43<00:00, 14.58s/it][0m


In [33]:
validation_embeddings, valid_embd_labels = ExtractEmbedding(model, dataloader_valid_tok, len(tokenized_validation['input_ids']))

Extracting embedding weight [3118/3118] : 100%|[32m████████████████████████████████████████[0m| 49/49 [12:21<00:00, 15.12s/it][0m


In [34]:
holdout_embeddings, holdout_embd_labels = ExtractEmbedding(model, dataloader_holdout_tok, len(tokenized_holdout['input_ids']))

Extracting embedding weight [3118/3118] : 100%|[32m████████████████████████████████████████[0m| 49/49 [12:30<00:00, 15.32s/it][0m


In [35]:
print(f"Panjang embedding: {len(training_embeddings)}, Panjang label: {len(training_embd_labels)}")

Panjang embedding: 9353, Panjang label: 9353


In [36]:
training_embeddings[0].size()

torch.Size([1, 128, 768])

## Proposed Model: BiLSTM + CRF

### Model Declaration

In [37]:
words = []
for sentence in reviews:
    for word in sentence:
        words.append(word)

words = list(set(words))
VOCAB_LEN = len(words)
print(VOCAB_LEN)

241


In [38]:
class ProposedModel1(torch.nn.Module):
    def __init__(self, lstm_in_size, lstm_hdn_size, lstm_layers, lstm_dropout = 0.2):
        super(ProposedModel1, self).__init__()
        
        self.lstm_layers = lstm_layers
        self.lstm_hdn_size = lstm_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(lstm_in_size)
        self.lstm = torch.nn.LSTM(
            input_size = lstm_in_size,
            hidden_size = lstm_hdn_size//2,
            num_layers = lstm_layers,
            bidirectional = True,
            dropout = lstm_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = lstm_hdn_size,
            out_features = 768 # number of features inside hidden layer
        )
        self.fc2 = torch.nn.Linear(
            in_features = 768,
            out_features = 2 # number of classes (binary)
        )
    
    def forward(self, x):
        x_perm = x.permute(0, 2, 1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0, 2, 1)

        # Init the hidden state
        h0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        h0 = h0.to(device)
      
        # Init the cell state
        c0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        c0 = c0.to(device)

        h, _ = self.lstm(b_perm, (h0.detach(), c0.detach()))
        y1 = self.fc1(h)
        output = self.fc2(y1)
        
        return output

In [39]:
class ProposedModel2(torch.nn.Module):
    def __init__(self, gru_in_size, gru_hdn_size, gru_layers, gru_dropout):
        super(ProposedModel2, self).__init__()
        
        self.gru_layers = gru_layers
        self.gru_hdn_size = gru_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(gru_in_size)
        self.gru = torch.nn.GRU(
            input_size = gru_in_size,
            hidden_size = gru_hdn_size//2,
            num_layers = gru_layers,
            bidirectional = True,
            dropout = gru_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = gru_hdn_size,
            out_features = 768 # number of features inside hidden layer
        )
        self.fc2 = torch.nn.Linear(
            in_features = 768,
            out_features = 2 # number of classes (binary)
        )
    
    def forward(self, x):
        x_perm = x.permute(0,2,1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0,2,1)
        
        # Init the hidden state
        h0 = torch.zeros(2*self.gru_layers, x.size(0), self.gru_hdn_size//2).requires_grad_()
        h0 = h0.to(device)

        h, _ = self.gru(b_perm, h0.detach())
        y1 = self.fc1(h)
        output = self.fc2(y1)
        
        return output

In [40]:
proposed_model1 = ProposedModel1(
    lstm_in_size = 768,
    lstm_hdn_size = 1024,
    lstm_layers = 2,
    lstm_dropout = 0.45
)
proposed_model1 = proposed_model1.to(device)
print(proposed_model1)

ProposedModel1(
  (batchnorm): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(768, 512, num_layers=4, batch_first=True, dropout=0.25, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=768, bias=True)
  (fc2): Linear(in_features=768, out_features=2, bias=True)
)


In [41]:
proposed_model2 = ProposedModel2(
    gru_in_size = 768,
    gru_hdn_size = 1024,
    gru_layers = 2,
    gru_dropout = 0.35
)
proposed_model2 = proposed_model2.to(device)
print(proposed_model2)

ProposedModel2(
  (batchnorm): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gru): GRU(768, 512, num_layers=4, batch_first=True, dropout=0.25, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=768, bias=True)
  (fc2): Linear(in_features=768, out_features=2, bias=True)
)


In [42]:
# Define loss function
loss_fn1 = torch.nn.BCELoss()

# Define optimizer
opt1 = torch.optim.AdamW(
    proposed_model1.parameters(),
    lr=LEARNING_RATE,
    weight_decay=LAMBDA_L2
)

In [43]:
# Define loss function
loss_fn2 = torch.nn.BCELoss()

# Define optimizer
opt2 = torch.optim.AdamW(
    proposed_model2.parameters(),
    lr=LEARNING_RATE,
    weight_decay=LAMBDA_L2
)

### Training & Validation

#### Create all needed functions to train, validate, and plot

Create dataloader for embedding vector

In [44]:
class EmbeddingDataset(Dataset):
    def __init__(self, arr_embed, arr_lbl):
        super(EmbeddingDataset, self).__init__()
        self.array_embed = arr_embed
        self.array_label = arr_lbl
    def __len__(self):
        return len(self.array_embed)
    def __getitem__(self, idx):
        all_embedding = self.array_embed[idx][0, :, :] # torch.squeeze(self.array_embed[idx])
        cls_embedding = self.array_embed[idx][0, 0, :]
        cls_embedding = cls_embedding[None, :]
        data_pair = {
            "all_embedding": all_embedding, # all embedding data (all of seq length) - size [128, 768]
            "cls_embedding": cls_embedding, # CLS embedding only - size [1, 768]
            "label": self.array_label[idx]
        }
        return data_pair

In [45]:
embed_train_dataset = EmbeddingDataset(training_embeddings, training_embd_labels)
embed_valid_dataset = EmbeddingDataset(validation_embeddings, valid_embd_labels)
embed_holdout_dataset = EmbeddingDataset(holdout_embeddings, holdout_embd_labels)

In [46]:
embed_train_dataloader = DataLoader(
    embed_train_dataset,
    batch_size = BATCH
)
embed_valid_dataloader = DataLoader(
    embed_valid_dataset,
    batch_size = BATCH
)
embed_holdout_dataloader = DataLoader(
    embed_holdout_dataset,
    batch_size = BATCH
)

Function to calculate metrics

In [47]:
def calc_metric(true_labels, predicted_labels):
    total_acc = balanced_accuracy_score(true_labels, predicted_labels)
    total_f1 = f1_score(true_labels, predicted_labels)
    returned_dict = {
        "total_accuracy": total_acc,
        "total_f1_score": total_f1
    }
    return returned_dict

Training function

In [48]:
def train_model(embed_dataloader, the_model, loss_func, optimizer):
    the_model.train()
    
    concat_true_lbl = []
    concat_pred_lbl = []
    
    for batch, datas in enumerate(embed_dataloader):
        embed = datas['all_embedding']
        lbl = datas['label']
        
        embed = embed.to(device)
        lbl = lbl.to(device)
        
        # FF
        output = the_model(embed)
        y_pred = output[:,0,0]
        y_pred = y_pred.type(torch.float)
        y_pred1 = output[:,0,:].to('cpu').detach()
        y_pred2 = np.argmax(y_pred1, axis=1).to(device).float().requires_grad_()
        
        # calculate loss
        lbl_float = lbl.float()
        loss = loss_func(y_pred2, lbl_float)
        
        # calculate metrics
        ground_truth = lbl_float.to('cpu').detach()
        ground_truth = ground_truth.int()
        predicted_lbl = y_pred2.to('cpu').detach()
        predicted_lbl = predicted_lbl.int()
        
        concat_true_lbl += ground_truth
        concat_pred_lbl += predicted_lbl
        
        # backpro
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # calculate metrics
    calculated_metric = calc_metric(concat_true_lbl, concat_pred_lbl)
    
    returned_loss = loss.item()
    returned_loss /= BATCH
    
    return calculated_metric, returned_loss

Inference function

In [49]:
def inference(embed_dataloader, the_model, loss_func):
    the_model.eval()
    
    concat_true_lbl = []
    concat_pred_lbl = []
    with torch.no_grad():
        for batch, datas in enumerate(embed_dataloader):
            embed = datas['all_embedding']
            lbl = datas['label']

            embed = embed.to(device)
            lbl = lbl.to(device)

            # FF
            output = the_model(embed)
            y_pred = output[:,0,0]
            y_pred = y_pred.type(torch.float)
            y_pred1 = output[:,0,:].to('cpu').detach()
            y_pred2 = np.argmax(y_pred1, axis=1).to(device).float().requires_grad_()

            # calculate loss
            lbl_float = lbl.float()
            loss = loss_func(y_pred2, lbl_float)

            ground_truth = lbl_float.to('cpu').detach()
            predicted_lbl = y_pred2.to('cpu').detach()
            
            concat_true_lbl += ground_truth
            concat_pred_lbl += predicted_lbl
            
    # calculate metrics
    calculated_metric = calc_metric(concat_true_lbl, concat_pred_lbl)
    
    returned_loss = loss.item() 
    returned_loss /= BATCH
    
    return calculated_metric, returned_loss

In [50]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [51]:
def training_sequence(epoch, training_device, proposed_model, training_dataloader, validation_dataloader, loss_fn, opt, saved_model_path, saved_model_name = "model.pt", use_early_stopping = False, patience = 3, min_delta = 10):
    proposed_model = proposed_model.to(training_device)
    
    # Make tqdm progress bar
    t = trange(epoch, position=0, leave=True, colour="green")
    
    history_chart = {
        "train_accuracy": [],
        "train_f1": [],
        "train_loss": [],
        "valid_accuracy": [],
        "valid_f1": [],
        "valid_loss": []
    }
    
    early_stopper = EarlyStopper(patience=patience, min_delta=min_delta)
    
    for ep in t:
        # Train the model
        train_metric, train_loss = train_model(training_dataloader, proposed_model, loss_fn, opt)
        
        # Measure loss and accuracy
        valid_metric, valid_loss = inference(validation_dataloader, proposed_model, loss_fn)

        t.set_description(f"Train loss: {train_loss:>.4f} train acc: {train_metric['total_accuracy']:>.2f}, val loss: {valid_loss:>.4f} val acc: {valid_metric['total_accuracy']:>.2f}")

        # Add to history to be plotted
        history_chart["train_accuracy"].append(train_metric['total_accuracy'])
        history_chart["train_f1"].append(train_metric["total_f1_score"])
        history_chart["train_loss"].append(train_loss)
        
        history_chart["valid_accuracy"].append(valid_metric['total_accuracy'])
        history_chart["valid_f1"].append(valid_metric["total_f1_score"])
        history_chart["valid_loss"].append(valid_loss)
        
        # Save model
        if(valid_metric["total_f1_score"] > max(history_chart["valid_f1"])):
            if saved_model_path[-1] == "/":
                os.makedirs(saved_model_path[:-1], exist_ok=True)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name + ".pt")
            else:
                os.makedirs(saved_model_path, exist_ok=True)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name + ".pt")
        elif(len(history_chart["valid_f1"]) == 1):
            if saved_model_path[-1] == "/":
                os.makedirs(saved_model_path[:-1], exist_ok=True)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name + ".pt")
            else:
                os.makedirs(saved_model_path, exist_ok=True)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name + ".pt")

        if early_stopper.early_stop(valid_metric["total_f1_score"]) and use_early_stopping == True:
            break
    return history_chart

Create a function to plot

In [52]:
def lets_plot(training_value, validation_value, y_caption, title, background_color='black'):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1) # nrows, ncols, index
    ax.set_facecolor(background_color)
    plt.plot(training_value, color='red', label='Train')
    if validation_value != None:
        plt.plot(validation_value, color='yellow', label='Valid')
        plt.legend(['Train', 'Valid'], loc='upper right')
    else:
        plt.legend(['Train'], loc='upper right')
    plt.title(title)
    
    plt.xlabel('Epochs')
    plt.ylabel(y_caption)
    plt.grid(color='white', linestyle='--', linewidth=0.5)
    plt.show

#### Train and Validate

In [53]:
history_with_lstm = training_sequence(EPOCHS, device, proposed_model1, embed_train_dataloader, embed_valid_dataloader, loss_fn1, opt1, MODEL_PATH + "/bilstm/", "model_bilstm.pt", True, 2, 3)

Train loss: 0.3472 train acc: 0.50, val loss: 0.2378 val acc: 0.50:  76%|[32m███████▋  [0m| 191/250 [1:15:41<23:22, 23.78s/it][0m


KeyboardInterrupt: 

In [None]:
history_with_gru = training_sequence(EPOCHS, device, proposed_model2, embed_train_dataloader, embed_valid_dataloader, loss_fn2, opt2, MODEL_PATH + "/bigru/", "model_bigru.pt", True, 2, 3)

#### Test

In [None]:
test_bilstm_metric, test_bilstm_loss = inference(embed_holdout_dataloader, proposed_model1, loss_fn1)
print(f"Test accuracy with BiLSTM: {test_bilstm_metric['total_accuracy']} - Test F1 Score with BiLSTM: {test_bilstm_metric['total_f1_score']}")

In [None]:
test_gru_metric, test_gru_loss = inference(embed_holdout_dataloader, proposed_model2, loss_fn2)
print(f"Test accuracy with BiGRU: {test_gru_metric['total_accuracy']} - Test F1 Score with BiGRU: {test_gru_metric['total_f1_score']}")

#### Plot Training & Validation

Plot BiLSTM model performance

In [None]:
lets_plot(history_with_lstm["train_accuracy"], history_with_lstm["valid_accuracy"], "", "Accuracy with BiLSTM")

In [None]:
lets_plot(history_with_lstm["train_f1"], history_with_lstm["valid_f1"], "", "F1 Score with BiLSTM")

In [None]:
lets_plot(history_with_lstm["train_loss"], history_with_lstm["valid_loss"], "", "Loss with BiLSTM")

Plot BiGRU model performance

In [None]:
lets_plot(history_with_gru["train_accuracy"], history_with_gru["valid_accuracy"], "", "Accuracy with BiGRU")

In [None]:
lets_plot(history_with_gru["train_f1"], history_with_gru["valid_f1"], "", "F1 Score with BiGRU")

In [None]:
lets_plot(history_with_gru["train_loss"], history_with_gru["valid_loss"], "", "Loss with BiGRU")

## Error Analysis