## Installing & importing needed libraries

In [4]:
# ! pip install --upgrade pip
# ! pip install transformers
# ! pip install nltk
# ! pip install --upgrade gdown
# ! pip install --upgrade tqdm
# ! pip install pytorch-crf
# ! pip install torch torchvision torchaudio

Collecting pip
  Using cached pip-23.1.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-23.1.1
[0mCollecting transformers
  Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-23.1-py3-none-any.whl (48 kB)
Installing collected packages: tokenizers, packaging, huggingface-hub, transformers
  Attempting uninst

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re
import os
import numpy as np
import pandas as pd
from IPython import display
import time

import torch
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

from transformers import BertTokenizer, TFBertForSequenceClassification, BertModel
from transformers import InputExample, InputFeatures

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score

import matplotlib.pyplot as plt
import gdown
from tqdm import tqdm, trange
import time

from sagemaker import get_execution_role
# AWS Python SDK
import boto3

In [6]:
# When running on SageMaker, need execution role
role = get_execution_role()

In [7]:
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [8]:
# nltk.download('all-corpora')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading pack

True

Downloading dataset

In [8]:
# uri = "https://drive.google.com/uc?id=1fJujYj3rBuh34FukQa2bG7lhefcYNNwX"
# output = "/home/sagemaker-user/datasets/dataset_steam_review.csv"
# if not os.path.exists("/home/sagemaker-user/datasets/"):
#   os.makedirs("/home/sagemaker-user/datasets/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1fJujYj3rBuh34FukQa2bG7lhefcYNNwX
From (redirected): https://drive.google.com/uc?id=1fJujYj3rBuh34FukQa2bG7lhefcYNNwX&confirm=t&uuid=07272596-7975-4e65-aa24-7be373bdfddf
To: /home/sagemaker-user/datasets/dataset_steam_review.csv
100%|██████████| 2.16G/2.16G [00:20<00:00, 106MB/s] 


'/home/sagemaker-user/datasets/dataset_steam_review.csv'

In [13]:
my_s3_bucket = "machinelearning-research"
dataset_source = "datasets/dataset_steam_review/dataset.csv"
output = "/home/sagemaker-user/datasets/dataset_steam_review.csv"

if not os.path.exists("/home/sagemaker-user/datasets/"):
  os.makedirs("/home/sagemaker-user/datasets/")

s3 = boto3.client("s3" ,
                 aws_access_key_id="AKIA5ZBI6IRP3CAH6P5X",
                 aws_secret_access_key="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"
                 )

s3.download_file(
    Bucket=my_s3_bucket, Key=dataset_source, Filename=output
)

## Preprocessing Data

In [14]:
df_steam_reviews = pd.read_csv(output)
# df_steam_reviews = pd.read_csv("D:/Training/Machine Learning/Datasets/dataset_steam_review/dataset.csv")
df_steam_reviews = df_steam_reviews.sample(frac=1).reset_index(drop=True) #shuffle the data
df_steam_reviews.shape

(6417106, 5)

Remove early access reviews

In [15]:
# remove Early Access Reviews
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != 'Early Access Review']
# size of dataframe
df_steam_reviews.shape

(5392419, 5)

In [16]:
# Preprocessing

def remove_links(x):
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r"https\S+", "", x)
    x = re.sub(r"www.\S+", "", x)
    x = re.sub(".*\..*\..*", "", x)
    return x

def remove_hashtag(x):
    x = re.sub("@[A-Za-z0-9_]+","", x)
    x = re.sub("#[A-Za-z0-9_]+","", x)
    return x

def remove_punct(x):
    x = re.sub(r"[()!?:;,.'-]","", x)
    return x

def remove_emoji(x):
    x = x.replace(":)", "")
    x = x.replace(":-)", "")
    x = x.replace(":(", "")
    x = x.replace("-_-", "")
    x = x.replace(";)", "")
    x = x.replace(";-)", "")
    # REFERENCE FOR EMOJI_PATTERN: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    x = emoji_pattern.sub(r'', x)
    return x

def remove_titiktitik(x):
    x = x.replace("..", "")
    x = x.replace("...", "")
    x = x.replace("....", "")
    x = x.replace(".....", "")
    x = x.replace("...................", "")
    return x

def remove_money(x):
    x = re.sub("€", "", x)
    x = re.sub("$", "", x)
    x = x.replace("usd", "")
    return x

def fix_typo(x):
    x = x.replace("veru", "very")
    x = x.replace("gud", "good")
    x = x.replace("gut", "good")
    x = x.replace("withouth", "without")
    x = x.replace("noob", "newbie")
    x = x.replace("dis", "this")
    x = x.replace("noobs", "newbie")
    x = x.replace("nice1", "nice")
    x = x.replace("4ever", "forever")
    x = x.replace("w0n", "won")
    x = re.sub("&lt;3", "", x)
    x = x.replace("graficks", "graphics")
    x = x.replace("dissapeared", "disappeared")
    x = x.replace("yr", "year")
    x = x.replace("yrs", "years")
    x = x.replace("dosent", "doesnt")
    x = x.replace("awsume","awesome")
    x = re.sub("&lt3","",x)
    x = x.replace("compatative", "competitive")
    x = x.replace("cyyounterstrikesyyource", "counter strike source")
    x = x.replace("&amp","and")
    x = x.replace("yyoure","your")
    x = x.replace("cyyounter","counter")
    x = x.replace("child hood", "childhood")
    return x

In [17]:
# convert review text to string
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].astype(str)
# convert to lowercase
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(str.lower)

# drop the reviews with null score
df_steam_reviews = df_steam_reviews[df_steam_reviews["review_score"].notnull()]
df_steam_reviews["review_score"] = np.where(df_steam_reviews["review_score"]==-1, 0, df_steam_reviews["review_score"])

# remove links
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_links)

# remove hashtag
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_hashtag)

# removing punctuation
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_punct)

# removing dots
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_titiktitik)

# removing emoji
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_emoji)

# remove money symbols
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_money)

# fix any typo
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(fix_typo)

# remove any stopwords
stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# lemmatize
lemmatizer = WordNetLemmatizer()
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split()))

#remove empty text
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != '']

# distribution of negative and positive reviews
df_steam_reviews["review_score"].value_counts()

1    1965881
0     300832
Name: review_score, dtype: int64

In [18]:
# Most reviewed game
df2 = df_steam_reviews.groupby(['app_id'])['app_id'].count().reset_index(name='count').sort_values(['count'], ascending=False)
df3 = df2['app_id'].values.tolist()[0:10]

print(df2.iloc[:10])
# print(df3)

for id in df3:
    df_gn = df_steam_reviews.loc[df_steam_reviews['app_id'].isin([id])]['app_name'].unique()
    print(df_gn)

      app_id  count
22       570  48575
1565  218620  47112
1181  105600  46821
21       550  31270
2156  252950  30497
1739  230410  25308
5872  391540  21905
2582  271590  21651
23       620  21038
132     4000  17596
['Dota 2']
['PAYDAY 2']
['Terraria']
['Left 4 Dead 2']
['Rocket League']
['Warframe']
['Undertale']
['Grand Theft Auto V']
['Portal 2']
["Garry's Mod"]


In [19]:
reviewed_steam = df_steam_reviews.loc[df_steam_reviews["app_id"].isin(df3)] # take 10 most reviewed game only

# sampling the data
p = 0.75
reviewed_steam = reviewed_steam.sample(frac = p).reset_index(drop=True) # take 75% of data and shuffle it

Balancing data

In [20]:
# # take the positive as many as the negative ones
total_data = len(reviewed_steam["review_score"])
total_data_positive = len(reviewed_steam[reviewed_steam["review_score"] == 1])
total_data_negative = len(reviewed_steam[reviewed_steam["review_score"] == 0])

print("Total data cleaned:", total_data)
print("Total data positive:", total_data_positive)
print("Total data negative:", total_data_negative)

# df_steam_reviews_balanced_positive = reviewed_steam[reviewed_steam["review_score"] == 1].sample(n = total_data_negative) 
# df_steam_reviews_balanced_negative = reviewed_steam[reviewed_steam["review_score"] == 0] 
# df_steam_reviews_balanced = pd.concat([df_steam_reviews_balanced_positive, df_steam_reviews_balanced_negative])
# df_steam_reviews_balanced = df_steam_reviews_balanced.sample(frac = 1).reset_index(drop=True) #shuffle the data again
# df_steam_reviews_balanced.head()

# print("Data Balanced with ratio 1:1...")
# print(f"Jumlah data positive : jumlah data negative = {len(df_steam_reviews_balanced_positive)} : {total_data_negative}")

Total data cleaned: 233830
Total data positive: 210602
Total data negative: 23228


In [21]:
# reviews = df_steam_reviews_balanced["review_text"].values.tolist()
# labels = df_steam_reviews_balanced["review_score"].tolist()
reviews = reviewed_steam["review_text"].values.tolist()
labels = reviewed_steam["review_score"].tolist()

In [22]:
print(len(reviews))

233830


Preparing data into train/valid/test split

In [23]:
# split the dataset into train, validation and holdout sets (60-20-20)
training_sentences, test_sentences, training_labels, test_labels = train_test_split(reviews, labels, test_size=.4)

validation_sentences, holdout_sentences, validation_labels, holdout_labels = train_test_split(test_sentences, test_labels, test_size=.5)

## Vector Extraction using BERT

In [24]:
MAX_LEN = 128
NUM_LABELS = 2
BATCH = 64
DEVICE_USED = "cuda:0"
LEARNING_RATE = 1e-6
LAMBDA_L2 = 1e-6
EPOCHS = 50
MODEL_PATH = "/home/sagemaker-user/Saved Models/Sentiment Analysis/proposed_model/"

In [25]:
device = torch.device(DEVICE_USED if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [26]:
# Load pre-trained model tokenizer (vocabulary)
# tokenizer = BertTokenizer.from_pretrained('D:/Training/Machine Learning/Datasets/bert-base-uncased', model_max_length=MAX_LEN)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', model_max_length=MAX_LEN)

In [27]:
def TokenizeDataset(x_data, y_data):
    if len(x_data) != len(y_data):
        raise Exception("x_data and y_data size are different!")
    
    t = trange(len(x_data), colour="green", position=0, leave=True)
    
    out_padded_token_list = []
    out_att_mask = []
    out_tok_type_id = []
    out_target = []
    
    for sentence_idx in t:
        t.set_description(f"Tokenizing data [{sentence_idx + 1}/{len(x_data)}]...")
        encoded_sentence = tokenizer.encode_plus(
            x_data[sentence_idx],
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation = 'longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        padded_token_list = encoded_sentence['input_ids']
        att_mask = encoded_sentence['attention_mask']
        tok_type_id = encoded_sentence['token_type_ids']
        target = torch.tensor(y_data[sentence_idx])
        
        out_padded_token_list.append(padded_token_list)
        out_att_mask.append(att_mask)
        out_tok_type_id.append(tok_type_id)
        out_target.append(target)
        
    output_data = {
        "input_ids": out_padded_token_list,
        "attention_mask": out_att_mask,
        "token_type_ids": out_tok_type_id,
        "label": out_target
    }

    return output_data

In [28]:
tokenized_training = TokenizeDataset(training_sentences, training_labels)

Tokenizing data [140298/140298]...: 100%|[32m██████████[0m| 140298/140298 [03:59<00:00, 585.59it/s]


In [29]:
tokenized_validation = TokenizeDataset(validation_sentences, validation_labels)

Tokenizing data [46766/46766]...: 100%|[32m██████████[0m| 46766/46766 [01:19<00:00, 591.09it/s]


In [30]:
tokenized_holdout = TokenizeDataset(holdout_sentences, holdout_labels)

Tokenizing data [46766/46766]...: 100%|[32m██████████[0m| 46766/46766 [01:19<00:00, 590.70it/s]


In [31]:
model = BertModel.from_pretrained(
    # 'D:/Training/Machine Learning/Datasets/bert-base-uncased', 
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    output_hidden_states = True, # Whether the model returns all hidden-states,
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# # Put model to GPU
# model.to(device)

Extract the embedding weight

In [33]:
class TokenizedData(Dataset):
    def __init__(self, input_ids, attention_mask, token_type_ids, labels):
        self.input_ids = input_ids
        self.att_mask = attention_mask
        self.tti = token_type_ids
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.att_mask[idx],
            "token_type_ids": self.tti[idx],
            "label": self.labels[idx]
        }

In [34]:
datas_train_tok = TokenizedData(tokenized_training['input_ids'], tokenized_training['attention_mask'], tokenized_training['token_type_ids'], tokenized_training['label'])

In [35]:
datas_valid_tok = TokenizedData(tokenized_validation['input_ids'], tokenized_validation['attention_mask'], tokenized_validation['token_type_ids'], tokenized_validation['label'])

In [36]:
datas_holdout_tok = TokenizedData(tokenized_holdout['input_ids'], tokenized_holdout['attention_mask'], tokenized_holdout['token_type_ids'], tokenized_holdout['label'])

In [37]:
dataloader_training_tok = DataLoader(
    datas_train_tok,
    batch_size = BATCH
)
dataloader_valid_tok = DataLoader(
    datas_valid_tok,
    batch_size = BATCH
)
dataloader_holdout_tok = DataLoader(
    datas_holdout_tok,
    batch_size = BATCH
)

In [34]:
# model.eval()
# with torch.no_grad():
#     outputs = model(
#         input_ids = tokenized_training['input_ids'][0],
#         attention_mask = tokenized_training['attention_mask'][0],
#         token_type_ids = tokenized_training['token_type_ids'][0]
#     )
    
#     last_hidden_state = outputs[0]
#     hidden_states = outputs[2]
#     initial_embedding = hidden_states[0] # initial embedding
#     word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
    
#     print("last hidden state:", last_hidden_state.size())
#     print("hidden states:", len(hidden_states))
#     print("initial embedding:", initial_embedding.size())
#     print("last 4 layers:", word_embed_4_last_layers.size())

In [38]:
def ExtractEmbedding(the_model, datas, total_data):
    the_model.eval() # put model to evaluation mode
    
    bert_embedding_sv = []
    bert_embedding_label = []
    
    with torch.no_grad():
        t = tqdm(enumerate(datas), colour="green", position=0, leave=True, total=len(datas))
        i = 0
        for batch, data in t:
            for idx in range(len(data["input_ids"])):
                in_ids = data["input_ids"][idx]
                att_mask = data["attention_mask"][idx]
                tok_type = data["token_type_ids"][idx]

                output = the_model(
                    input_ids = in_ids,
                    attention_mask = att_mask,
                    token_type_ids = tok_type
                )
                
                hidden_states = output[2]
                word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
#                 print("last 4 layers:", word_embed_4_last_layers.size())
#                 print("label:", data['label'][idx])
                
                bert_embedding_sv.append(word_embed_4_last_layers)
                bert_embedding_label.append(data['label'][idx])
                
                t.set_description(f"Extracting embedding weight [{i+1}/{total_data}] ")
                t.refresh()
                
                i += 1
        return bert_embedding_sv, bert_embedding_label

In [None]:
training_embeddings, training_embd_labels = ExtractEmbedding(model, dataloader_training_tok, len(tokenized_training['input_ids']))

Extracting embedding weight [87000/140298] :  62%|[32m██████▏   [0m| 1359/2193 [3:09:23<1:55:29,  8.31s/it]

In [None]:
validation_embeddings, valid_embd_labels = ExtractEmbedding(model, dataloader_valid_tok, len(tokenized_validation['input_ids']))

In [None]:
holdout_embeddings, holdout_embd_labels = ExtractEmbedding(model, dataloader_holdout_tok, len(tokenized_holdout['input_ids']))

In [None]:
print(f"Panjang embedding: {len(training_embeddings)}, Panjang label: {len(training_embd_labels)}")

## Proposed Model: BiLSTM + CRF

### Model Declaration

In [None]:
words = []
for sentence in reviews:
    for word in sentence:
        words.append(word)

words = list(set(words))
VOCAB_LEN = len(words)
print(VOCAB_LEN)

In [None]:
class ProposedModel1(torch.nn.Module):
    def __init__(self, lstm_in_size, lstm_hdn_size, lstm_layers, lstm_dropout = 0.2):
        super(ProposedModel1, self).__init__()
        
        self.lstm_layers = lstm_layers
        self.lstm_hdn_size = lstm_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(lstm_in_size)
        self.lstm = torch.nn.LSTM(
            input_size = lstm_in_size,
            hidden_size = lstm_hdn_size//2,
            num_layers = lstm_layers,
            bidirectional = True,
            dropout = lstm_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = lstm_hdn_size,
            out_features = 2 # number of output classes
        )
        self.softmax = torch.nn.Softmax(dim=2)
    
    def forward(self, x):
        x_perm = x.permute(0, 2, 1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0, 2, 1)

        # Init the hidden state
        h0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        h0 = h0.to(device)
      
        # Init the cell state
        c0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        c0 = c0.to(device)

        h, _ = self.lstm(b_perm, (h0.detach(), c0.detach()))
        y1 = self.fc1(h)
        output = self.softmax(y1)
        
        return output

In [None]:
class ProposedModel2(torch.nn.Module):
    def __init__(self, gru_in_size, gru_hdn_size, gru_layers, gru_dropout):
        super(ProposedModel2, self).__init__()
        
        self.gru_layers = gru_layers
        self.gru_hdn_size = gru_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(gru_in_size)
        self.gru = torch.nn.GRU(
            input_size = gru_in_size,
            hidden_size = gru_hdn_size//2,
            num_layers = gru_layers,
            bidirectional = True,
            dropout = gru_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = gru_hdn_size,
            out_features = 2 # number of output classes
        )
        self.softmax = torch.nn.Softmax(dim=2)
            
    
    def forward(self, x):
        x_perm = x.permute(0,2,1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0,2,1)
        
        # Init the hidden state
        h0 = torch.zeros(2*self.gru_layers, x.size(0), self.gru_hdn_size//2).requires_grad_()
        h0 = h0.to(device)

        h, _ = self.gru(b_perm, h0.detach())
        y1 = self.fc1(h)
        output = self.softmax(y1)
        
        return output

In [None]:
proposed_model1 = ProposedModel1(
    lstm_in_size = 768,
    lstm_hdn_size = 512,
    lstm_layers = 2,
    lstm_dropout = 0.2
)
proposed_model1 = proposed_model1.to(device)
print(proposed_model1)

In [None]:
proposed_model2 = ProposedModel2(
    gru_in_size = 768,
    gru_hdn_size = 512,
    gru_layers = 2,
    gru_dropout = 0.22
)
proposed_model2 = proposed_model2.to(device)
print(proposed_model2)

In [None]:
# Define loss function
loss_fn1 = torch.nn.BCELoss()

# Define optimizer
opt1 = torch.optim.AdamW(
    proposed_model1.parameters(),
    lr=LEARNING_RATE,
    weight_decay=LAMBDA_L2
)

In [None]:
# Define loss function
loss_fn2 = torch.nn.BCELoss()

# Define optimizer
opt2 = torch.optim.AdamW(
    proposed_model2.parameters(),
    lr=LEARNING_RATE,
    weight_decay=LAMBDA_L2
)

### Training & Validation

#### Create all needed functions to train, validate, and plot

Create dataloader for embedding vector

In [None]:
class EmbeddingDataset(Dataset):
    def __init__(self, arr_embed, arr_lbl):
        super(EmbeddingDataset, self).__init__()
        self.array_embed = arr_embed
        self.array_label = arr_lbl
    def __len__(self):
        return len(self.array_embed)
    def __getitem__(self, idx):
        all_embedding = self.array_embed[idx][0, :, :] # torch.squeeze(self.array_embed[idx])
        cls_embedding = self.array_embed[idx][0, 0, :]
        cls_embedding = cls_embedding[None, :]
        data_pair = {
            "all_embedding": all_embedding, # all embedding data (all of seq length) - size [128, 768]
            "cls_embedding": cls_embedding, # CLS embedding only - size [1, 768]
            "label": self.array_label[idx]
        }
        return data_pair

In [None]:
embed_train_dataset = EmbeddingDataset(training_embeddings, training_embd_labels)
embed_valid_dataset = EmbeddingDataset(validation_embeddings, valid_embd_labels)
embed_holdout_dataset = EmbeddingDataset(holdout_embeddings, holdout_embd_labels)

In [None]:
embed_holdoutval_dataset = EmbeddingDataset(training_embeddings + validation_embeddings, training_embd_labels + valid_embd_labels)

In [None]:
embed_train_dataloader = DataLoader(
    embed_train_dataset,
    batch_size = BATCH
)
embed_valid_dataloader = DataLoader(
    embed_valid_dataset,
    batch_size = BATCH
)
embed_holdout_dataloader = DataLoader(
    embed_holdout_dataset,
    batch_size = BATCH
)

In [None]:
embed_holdoutval_dataloader = DataLoader(
    embed_holdoutval_dataset,
    batch_size = BATCH
)

Function to calculate metrics

In [None]:
def calc_metric(true_labels, predicted_labels):
    # total_acc = balanced_accuracy_score(true_labels, predicted_labels)
    total_f1 = f1_score(true_labels, predicted_labels)
    returned_dict = {
        "total_f1_score": total_f1
    }
    return returned_dict

Training function

In [None]:
def train_model(embed_dataloader, the_model, loss_func, optimizer):
    the_model.train()
    
    concat_true_lbl = []
    concat_pred_lbl = []
    
    for batch, datas in enumerate(embed_dataloader):
        embed = datas['all_embedding']
        lbl = datas['label']
        
        embed = embed.to(device)
        lbl = lbl.to(device)
        
        # FF
        output = the_model(embed)
        y_pred = output[:,0,:].to('cpu').detach()
        y_pred1 = np.argmax(y_pred, axis=1).to(device).float().requires_grad_()
        
        # calculate loss
        lbl_float = lbl.float()
        loss = loss_func(y_pred1, lbl_float)
        
        # calculate metrics
        ground_truth = lbl_float.to('cpu').detach()
        ground_truth = ground_truth.int()
        predicted_lbl = y_pred1.to('cpu').detach()
        predicted_lbl = predicted_lbl.int()
        
        concat_true_lbl += ground_truth
        concat_pred_lbl += predicted_lbl
        
        # backpro
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # calculate metrics
    calculated_metric = calc_metric(concat_true_lbl, concat_pred_lbl)
    
    returned_loss = loss.item()
    returned_loss /= BATCH
    
    return calculated_metric, returned_loss

Inference function

In [None]:
def inference(embed_dataloader, the_model, loss_func):
    the_model.eval()
    
    concat_true_lbl = []
    concat_pred_lbl = []
    with torch.no_grad():
        for batch, datas in enumerate(embed_dataloader):
            embed = datas['all_embedding']
            lbl = datas['label']

            embed = embed.to(device)
            lbl = lbl.to(device)

            # FF
            output = the_model(embed)
            y_pred = output[:,0,:].to('cpu').detach()
            y_pred1 = np.argmax(y_pred, axis=1).to(device).float().requires_grad_()

            # calculate loss
            lbl_float = lbl.float()
            loss = loss_func(y_pred1, lbl_float)

            ground_truth = lbl_float.to('cpu').detach()
            predicted_lbl = y_pred1.to('cpu').detach()
            
            concat_true_lbl += ground_truth
            concat_pred_lbl += predicted_lbl
            
    # calculate metrics
    calculated_metric = calc_metric(concat_true_lbl, concat_pred_lbl)
    
    returned_loss = loss.item() 
    returned_loss /= BATCH
    
    return calculated_metric, returned_loss

In [None]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0, stagnant_patience=20):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.stagnant_patience = stagnant_patience

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        elif validation_loss == self.min_validation_loss:
            self.counter += 1
            if self.counter >= self.stagnant_patience:
                return True
        return False

In [None]:
def training_sequence(
    epoch, 
    training_device, 
    proposed_model, 
    training_dataloader, 
    validation_dataloader, 
    loss_fn, 
    opt, 
    saved_model_path, 
    saved_model_name = "model.pt", 
    use_early_stopping = False, 
    patience = 3, 
    min_delta = 10,
    stagnant_patience=10,
    is_holdout = False
):
    proposed_model = proposed_model.to(training_device)
    
    # Make tqdm progress bar
    t = trange(epoch, position=0, leave=True, colour="green")
    
    history_chart = {
        "train_f1": [],
        "train_loss": [],
        "valid_f1": [],
        "valid_loss": []
    }
    
    early_stopper = EarlyStopper(
        patience=patience, 
        min_delta=min_delta, 
        stagnant_patience=stagnant_patience
    )
    
    for ep in t:
        # Train the model
        train_metric, train_loss = train_model(training_dataloader, proposed_model, loss_fn, opt)
        
        # Measure loss and accuracy
        valid_metric, valid_loss = inference(validation_dataloader, proposed_model, loss_fn)

        t.set_description(f"Train loss: {train_loss:>.4f} train f1 score: {train_metric['total_f1_score']:>.2f}, val loss: {valid_loss:>.4f} val f1 score: {valid_metric['total_f1_score']:>.2f}")

        # Add to history to be plotted
        history_chart["train_f1"].append(train_metric["total_f1_score"])
        history_chart["train_loss"].append(train_loss)
        
        history_chart["valid_f1"].append(valid_metric["total_f1_score"])
        history_chart["valid_loss"].append(valid_loss)
        
        # Save model
        if(valid_metric["total_f1_score"] > max(history_chart["valid_f1"]) and is_holdout == False):
            if saved_model_path[-1] == "/":
                if not os.path.exists(saved_model_path[:-1]):
                    os.makedirs(saved_model_path[:-1])
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name + ".pt")
            else:
                if not os.path.exists(saved_model_path):
                    os.makedirs(saved_model_path)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name + ".pt")
        elif(len(history_chart["valid_f1"]) == 1 and is_holdout == False):
            if saved_model_path[-1] == "/":
                if not os.path.exists(saved_model_path[:-1]):
                    os.makedirs(saved_model_path[:-1])
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + saved_model_name + ".pt")
            else:
                if not os.path.exists(saved_model_path):
                    os.makedirs(saved_model_path)
                if saved_model_name[-3:] == ".pt" or saved_model_name[-4:] == ".pth":
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name)
                else:
                    torch.save(proposed_model.state_dict(), saved_model_path + "/" + saved_model_name + ".pt")

        if early_stopper.early_stop(valid_metric["total_f1_score"]) and use_early_stopping == True:
            break
    return history_chart

Create a function to plot

In [None]:
def lets_plot(training_value, validation_value, y_caption, title, holdout_validation = False, background_color='black'):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1) # nrows, ncols, index
    ax.set_facecolor(background_color)
    plt.plot(training_value, color='red', label='Train')
    if validation_value != None:
        if holdout_validation == False:
            plt.plot(validation_value, color='yellow', label='Valid')
            plt.legend(['Train', 'Valid'], loc='upper right')
        else:
            plt.plot(validation_value, color='yellow', label='Valid')
            plt.legend(['Train+Valid', 'Test'], loc='upper right')
    else:
        plt.legend(['Train'], loc='upper right')
    plt.title(title)
    
    plt.xlabel('Epochs')
    plt.ylabel(y_caption)
    plt.grid(color='white', linestyle='--', linewidth=0.5)
    plt.show

#### Train and Validate

Training for LSTM (training data vs validation data)

In [None]:
history_with_lstm = training_sequence(EPOCHS, 
                                      device, 
                                      proposed_model1, 
                                      embed_train_dataloader, 
                                      embed_valid_dataloader, 
                                      loss_fn1, 
                                      opt1, 
                                      MODEL_PATH + "/bilstm/", "model_bilstm.pt", 
                                      False, 
                                      2, 
                                      0.5,
                                      15,
                                      False
)

Holdout validation for LSTM (training data + validation data vs test data)

In [None]:
history_holdoutval_with_lstm = training_sequence(
    EPOCHS, 
    device, 
    proposed_model1, 
    embed_holdoutval_dataloader, 
    embed_holdout_dataloader, 
    loss_fn1, 
    opt1,
    MODEL_PATH + "/bilstm/", 
    "model_bilstm_holdout.pt", 
    False, 
    2, 
    0.5,
    15,
    True
)

Training for GRU (training data vs validation data)

In [None]:
history_with_gru = training_sequence(
    EPOCHS, 
    device, 
    proposed_model2, 
    embed_train_dataloader, 
    embed_valid_dataloader, 
    loss_fn2, 
    opt2, 
    MODEL_PATH + "/bigru/", 
    "model_bigru.pt", 
    False, 
    2, 
    2,
    15,
    False
)

Holdout validation for GRU (training data + validation data vs test data)

In [None]:
history_holdoutval_with_gru = training_sequence(
    EPOCHS, 
    device, 
    proposed_model2, 
    embed_holdoutval_dataloader, 
    embed_holdout_dataloader, 
    loss_fn2, 
    opt2, 
    MODEL_PATH + "/bigru/", "model_bigru_holdout.pt", 
    False, 
    2, 
    2,
    15,
    True
)

Upload saved models to AWS S3

In [None]:
def UploadModels(model_folder, bucket_name, saved_dir):
    files = os.listdir(model_folder)
    
    if model_folder[-1] == "/":
        model_folder = model_folder[:-1]
    if saved_dir[-1] != "/":
        saved_dir += "/"
    
    t = tqdm(files, colour="green", position=0, leave=True)
    for file_source in t:
        t.set_description(f"Uploading model {file_source} to AWS S3...")
        s3.upload_file(
            Filename = model_folder,
            Bucket = bucket_name,
            Key = saved_dir + file_source
        )

In [None]:
UploadModels(MODEL_PATH + "/bilstm/", my_s3_bucket, "saved-models/Steam-Review-Sentiment-Analysis/bilstm/")

In [None]:
UploadModels(MODEL_PATH + "/bigru/", my_s3_bucket, "saved-models/Steam-Review-Sentiment-Analysis/bigru/")

#### Test

In [None]:
test_bilstm_metric, test_bilstm_loss = inference(embed_holdout_dataloader, proposed_model1, loss_fn1)
print(f"Test F1 score with BiLSTM: {test_bilstm_metric['total_f1_score']} - Test F1 Score with BiLSTM: {test_bilstm_metric['total_f1_score']}")

In [None]:
test_gru_metric, test_gru_loss = inference(embed_holdout_dataloader, proposed_model2, loss_fn2)
print(f"Test F1 score with BiGRU: {test_gru_metric['total_f1_score']} - Test F1 Score with BiGRU: {test_gru_metric['total_f1_score']}")

#### Plot Training & Validation

Plot BiLSTM model performance

In [None]:
lets_plot(history_with_lstm["train_f1"], history_with_lstm["valid_f1"], "", "F1 Score with BiLSTM")

In [None]:
lets_plot(history_with_lstm["train_loss"], history_with_lstm["valid_loss"], "", "Loss with BiLSTM")

In [None]:
lets_plot(history_holdoutval_with_lstm["train_f1"], history_holdoutval_with_lstm["valid_f1"], "", "F1 Score with BiLSTM After Holdout Validation", True)

Plot BiGRU model performance

In [None]:
lets_plot(history_with_gru["train_f1"], history_with_gru["valid_f1"], "", "F1 Score with BiGRU")

In [None]:
lets_plot(history_with_gru["train_loss"], history_with_gru["valid_loss"], "", "Loss with BiGRU")

In [None]:
lets_plot(history_holdoutval_with_gru["train_f1"], history_holdoutval_with_gru["valid_f1"], "", "F1 Score with BiGRU After Holdout Validation", True)

## Error Analysis