## Installing & importing needed libraries

In [1]:
# ! pip install --upgrade pip
# ! pip install transformers
# ! pip install nltk
# ! pip install --upgrade gdown
# ! pip install --upgrade tqdm
# ! pip install pytorch-crf
# ! pip install torch torchvision torchaudio

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re
import os
import numpy as np
import pandas as pd
from IPython import display

import torch
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

from transformers import BertTokenizer, TFBertForSequenceClassification, BertModel
from transformers import TrainingArguments, Trainer
from transformers import InputExample, InputFeatures

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

import matplotlib.pyplot as plt
import gdown
from tqdm import tqdm, trange
import time

import wandb

from sagemaker import get_execution_role
# AWS Python SDK
import boto3

In [3]:
## When running on SageMaker, need execution role
# role = get_execution_role()

In [4]:
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [None]:
wandb.login(api_key='97b170d223eb55f86fe1fbf9640831ad76381a74')

In [None]:
wandb.init(project_name="steam_game_review_sentiment_analysis")

In [None]:
%env WANDB_PROJECT=steam_game_review_sentiment_analysis
%env WANDB_LOG_MODEL='end'

In [5]:
# nltk.download('all-corpora')

Downloading dataset

In [6]:
# uri = "https://drive.google.com/uc?id=1fJujYj3rBuh34FukQa2bG7lhefcYNNwX"
# output = "/home/sagemaker-user/datasets/dataset_steam_review.csv"
# if not os.path.exists("/home/sagemaker-user/datasets/"):
#   os.makedirs("/home/sagemaker-user/datasets/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

In [7]:
my_s3_bucket = "machinelearning-research"
dataset_source = "datasets/dataset_steam_review/dataset.csv"
output = "/home/sagemaker-user/datasets/dataset_steam_review.csv"

# if not os.path.exists("/home/sagemaker-user/datasets/"):
#   os.makedirs("/home/sagemaker-user/datasets/")

s3 = boto3.client("s3" ,
                 aws_access_key_id="AKIA5ZBI6IRP3CAH6P5X",
                 aws_secret_access_key="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"
                 )

# s3.download_file(
#     Bucket=my_s3_bucket, Key=dataset_source, Filename=output
# )

## Preprocessing Data

In [8]:
# df_steam_reviews = pd.read_csv(output)
df_steam_reviews = pd.read_csv("D:/Training/Machine Learning/Datasets/dataset_steam_review/dataset.csv")
df_steam_reviews = df_steam_reviews.sample(frac=1).reset_index(drop=True) #shuffle the data
df_steam_reviews.shape

(6417106, 5)

Remove early access reviews

In [9]:
# remove Early Access Reviews
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != 'Early Access Review']
# size of dataframe
df_steam_reviews.shape

(5392419, 5)

In [10]:
# Preprocessing

def remove_links(x):
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r"https\S+", "", x)
    x = re.sub(r"www.\S+", "", x)
    x = re.sub(".*\..*\..*", "", x)
    return x

def remove_hashtag(x):
    x = re.sub("@[A-Za-z0-9_]+","", x)
    x = re.sub("#[A-Za-z0-9_]+","", x)
    return x

def remove_punct(x):
    x = re.sub(r"[()!?:;,.'-]","", x)
    return x

def remove_emoji(x):
    x = x.replace(":)", "")
    x = x.replace(":-)", "")
    x = x.replace(":(", "")
    x = x.replace("-_-", "")
    x = x.replace(";)", "")
    x = x.replace(";-)", "")
    # REFERENCE FOR EMOJI_PATTERN: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    x = emoji_pattern.sub(r'', x)
    return x

def remove_titiktitik(x):
    x = x.replace("..", "")
    x = x.replace("...", "")
    x = x.replace("....", "")
    x = x.replace(".....", "")
    x = x.replace("...................", "")
    return x

def remove_money(x):
    x = re.sub("€", "", x)
    x = re.sub("$", "", x)
    x = x.replace("usd", "")
    return x

def fix_typo(x):
    x = x.replace("veru", "very")
    x = x.replace("gud", "good")
    x = x.replace("gut", "good")
    x = x.replace("withouth", "without")
    x = x.replace("noob", "newbie")
    x = x.replace("dis", "this")
    x = x.replace("noobs", "newbie")
    x = x.replace("nice1", "nice")
    x = x.replace("4ever", "forever")
    x = x.replace("w0n", "won")
    x = re.sub("&lt;3", "", x)
    x = x.replace("graficks", "graphics")
    x = x.replace("dissapeared", "disappeared")
    x = x.replace("yr", "year")
    x = x.replace("yrs", "years")
    x = x.replace("dosent", "doesnt")
    x = x.replace("awsume","awesome")
    x = re.sub("&lt3","",x)
    x = x.replace("compatative", "competitive")
    x = x.replace("cyyounterstrikesyyource", "counter strike source")
    x = x.replace("&amp","and")
    x = x.replace("yyoure","your")
    x = x.replace("cyyounter","counter")
    x = x.replace("child hood", "childhood")
    return x

In [11]:
# convert review text to string
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].astype(str)
# convert to lowercase
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(str.lower)

# drop the reviews with null score
df_steam_reviews = df_steam_reviews[df_steam_reviews["review_score"].notnull()]
df_steam_reviews["review_score"] = np.where(df_steam_reviews["review_score"]==-1, 0, df_steam_reviews["review_score"])

# remove links
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_links)

# remove hashtag
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_hashtag)

# removing punctuation
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_punct)

# removing dots
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_titiktitik)

# removing emoji
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_emoji)

# remove money symbols
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(remove_money)

# fix any typo
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(fix_typo)

# remove any stopwords
stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# lemmatize
lemmatizer = WordNetLemmatizer()
df_steam_reviews["review_text"] = df_steam_reviews["review_text"].apply(lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split()))

#remove empty text
df_steam_reviews = df_steam_reviews[df_steam_reviews.review_text.str.strip() != '']

# distribution of negative and positive reviews
df_steam_reviews["review_score"].value_counts()

1    1965881
0     300832
Name: review_score, dtype: int64

In [12]:
# Most reviewed game
df2 = df_steam_reviews.groupby(['app_id'])['app_id'].count().reset_index(name='count').sort_values(['count'], ascending=False)
df3 = df2['app_id'].values.tolist()[0:10]

print(df2.iloc[:10])
# print(df3)

for id in df3:
    df_gn = df_steam_reviews.loc[df_steam_reviews['app_id'].isin([id])]['app_name'].unique()
    print(df_gn)

      app_id  count
22       570  48575
1565  218620  47112
1181  105600  46821
21       550  31270
2156  252950  30497
1739  230410  25308
5872  391540  21905
2582  271590  21651
23       620  21038
132     4000  17596
['Dota 2']
['PAYDAY 2']
['Terraria']
['Left 4 Dead 2']
['Rocket League']
['Warframe']
['Undertale']
['Grand Theft Auto V']
['Portal 2']
["Garry's Mod"]


In [13]:
reviewed_steam = df_steam_reviews.loc[df_steam_reviews["app_id"].isin(df3)] # take 10 most reviewed game only

# sampling the data
p = 0.07
reviewed_steam = reviewed_steam.sample(frac = p).reset_index(drop=True) # take 7% of data and shuffle it

Balancing data

In [14]:
# # take the positive as many as the negative ones
total_data = len(reviewed_steam["review_score"])
total_data_positive = len(reviewed_steam[reviewed_steam["review_score"] == 1])
total_data_negative = len(reviewed_steam[reviewed_steam["review_score"] == 0])

print("Total data cleaned:", total_data)
print("Total data positive:", total_data_positive)
print("Total data negative:", total_data_negative)

df_steam_reviews_balanced_positive = reviewed_steam[reviewed_steam["review_score"] == 1].sample(n = int(1.5*total_data_negative))
df_steam_reviews_balanced_negative = reviewed_steam[reviewed_steam["review_score"] == 0] 
df_steam_reviews_balanced = pd.concat([df_steam_reviews_balanced_positive, df_steam_reviews_balanced_negative])
df_steam_reviews_balanced = df_steam_reviews_balanced.sample(frac = 1).reset_index(drop=True) #shuffle the data again
df_steam_reviews_balanced.head()

# print("Data Balanced with ratio 1:1...")
print(f"Jumlah data positive : jumlah data negative = {len(df_steam_reviews_balanced_positive)} : {total_data_negative}")

Total data cleaned: 21824
Total data positive: 19634
Total data negative: 2190
Jumlah data positive : jumlah data negative = 3285 : 2190


In [15]:
reviews = df_steam_reviews_balanced["review_text"].values.tolist()
labels = df_steam_reviews_balanced["review_score"].tolist()
# reviews = reviewed_steam["review_text"].values.tolist()
# labels = reviewed_steam["review_score"].tolist()

In [16]:
print(len(reviews))

5475


Preparing data into train/valid/test split

In [17]:
# split the dataset into train, validation and holdout sets (60-20-20)
training_sentences, test_sentences, training_labels, test_labels = train_test_split(reviews, labels, test_size=.4)

validation_sentences, holdout_sentences, validation_labels, holdout_labels = train_test_split(test_sentences, test_labels, test_size=.5)

## Vector Extraction using BERT

In [18]:
MAX_LEN = 128
NUM_LABELS = 2
BATCH = 64
DEVICE_USED = "cuda:0"
LEARNING_RATE = 4e-6
LAMBDA_L2 = 1e-6
EPOCHS = 500
# MODEL_PATH = "/home/sagemaker-user/Saved Models/Sentiment Analysis/proposed_model/"
MODEL_PATH = "D:/Training/Machine Learning/NLP/Sentiment Analysis/proposed_model/"

In [19]:
device = torch.device(DEVICE_USED if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [20]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('D:/Training/Machine Learning/Datasets/bert-base-uncased', model_max_length=MAX_LEN)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', model_max_length=MAX_LEN)

In [21]:
def TokenizeDataset(x_data, y_data):
    if len(x_data) != len(y_data):
        raise Exception("x_data and y_data size are different!")
    
    t = trange(len(x_data), colour="green", position=0, leave=True)
    
    out_padded_token_list = []
    out_att_mask = []
    out_tok_type_id = []
    out_target = []
    
    for sentence_idx in t:
        t.set_description(f"Tokenizing data [{sentence_idx + 1}/{len(x_data)}]...")
        encoded_sentence = tokenizer.encode_plus(
            x_data[sentence_idx],
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation = 'longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        padded_token_list = encoded_sentence['input_ids']
        att_mask = encoded_sentence['attention_mask']
        tok_type_id = encoded_sentence['token_type_ids']
        target = torch.tensor(y_data[sentence_idx])
        
        out_padded_token_list.append(padded_token_list)
        out_att_mask.append(att_mask)
        out_tok_type_id.append(tok_type_id)
        out_target.append(target)
        
    output_data = {
        "input_ids": out_padded_token_list,
        "attention_mask": out_att_mask,
        "token_type_ids": out_tok_type_id,
        "label": out_target
    }

    return output_data

In [22]:
tokenized_training = TokenizeDataset(training_sentences, training_labels)

Tokenizing data [3285/3285]...: 100%|[32m█████████████████████████████████████████████[0m| 3285/3285 [00:04<00:00, 671.05it/s][0m


In [23]:
tokenized_validation = TokenizeDataset(validation_sentences, validation_labels)

Tokenizing data [1095/1095]...: 100%|[32m█████████████████████████████████████████████[0m| 1095/1095 [00:01<00:00, 681.65it/s][0m


In [24]:
tokenized_holdout = TokenizeDataset(holdout_sentences, holdout_labels)

Tokenizing data [1095/1095]...: 100%|[32m█████████████████████████████████████████████[0m| 1095/1095 [00:01<00:00, 679.42it/s][0m


In [25]:
model = BertModel.from_pretrained(
    'D:/Training/Machine Learning/Datasets/bert-base-uncased', 
#     'bert-base-uncased',
    num_labels=NUM_LABELS,
    output_hidden_states = True, # Whether the model returns all hidden-states,
    )

Some weights of the model checkpoint at D:/Training/Machine Learning/Datasets/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# # Put model to GPU
# model.to(device)

Extract the embedding weight

In [27]:
class TokenizedData(Dataset):
    def __init__(self, input_ids, attention_mask, token_type_ids, labels):
        self.input_ids = input_ids
        self.att_mask = attention_mask
        self.tti = token_type_ids
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.att_mask[idx],
            "token_type_ids": self.tti[idx],
            "label": self.labels[idx]
        }

In [28]:
datas_train_tok = TokenizedData(tokenized_training['input_ids'], tokenized_training['attention_mask'], tokenized_training['token_type_ids'], tokenized_training['label'])

In [29]:
datas_valid_tok = TokenizedData(tokenized_validation['input_ids'], tokenized_validation['attention_mask'], tokenized_validation['token_type_ids'], tokenized_validation['label'])

In [30]:
datas_holdout_tok = TokenizedData(tokenized_holdout['input_ids'], tokenized_holdout['attention_mask'], tokenized_holdout['token_type_ids'], tokenized_holdout['label'])

In [31]:
dataloader_training_tok = DataLoader(
    datas_train_tok,
    batch_size = BATCH
)
dataloader_valid_tok = DataLoader(
    datas_valid_tok,
    batch_size = BATCH
)
dataloader_holdout_tok = DataLoader(
    datas_holdout_tok,
    batch_size = BATCH
)

In [32]:
# model.eval()
# with torch.no_grad():
#     outputs = model(
#         input_ids = tokenized_training['input_ids'][0],
#         attention_mask = tokenized_training['attention_mask'][0],
#         token_type_ids = tokenized_training['token_type_ids'][0]
#     )
    
#     last_hidden_state = outputs[0]
#     hidden_states = outputs[2]
#     initial_embedding = hidden_states[0] # initial embedding
#     word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
    
#     print("last hidden state:", last_hidden_state.size())
#     print("hidden states:", len(hidden_states))
#     print("initial embedding:", initial_embedding.size())
#     print("last 4 layers:", word_embed_4_last_layers.size())

In [33]:
def ExtractEmbedding(the_model, datas, total_data):
    the_model.eval() # put model to evaluation mode
    
    bert_embedding_sv = []
    bert_embedding_label = []
    
    with torch.no_grad():
        t = tqdm(enumerate(datas), colour="green", position=0, leave=True, total=len(datas))
        i = 0
        for batch, data in t:
            for idx in range(len(data["input_ids"])):
                in_ids = data["input_ids"][idx]
                att_mask = data["attention_mask"][idx]
                tok_type = data["token_type_ids"][idx]

                output = the_model(
                    input_ids = in_ids,
                    attention_mask = att_mask,
                    token_type_ids = tok_type
                )
                
                hidden_states = output[2]
                word_embed_4_last_layers = torch.stack(hidden_states[-4:]).sum(0) #sum of last 4 hidden layers
#                 print("last 4 layers:", word_embed_4_last_layers.size())
#                 print("label:", data['label'][idx])
                
                bert_embedding_sv.append(word_embed_4_last_layers)
                bert_embedding_label.append(data['label'][idx])
                
                t.set_description(f"Extracting embedding weight [{i+1}/{total_data}] ")
                t.refresh()
                
                i += 1
        return bert_embedding_sv, bert_embedding_label

In [34]:
training_embeddings, training_embd_labels = ExtractEmbedding(model, dataloader_training_tok, len(tokenized_training['input_ids']))

Extracting embedding weight [3285/3285] : 100%|[32m████████████████████████████████████████[0m| 52/52 [14:52<00:00, 17.17s/it][0m


In [35]:
validation_embeddings, valid_embd_labels = ExtractEmbedding(model, dataloader_valid_tok, len(tokenized_validation['input_ids']))

Extracting embedding weight [1095/1095] : 100%|[32m████████████████████████████████████████[0m| 18/18 [04:42<00:00, 15.69s/it][0m


In [36]:
holdout_embeddings, holdout_embd_labels = ExtractEmbedding(model, dataloader_holdout_tok, len(tokenized_holdout['input_ids']))

Extracting embedding weight [1095/1095] : 100%|[32m████████████████████████████████████████[0m| 18/18 [04:32<00:00, 15.15s/it][0m


In [37]:
print(f"Panjang embedding: {len(training_embeddings)}, Panjang label: {len(training_embd_labels)}")

Panjang embedding: 3285, Panjang label: 3285


In [45]:
class EmbeddingDataset(Dataset):
    def __init__(self, arr_embed, arr_lbl):
        super(EmbeddingDataset, self).__init__()
        self.array_embed = arr_embed
        self.array_label = arr_lbl
    def __len__(self):
        return len(self.array_embed)
    def __getitem__(self, idx):
        all_embedding = self.array_embed[idx][0, :, :] # torch.squeeze(self.array_embed[idx])
        cls_embedding = self.array_embed[idx][0, 0, :]
        cls_embedding = cls_embedding[None, :]
        data_pair = {
#             "all_embedding": all_embedding, # all embedding data (all of seq length) - size [128, 768]
#             "cls_embedding": cls_embedding, # CLS embedding only - size [1, 768]
            "x": cls_embedding,
            "labels": self.array_label[idx]
        }
        return data_pair

In [46]:
embed_train_dataset = EmbeddingDataset(training_embeddings, training_embd_labels)
embed_valid_dataset = EmbeddingDataset(validation_embeddings, valid_embd_labels)
embed_holdout_dataset = EmbeddingDataset(holdout_embeddings, holdout_embd_labels)

In [47]:
embed_holdoutval_dataset = EmbeddingDataset(training_embeddings + validation_embeddings, training_embd_labels + valid_embd_labels)

## Baseline Model

### Model Declaration

In [72]:
class ProposedModel1(torch.nn.Module):
    def __init__(self, lstm_in_size, lstm_hdn_size, lstm_layers, loss_func, lstm_dropout = 0.2):
        super(ProposedModel1, self).__init__()
        
        self.lstm_layers = lstm_layers
        self.lstm_hdn_size = lstm_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(lstm_in_size)
        self.lstm = torch.nn.LSTM(
            input_size = lstm_in_size,
            hidden_size = lstm_hdn_size//2,
            num_layers = lstm_layers,
            bidirectional = True,
            dropout = lstm_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = lstm_hdn_size,
            out_features = 512 
        )
        self.relu = torch.nn.ReLU()
        self.dropout1 = torch.nn.Dropout(p=0.55)
        self.fc2 = torch.nn.Linear(
            in_features = 512,
            out_features = 64
        )
        self.dropout2 = torch.nn.Dropout(p=0.45)
        self.fc3 = torch.nn.Linear(
            in_features = 64,
            out_features = 2
        )
        self.sigmoid = torch.nn.Sigmoid()
        self.loss_func = loss_func
    
    def process_weight(self, x):
        x_perm = x.permute(0, 2, 1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0, 2, 1)

        # Init the hidden state
        h0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        h0 = h0.to(device)
      
        # Init the cell state
        c0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        c0 = c0.to(device)

        h, _ = self.lstm(b_perm, (h0.detach(), c0.detach()))
        y1 = self.dropout1(h)
        y1 = self.fc1(y1)
        y1 = self.relu(y1)
        y1 = self.dropout1(y1)
        y1 = self.fc2(y1)
        y1 = self.relu(y1)
        y1 = self.dropout2(y1)
        y1 = self.fc3(y1)
        output = self.sigmoid(y1)
        
        return output
    
    def forward(self, x, labels):
        output = self.process_weight(x)
        y_pred = output[:,0,:].argmax(dim=1).float().requires_grad_()
        
        # Calculate loss
        loss = self.loss_func(y_pred, labels.float())
        
        return {"loss": loss, "logits": output[:,0,:]}

    def predict(self, x):
        output = self.process_weight(x)
        y_pred = output[:,0,:].argmax(dim=1)
        
        return y_pred

In [73]:
class ProposedModel2(torch.nn.Module):
    def __init__(self, gru_in_size, gru_hdn_size, gru_layers, gru_dropout, loss_func):
        super(ProposedModel2, self).__init__()
        
        self.gru_layers = gru_layers
        self.gru_hdn_size = gru_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(gru_in_size)
        self.gru = torch.nn.GRU(
            input_size = gru_in_size,
            hidden_size = gru_hdn_size//2,
            num_layers = gru_layers,
            bidirectional = True,
            dropout = gru_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = gru_hdn_size,
            out_features = 512
        )
        self.relu = torch.nn.ReLU()
        self.dropout1 = torch.nn.Dropout(p=0.66)
        self.fc2 = torch.nn.Linear(
            in_features = 512,
            out_features = 128
        )
        self.dropout2 = torch.nn.Dropout(p=0.54)
        self.fc3 = torch.nn.Linear(
            in_features = 128,
            out_features = 2
        )
        self.sigmoid = torch.nn.Sigmoid()
        self.loss_func = loss_func
    
    def process_weight(self, x):
        x_perm = x.permute(0,2,1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0,2,1)
        
        # Init the hidden state
        h0 = torch.zeros(2*self.gru_layers, x.size(0), self.gru_hdn_size//2).requires_grad_()
        h0 = h0.to(device)

        h, _ = self.gru(b_perm, h0.detach())
        y1 = self.dropout1(h)
        y1 = self.fc1(h)
        y1 = self.relu(y1)
        y1 = self.dropout1(y1)
        y1 = self.fc2(y1)
        y1 = self.relu(y1)
        y1 = self.dropout2(y1)
        y1 = self.fc3(y1)
        output = self.sigmoid(y1)
        
        return output
    
    def forward(self, x, labels):
        output = self.process_weight(x)
        y_pred = output[:,0,:].argmax(dim=1).float().requires_grad_()
        
        # Calculate loss
        loss = self.loss_func(y_pred, labels.float())
        
        return {"loss": loss, "logits": output[:,0,:]}
    
    def predict(self, x):
        output = self.process_weight(x)
        y_pred = output[:,0,:].argmax(dim=1)
        
        return y_pred

In [74]:
# class_weight = torch.Tensor([1.5, 1])
proposed_model1 = ProposedModel1(
    lstm_in_size = 768,
    lstm_hdn_size = 1024,
    lstm_layers = 2,
    lstm_dropout = 0.59,
    loss_func = torch.nn.BCEWithLogitsLoss()
)
proposed_model1 = proposed_model1.to(device)
print(proposed_model1)

ProposedModel1(
  (batchnorm): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(768, 512, num_layers=2, batch_first=True, dropout=0.59, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (relu): ReLU()
  (dropout1): Dropout(p=0.55, inplace=False)
  (fc2): Linear(in_features=512, out_features=64, bias=True)
  (dropout2): Dropout(p=0.45, inplace=False)
  (fc3): Linear(in_features=64, out_features=2, bias=True)
  (sigmoid): Sigmoid()
  (loss_func): BCEWithLogitsLoss()
)


In [75]:
# class_weight = torch.Tensor([1.5, 1])
proposed_model2 = ProposedModel2(
    gru_in_size = 768,
    gru_hdn_size = 1024,
    gru_layers = 2,
    gru_dropout = 0.65,
    loss_func = torch.nn.BCELoss()
)
proposed_model2 = proposed_model2.to(device)
print(proposed_model2)

ProposedModel2(
  (batchnorm): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gru): GRU(768, 512, num_layers=2, batch_first=True, dropout=0.65, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (relu): ReLU()
  (dropout1): Dropout(p=0.66, inplace=False)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (dropout2): Dropout(p=0.54, inplace=False)
  (fc3): Linear(in_features=128, out_features=2, bias=True)
  (sigmoid): Sigmoid()
  (loss_func): BCEWithLogitsLoss()
)


### Training & Validation

#### Create all needed functions to train, validate, and plot

Function to calculate metrics

In [98]:
def calc_metric_for_trainer(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

Create a function to plot

In [56]:
def lets_plot(training_value, validation_value, y_caption, title, holdout_validation = False, background_color='black'):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1) # nrows, ncols, index
    ax.set_facecolor(background_color)
    plt.plot(training_value, color='red', label='Train')
    if validation_value != None:
        if holdout_validation == False:
            plt.plot(validation_value, color='yellow', label='Valid')
            plt.legend(['Train', 'Valid'], loc='upper right')
        else:
            plt.plot(validation_value, color='yellow', label='Valid')
            plt.legend(['Train+Valid', 'Test'], loc='upper right')
    else:
        plt.legend(['Train'], loc='upper right')
    plt.title(title)
    
    plt.xlabel('Epochs')
    plt.ylabel(y_caption)
    plt.grid(color='white', linestyle='--', linewidth=0.5)
    plt.show

#### Train and Validate

Training for LSTM (training data vs validation data)

In [76]:
training_args_bilstm = TrainingArguments(
    output_dir=MODEL_PATH + "/bilstm/trainer/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    seed=101,
    learning_rate=LEARNING_RATE,
    report_to="wandb", # "none"
    run_name="bilstm"
)

In [77]:
trainer_bilstm = Trainer(
    model=proposed_model1,
    args=training_args_bilstm,
    train_dataset=embed_train_dataset,
    eval_dataset=embed_valid_dataset,
    compute_metrics=calc_metric_for_trainer,
)

In [78]:
trainer_bilstm.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.697168,0.43105,0.43105,0.43105,0.377181
2,No log,0.7,0.426484,0.426484,0.426484,0.374023
3,No log,0.699525,0.428311,0.428311,0.428311,0.377459
4,No log,0.69927,0.430137,0.430137,0.430137,0.381564
5,No log,0.696933,0.437443,0.437443,0.437443,0.393529
6,No log,0.69852,0.429224,0.429224,0.429224,0.376649
7,No log,0.698647,0.428311,0.428311,0.428311,0.374551
8,No log,0.698556,0.427397,0.427397,0.427397,0.371687
9,No log,0.696622,0.43653,0.43653,0.43653,0.390209
10,0.701000,0.698046,0.43105,0.43105,0.43105,0.380085


TrainOutput(global_step=26000, training_loss=0.7017178732065054, metrics={'train_runtime': 543.5357, 'train_samples_per_second': 3021.881, 'train_steps_per_second': 47.835, 'total_flos': 0.0, 'train_loss': 0.7017178732065054, 'epoch': 500.0})

Holdout validation for LSTM (training data + validation data vs test data)

In [82]:
trainer_bilstm_holdout = Trainer(
    model=proposed_model1,
    args=training_args_bilstm,
    train_dataset=embed_holdoutval_dataset,
    eval_dataset=embed_holdout_dataset,
    compute_metrics=calc_metric_for_trainer,
)

In [83]:
trainer_bilstm_holdout.train()
wandb.finish()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.692331,0.466667,0.466667,0.466667,0.425266
2,No log,0.691546,0.46758,0.46758,0.46758,0.425328
3,No log,0.690796,0.466667,0.466667,0.466667,0.420857
4,No log,0.692587,0.46484,0.46484,0.46484,0.421432
5,No log,0.690979,0.468493,0.468493,0.468493,0.426004
6,No log,0.693974,0.461187,0.461187,0.461187,0.416205
7,No log,0.692076,0.468493,0.468493,0.468493,0.429044
8,0.703200,0.691765,0.46758,0.46758,0.46758,0.425944
9,0.703200,0.69127,0.46484,0.46484,0.46484,0.417563
10,0.703200,0.692367,0.46484,0.46484,0.46484,0.4208


TrainOutput(global_step=34500, training_loss=0.7017109445765398, metrics={'train_runtime': 618.6354, 'train_samples_per_second': 3540.05, 'train_steps_per_second': 55.768, 'total_flos': 0.0, 'train_loss': 0.7017109445765398, 'epoch': 500.0})

Training for GRU (training data vs validation data)

In [79]:
training_args_bigru = TrainingArguments(
    output_dir=MODEL_PATH + "/bigru/trainer/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    seed=101,
    learning_rate=LEARNING_RATE,
    report_to="none" #"azure-ml"
)

In [80]:
trainer_bigru = Trainer(
    model=proposed_model2,
    args=training_args_bigru,
    train_dataset=embed_train_dataset,
    eval_dataset=embed_valid_dataset,
    compute_metrics=calc_metric_for_trainer,
)

In [81]:
trainer_bigru.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.708874,0.595434,0.595434,0.595434,0.473664
2,No log,0.710792,0.592694,0.592694,0.592694,0.466572
3,No log,0.708308,0.596347,0.596347,0.596347,0.475541
4,No log,0.710792,0.592694,0.592694,0.592694,0.466572
5,No log,0.709221,0.594521,0.594521,0.594521,0.473167
6,No log,0.710226,0.593607,0.593607,0.593607,0.468479
7,No log,0.707394,0.598174,0.598174,0.598174,0.477914
8,No log,0.708655,0.595434,0.595434,0.595434,0.475041
9,No log,0.710573,0.592694,0.592694,0.592694,0.467993
10,0.705700,0.70966,0.594521,0.594521,0.594521,0.470379


TrainOutput(global_step=26000, training_loss=0.7053646251971905, metrics={'train_runtime': 474.3216, 'train_samples_per_second': 3462.841, 'train_steps_per_second': 54.815, 'total_flos': 0.0, 'train_loss': 0.7053646251971905, 'epoch': 500.0})

Holdout validation for GRU (training data + validation data vs test data)

In [84]:
trainer_bigru_holdout = Trainer(
    model=proposed_model2,
    args=training_args_bigru,
    train_dataset=embed_holdoutval_dataset,
    eval_dataset=embed_holdout_dataset,
    compute_metrics=calc_metric_for_trainer,
)

In [85]:
trainer_bigru_holdout.train()
wandb.finish()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.727761,0.578995,0.578995,0.578995,0.443878
2,No log,0.727103,0.578995,0.578995,0.578995,0.448248
3,No log,0.727889,0.578082,0.578082,0.578082,0.444878
4,No log,0.728455,0.577169,0.577169,0.577169,0.442943
5,No log,0.727542,0.578995,0.578995,0.578995,0.445349
6,No log,0.726756,0.579909,0.579909,0.579909,0.448727
7,No log,0.728455,0.577169,0.577169,0.577169,0.442943
8,0.704000,0.728108,0.578082,0.578082,0.578082,0.443411
9,0.704000,0.727761,0.578995,0.578995,0.578995,0.443878
10,0.704000,0.727103,0.578995,0.578995,0.578995,0.448248


TrainOutput(global_step=34500, training_loss=0.7049176264223845, metrics={'train_runtime': 535.8255, 'train_samples_per_second': 4087.152, 'train_steps_per_second': 64.387, 'total_flos': 0.0, 'train_loss': 0.7049176264223845, 'epoch': 500.0})

#### Test

In [86]:
prediction_bilstm = trainer_bilstm_holdout.predict(embed_holdout_dataset)

In [125]:
print("Testing metrics:", prediction_bilstm[2])

Testing metrics: {'test_loss': 0.6929692029953003, 'test_accuracy': 0.4621004566210046, 'test_precision': 0.4621004566210046, 'test_recall': 0.4621004566210046, 'test_f1': 0.4155708620914493, 'test_runtime': 0.2336, 'test_samples_per_second': 4686.842, 'test_steps_per_second': 77.044}


In [87]:
prediction_bigru = trainer_bigru_holdout.predict(embed_holdout_dataset)

In [126]:
print("Testing metrics:", prediction_bigru[2])

Testing metrics: {'test_loss': 0.7263175845146179, 'test_accuracy': 0.5799086757990868, 'test_precision': 0.5799086757990868, 'test_recall': 0.5799086757990868, 'test_f1': 0.4515783468175115, 'test_runtime': 0.1557, 'test_samples_per_second': 7032.671, 'test_steps_per_second': 115.606}


#### Plot Training & Validation

Plot BiLSTM model performance

In [None]:
# lets_plot(history_with_lstm["train_f1"], history_with_lstm["valid_f1"], "", "F1 Score with BiLSTM")

In [None]:
# lets_plot(history_holdoutval_with_lstm["train_f1"], history_holdoutval_with_lstm["valid_f1"], "", "F1 Score with BiLSTM After Holdout Validation", True)

In [None]:
# lets_plot(history_with_lstm["train_loss"], history_with_lstm["valid_loss"], "", "Loss with BiLSTM")

Plot BiGRU model performance

In [None]:
# lets_plot(history_with_gru["train_f1"], history_with_gru["valid_f1"], "", "F1 Score with BiGRU")

In [None]:
# lets_plot(history_with_gru["train_loss"], history_with_gru["valid_loss"], "", "Loss with BiGRU")

In [None]:
# lets_plot(history_holdoutval_with_gru["train_f1"], history_holdoutval_with_gru["valid_f1"], "", "F1 Score with BiGRU After Holdout Validation", True)

## Proposed Method

### BiLSTM + CRF

In [113]:
class ProposedModel3(torch.nn.Module):
    def __init__(self, p_dropout, lstm_in_size, lstm_hdn_size, lstm_layers, lstm_dropout = 0.2):
        super(ProposedModel3, self).__init__()
        
        self.lstm_layers = lstm_layers
        self.lstm_hdn_size = lstm_hdn_size
        
        self.batchnorm = torch.nn.BatchNorm1d(lstm_in_size)
        self.lstm = torch.nn.LSTM(
            input_size = lstm_in_size,
            hidden_size = lstm_hdn_size//2,
            num_layers = lstm_layers,
            bidirectional = True,
            dropout = lstm_dropout,
            batch_first = True
        )
        self.fc1 = torch.nn.Linear(
            in_features = lstm_hdn_size,
            out_features = 2 # number of output classes
        )
        self.dropout = torch.nn.Dropout(p=p_dropout)
        self.crf = CRF(2, batch_first=True)
    
    def process_emission(self, x):
        x_perm = x.permute(0, 2, 1)
        b = self.batchnorm(x_perm)
        b_perm = b.permute(0, 2, 1)

        # Init the hidden state
        h0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        h0 = h0.to(device)
      
        # Init the cell state
        c0 = torch.zeros(2*self.lstm_layers, x.size(0), self.lstm_hdn_size//2).requires_grad_()
        c0 = c0.to(device)

        h, _ = self.lstm(b_perm, (h0.detach(), c0.detach()))
        d = self.dropout(h)
        emission = self.fc1(d)
        return emission
    
    def forward(self, x, labels, att_mask=None):
        emission = self.process_emission(x)
        emission_perm = emission.permute(1,0,2)
        labels = labels[None,:]
        loss = -1 * self.crf(emission_perm, labels)
        
        return {"loss": loss, "logits": emission}
    
    def __evaluate__(self, x, att_mask=None):
        emission = self.process_emission(x)
        emission_perm = emission.permute(1,0,2)
        decoded = self.crf.decode(emission_perm)
        print("eval decoded:", decoded)
        
        return decoded

    def predict(self, x, att_mask=None):
        emission = self.process_emission(x)
        emission_perm = emission.permute(1,0,2)
        decoded = self.crf.decode(emission_perm)
        
        return decoded

In [114]:
proposed_method3 = ProposedModel3(
    p_dropout = 0.45,
    lstm_in_size = 768,
    lstm_hdn_size = 1024,
    lstm_layers = 2,
    lstm_dropout = 0.52
)
proposed_method3 = proposed_method3.to(device)

In [118]:
def calc_metric_for_trainer2(p):
    pred, labels = p
    pred = pred[:,0,:]
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [119]:
training_args_bilstm_crf = TrainingArguments(
    output_dir=MODEL_PATH + "/bilstm-crf/trainer/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    seed=101,
    learning_rate=LEARNING_RATE,
    report_to="none" #"azure-ml"
)

In [120]:
trainer_bilstm_crf = Trainer(
    model=proposed_method3,
    args=training_args_bilstm_crf,
    train_dataset=embed_train_dataset,
    eval_dataset=embed_valid_dataset,
    compute_metrics=calc_metric_for_trainer2,
)

In [121]:
trainer_bilstm_crf.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,43.09024,0.632877,0.632877,0.632877,0.538263
2,No log,42.833546,0.680365,0.680365,0.680365,0.636848
3,No log,42.474873,0.694064,0.694064,0.694064,0.662589
4,No log,42.226841,0.705936,0.705936,0.705936,0.687449
5,No log,41.862316,0.718721,0.718721,0.718721,0.70357
6,No log,41.511837,0.725114,0.725114,0.725114,0.714044
7,No log,41.173527,0.724201,0.724201,0.724201,0.71782
8,No log,40.804672,0.725114,0.725114,0.725114,0.720023
9,No log,40.363831,0.728767,0.728767,0.728767,0.723744
10,41.668100,40.06163,0.726027,0.726027,0.726027,0.724696


TrainOutput(global_step=26000, training_loss=16.564458627554085, metrics={'train_runtime': 6119.6101, 'train_samples_per_second': 268.399, 'train_steps_per_second': 4.249, 'total_flos': 0.0, 'train_loss': 16.564458627554085, 'epoch': 500.0})

In [123]:
trainer_bilstm_crf_holdout = Trainer(
    model=proposed_method3,
    args=training_args_bilstm_crf,
    train_dataset=embed_holdoutval_dataset,
    eval_dataset=embed_holdout_dataset,
    compute_metrics=calc_metric_for_trainer2,
)

In [124]:
trainer_bilstm_crf_holdout.train()
wandb.finish()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,32.707649,0.798174,0.798174,0.798174,0.797065
2,No log,32.862682,0.796347,0.796347,0.796347,0.794573
3,No log,32.69363,0.799087,0.799087,0.799087,0.79767
4,No log,32.304409,0.8,0.8,0.8,0.798988
5,No log,32.557915,0.794521,0.794521,0.794521,0.792928
6,No log,32.959053,0.793607,0.793607,0.793607,0.791121
7,No log,32.478275,0.796347,0.796347,0.796347,0.794864
8,14.083600,32.198376,0.8,0.8,0.8,0.799394
9,14.083600,32.019386,0.800913,0.800913,0.800913,0.800572
10,14.083600,32.360889,0.795434,0.795434,0.795434,0.7938


TrainOutput(global_step=34500, training_loss=4.252512012426404, metrics={'train_runtime': 7737.2316, 'train_samples_per_second': 283.047, 'train_steps_per_second': 4.459, 'total_flos': 0.0, 'train_loss': 4.252512012426404, 'epoch': 500.0})

In [127]:
prediction_bilstm_crf = trainer_bilstm_crf_holdout.predict(embed_holdout_dataset)

In [128]:
print("Testing metrics:", prediction_bilstm_crf[2])

Testing metrics: {'test_loss': 58.3538818359375, 'test_accuracy': 0.8054794520547945, 'test_precision': 0.8054794520547945, 'test_recall': 0.8054794520547945, 'test_f1': 0.804495007110843, 'test_runtime': 0.8501, 'test_samples_per_second': 1288.017, 'test_steps_per_second': 21.173}


Upload saved models to AWS S3

In [None]:
def UploadModels(model_folder, bucket_name, saved_dir):
    files = os.listdir(model_folder)
    
    if model_folder[-1] == "/":
        model_folder = model_folder[:-1]
    if saved_dir[-1] != "/":
        saved_dir += "/"
    
    t = tqdm(files, colour="green", position=0, leave=True)
    for file_source in t:
        t.set_description(f"Uploading model {file_source} to AWS S3...")
        s3.upload_file(
            Filename = model_folder,
            Bucket = bucket_name,
            Key = saved_dir + file_source
        )

In [None]:
UploadModels(MODEL_PATH + "/bilstm/", my_s3_bucket, "saved-models/Steam-Review-Sentiment-Analysis/bilstm/")

In [None]:
UploadModels(MODEL_PATH + "/bigru/", my_s3_bucket, "saved-models/Steam-Review-Sentiment-Analysis/bigru/")

## Error Analysis