In [None]:
# baseline: lstm + attnetion --> trained just on steam reviews
# train on amazon, transfer learn on steam, re predict on amazon --> 
#      see if it still remembers what it learned from amazon dataset (should see performance decrease (forgets))


In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import sys

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
print (torch.cuda.is_available())
print (torch.cuda.current_device())
print (torch.cuda.get_device_name(0))
print (torch.cuda.memory_allocated())
print (torch.cuda.memory_cached())

True
0
GeForce GTX 1060 with Max-Q Design
0
0


In [3]:
df = pd.read_csv("data/cleaned_steam_data_4-15_15Kwords.csv", encoding='utf8', index_col=0)

In [4]:
title_columns = [x for x in df.columns.tolist() if x.startswith('title_')]
drop_cols = ['funny', 'is_early_access_review', 'helpful', 'review', 'cleaned_reviews', 'hour_played', 'Year', 'Month', 'Day']
drop_cols += title_columns

df.drop(drop_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,recommendation,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,1,0,0,0,0,0,0,0,0,0,...,5883,8055,5987,3547,5987,5030,3547,3547,11106,5315
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9665
2,1,0,0,0,0,0,0,0,0,0,...,11106,3090,13779,7175,7891,1064,3380,1917,5409,3118
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,13408,11246,9544,6270,11106
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11132,13289,12336,8729,12154,11106


In [5]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 14845 # should ideally just transport this from prev
EMBED_DIM = 128
LSTM_DIM = 64

In [6]:
# only need 2-3 lines for attention
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
    
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim 
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [7]:
# build pytorch model
DROPOUT = 0.1
BATCH_SIZE = 128

class Attention_Net(nn.Module):
    def __init__(self):
        super(Attention_Net, self).__init__()
        
        # define architecture
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        
        self.lstm = nn.LSTM(EMBED_DIM,
                            LSTM_DIM,
                            bidirectional=True,
                            dropout=0.2,
                            batch_first=True)
        
        # attention layer
        self.attention_layer = Attention(LSTM_DIM * 2, MAX_SEQ_LEN)
        
        self.linear = nn.Linear(LSTM_DIM * 2, 2) # change here to 1 or 2 depending on loss

        
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
        h_lstm, _ = self.lstm(h_embedding)
        h_lstm_atten = self.attention_layer(h_lstm)
        out = self.linear(h_lstm_atten)
        return out

In [8]:
class SteamDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).cuda()
        labels = data['recommendation'].tolist()
        
        self.one_hot_labels = torch.tensor(np.array(labels)).squeeze().type(torch.LongTensor).cuda() # change to longtensor if using custom loss
        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'
        
        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [9]:
train_num = int(0.8 * len(df))
steam_dataset = SteamDataset(df[:train_num])
steam_data_loader = data.DataLoader(steam_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)
steam_data_loader

<torch.utils.data.dataloader.DataLoader at 0x2e7e9d98dc8>

In [10]:
attention_model = Attention_Net().cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(attention_model.parameters(), lr=0.001)

  "num_layers={}".format(dropout, num_layers))


In [11]:
# def weighted_binary_cross_entropy(output, target, weights=None):
        
#     if weights is not None:
#         assert len(weights) == 2
        
#         loss = weights[1] * (target * torch.log(output)) + \
#                weights[0] * ((1 - target) * torch.log(1 - output))
#     else:
#         loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)

#     return torch.neg(torch.mean(loss))

In [12]:
# training loop
EPOCHS = 10
start = time.time()

for i in range(EPOCHS):
    second_start = time.time()
    running_loss = 0
    correct = 0

    with tqdm(total=len(steam_data_loader), file=sys.stdout) as pbar:
        for idx, (train_X, train_Y) in enumerate(steam_data_loader):
            optimizer.zero_grad()
            pred_y = attention_model(train_X)
#             print ('pred_y = ', pred_y)
#             print ('train_Y = ', train_Y)
            
            loss = loss_function(pred_y, train_Y)
            loss.backward()
            optimizer.step()
            running_loss += loss
            pbar.set_description('Epoch {} | summed loss = {}'.format(i+1, torch.round(running_loss)))
            pbar.update(1)
            tqdm._instances.clear()
    
    

    print ('Epoch {} | took {} seconds | accuracy: {}% | summed loss: {} | avg loss: {}'
                   .format(i+1, time.time() - second_start, round(correct / len(steam_data_loader), 2), running_loss, running_loss / len(steam_dataset)))

print ("Took {} seconds".format(time.time() - start))

HBox(children=(FloatProgress(value=0.0, max=2535.0), HTML(value='')))


Epoch 1 | took 75.44005990028381 seconds | accuracy: 0.0% | summed loss: 964.609130859375 | avg loss: 0.0029722806066274643


HBox(children=(FloatProgress(value=0.0, max=2535.0), HTML(value='')))


Epoch 2 | took 74.89218473434448 seconds | accuracy: 0.0% | summed loss: 705.8658447265625 | avg loss: 0.0021750067826360464


HBox(children=(FloatProgress(value=0.0, max=2535.0), HTML(value='')))


Epoch 3 | took 75.91214942932129 seconds | accuracy: 0.0% | summed loss: 652.403564453125 | avg loss: 0.0020102716516703367


HBox(children=(FloatProgress(value=0.0, max=2535.0), HTML(value='')))


Epoch 4 | took 75.32235598564148 seconds | accuracy: 0.0% | summed loss: 611.435546875 | avg loss: 0.0018840357661247253


HBox(children=(FloatProgress(value=0.0, max=2535.0), HTML(value='')))




KeyboardInterrupt: 

In [14]:
steam_eval_dataset = SteamDataset(df[train_num:])
steam_eval_data_loader = data.DataLoader(steam_eval_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

In [15]:
# evaluate
correct = 0
eval_loss = 0
with torch.no_grad():
    for i, (test_X, test_Y) in enumerate(steam_eval_data_loader):
        preds = attention_model(test_X)
        for idx, each_pred in enumerate(preds):
            if each_pred[0] >= 0.5 and test_Y[idx] == 0:
                correct += 1
            elif each_pred[0] < 0.5 and test_Y[idx] == 1:
                correct += 1
        loss = loss_function(preds, test_Y)
        eval_loss += loss
        
        
print ("Eval accuracy: {}".format(correct / len(steam_eval_dataset)))
print ("Eval summed loss: {} | avg loss: {}".format(eval_loss, eval_loss / len(steam_eval_dataset)))



Eval accuracy: 0.7607045135208421
Eval summed loss: 354.26202392578125 | avg loss: 0.004366382025182247


In [24]:
PATH = 'models/word200_date4-01_batch30_epoch500.pt'
torch.save(attention_model.state_dict(), PATH)