In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import sys

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
print (torch.cuda.is_available())
print (torch.cuda.current_device())
print (torch.cuda.get_device_name(0))
print (torch.cuda.memory_allocated())
print (torch.cuda.memory_cached())

True
0
GeForce GTX 1060 with Max-Q Design
0
0


In [3]:
df = pd.read_csv("data/cleaned_steam_data_3-29.csv", encoding='utf8', index_col=0)

In [4]:
title_columns = [x for x in df.columns.tolist() if x.startswith('title_')]
drop_cols = ['funny', 'is_early_access_review', 'helpful', 'review', 'cleaned_reviews', 'hour_played', 'Year', 'Month', 'Day']
drop_cols += title_columns

df.drop(drop_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,recommendation,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,1,0,0,0,0,0,0,0,0,0,...,33572,15335,18484,25557,18484,12429,8541,25557,14122,23610
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30552
2,1,0,0,0,0,0,0,0,0,0,...,14122,39490,27271,39764,28167,8875,1494,40182,6951,39745
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,34155,14517,19363,40032,14122
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10818,2776,9421,19118,3218,14122


In [5]:
# # convert to classification problem --> turn helpful to 0s 1s
# new_helpful = []
# num_positive = 0
# for val in df.helpful.tolist():
#     if val > 0:
#         new_helpful.append(1)
#         num_positive += 1
#     else:
#         new_helpful.append(0)
# df.drop(['helpful'], axis=1, inplace=True)
# df['helpful'] = new_helpful
# df.head()

In [6]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 41248 # should ideally just transport this from prev
EMBED_DIM = 128
LSTM_DIM = 64

In [7]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
    
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim 
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [8]:
# build pytorch model
DROPOUT = 0.1
BATCH_SIZE = 32

class Attention_Net(nn.Module):
    def __init__(self):
        super(Attention_Net, self).__init__()
        
        # define architecture
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
#         self.embedding_dropout = nn.Dropout2d(DROPOUT) # take this out potentially
        
        self.lstm = nn.LSTM(EMBED_DIM, LSTM_DIM, bidirectional=True, batch_first=True)
        
        # attention layer
        self.attention_layer = Attention(LSTM_DIM * 2, MAX_SEQ_LEN)
        
        self.linear = nn.Linear(LSTM_DIM * 2, 2) # change here to 1 or 2 depending on loss
        
        
        self.softmax = nn.Softmax()
        #self.softmax = nn.Sigmoid()

        
        
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
        h_lstm, _ = self.lstm(h_embedding)
        h_lstm_atten = self.attention_layer(h_lstm)
        out = self.linear(h_lstm_atten)
        softmax_out = self.softmax(out)
        return softmax_out
            
        

In [9]:
class SteamDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).cuda()
        labels = data['recommendation'].tolist()
        
        self.one_hot_labels = torch.tensor(np.array(labels)).squeeze().type(torch.LongTensor).cuda() # change to longtensor if using custom loss
        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'
        
        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [10]:
train_num = int(0.8 * len(df))
steam_dataset = SteamDataset(df[:train_num])
steam_data_loader = data.DataLoader(steam_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)
steam_data_loader

<torch.utils.data.dataloader.DataLoader at 0x1aa1cd45f48>

In [11]:
# num_negative = len(df) - num_positive
# print ('positive examples = ', num_positive)
# print ('negative examples = ', num_negative)

# weights = torch.tensor([1/(num_negative / len(df)), 1 / (num_positive / len(df))]).cuda()
# weights

In [12]:
attention_model = Attention_Net().cuda()
# loss_function = nn.CrossEntropyLoss(weight=weights)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(attention_model.parameters(), lr=0.005)

In [13]:
# def weighted_binary_cross_entropy(output, target, weights=None):
        
#     if weights is not None:
#         assert len(weights) == 2
        
#         loss = weights[1] * (target * torch.log(output)) + \
#                weights[0] * ((1 - target) * torch.log(1 - output))
#     else:
#         loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)

#     return torch.neg(torch.mean(loss))

In [21]:
# training loop
EPOCHS = 50
start = time.time()

for i in range(EPOCHS):
    second_start = time.time()
    running_loss = 0
    correct = 0

    with tqdm(total=len(steam_data_loader), file=sys.stdout) as pbar:
        for idx, (train_X, train_Y) in enumerate(steam_data_loader):
            attention_model.zero_grad()
            pred_y = attention_model(train_X)
            
            if i == EPOCHS - 1:
                # calc accuracy #
                for idx, each_pred in enumerate(pred_y):
                    if each_pred[0] >= 0.5 and train_Y[idx] == 0:
                        correct += 1
                    elif each_pred[0] < 0.5 and train_Y[idx] == 1:
                        correct += 1
            

            # loss = weighted_binary_cross_entropy(pred_y, train_Y, weights)

            loss = loss_function(pred_y, train_Y)
            loss.backward()
            optimizer.step()
            running_loss += loss
            pbar.set_description('Epoch {} | summed loss = {}'.format(i+1, torch.round(running_loss)))
            pbar.update(1)
            tqdm._instances.clear()
    
    

    print ('Epoch {} | took {} seconds | accuracy: {}% | summed loss: {} | avg loss: {}'
                   .format(i+1, time.time() - second_start, round(correct / len(steam_data_loader), 2), running_loss, running_loss / len(steam_dataset)))

print ("Took {} seconds".format(time.time() - start))

HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))




Epoch 1 | took 201.14004755020142 seconds | accuracy: 0.0% | summed loss: 5939.67431640625 | avg loss: 0.018302107229828835


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 2 | took 198.2076530456543 seconds | accuracy: 0.0% | summed loss: 5878.20361328125 | avg loss: 0.01811269484460354


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 3 | took 198.0682737827301 seconds | accuracy: 0.0% | summed loss: 5796.42333984375 | avg loss: 0.01786070317029953


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 4 | took 197.887277841568 seconds | accuracy: 0.0% | summed loss: 5709.64013671875 | avg loss: 0.017593294382095337


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 5 | took 197.28873801231384 seconds | accuracy: 0.0% | summed loss: 5591.1025390625 | avg loss: 0.01722804084420204


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 6 | took 198.21293759346008 seconds | accuracy: 0.0% | summed loss: 5269.8984375 | avg loss: 0.01623830571770668


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 7 | took 197.81958675384521 seconds | accuracy: 0.0% | summed loss: 6098.09033203125 | avg loss: 0.018790239468216896


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 8 | took 197.92127966880798 seconds | accuracy: 0.0% | summed loss: 6977.61572265625 | avg loss: 0.021500349044799805


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 9 | took 197.8317539691925 seconds | accuracy: 0.0% | summed loss: 5148.56396484375 | avg loss: 0.015864433720707893


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 10 | took 197.17630624771118 seconds | accuracy: 0.0% | summed loss: 4608.93798828125 | avg loss: 0.014201666228473186


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 11 | took 196.81707978248596 seconds | accuracy: 0.0% | summed loss: 4528.275390625 | avg loss: 0.013953118585050106


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 12 | took 197.92001795768738 seconds | accuracy: 0.0% | summed loss: 4462.5732421875 | avg loss: 0.013750668615102768


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 13 | took 196.826819896698 seconds | accuracy: 0.0% | summed loss: 4422.88818359375 | avg loss: 0.013628385029733181


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 14 | took 196.8264124393463 seconds | accuracy: 0.0% | summed loss: 4394.17431640625 | avg loss: 0.013539908453822136


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 15 | took 197.44379210472107 seconds | accuracy: 0.0% | summed loss: 4368.7001953125 | avg loss: 0.013461414724588394


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 16 | took 196.95662140846252 seconds | accuracy: 0.0% | summed loss: 4351.294921875 | avg loss: 0.013407782651484013


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 17 | took 197.00840592384338 seconds | accuracy: 0.0% | summed loss: 4332.24169921875 | avg loss: 0.013349073939025402


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 18 | took 197.41109371185303 seconds | accuracy: 0.0% | summed loss: 4320.9345703125 | avg loss: 0.013314232230186462


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 19 | took 196.8234088420868 seconds | accuracy: 0.0% | summed loss: 4307.232421875 | avg loss: 0.013272011652588844


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 20 | took 196.6126766204834 seconds | accuracy: 0.0% | summed loss: 4293.98876953125 | avg loss: 0.013231203891336918


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 21 | took 197.0632574558258 seconds | accuracy: 0.0% | summed loss: 4283.36669921875 | avg loss: 0.013198473490774632


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 22 | took 198.01160049438477 seconds | accuracy: 0.0% | summed loss: 4275.79345703125 | avg loss: 0.013175137341022491


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 23 | took 196.98233103752136 seconds | accuracy: 0.0% | summed loss: 4267.05224609375 | avg loss: 0.013148203492164612


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 24 | took 197.89132285118103 seconds | accuracy: 0.0% | summed loss: 4255.12744140625 | avg loss: 0.013111459091305733


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 25 | took 196.39027309417725 seconds | accuracy: 0.0% | summed loss: 4249.2861328125 | avg loss: 0.013093460351228714


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 26 | took 196.86012053489685 seconds | accuracy: 0.0% | summed loss: 4237.89453125 | avg loss: 0.013058358803391457


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 27 | took 196.48006534576416 seconds | accuracy: 0.0% | summed loss: 4237.85546875 | avg loss: 0.013058238662779331


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 28 | took 196.54633951187134 seconds | accuracy: 0.0% | summed loss: 4228.40625 | avg loss: 0.013029121793806553


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 29 | took 196.80881595611572 seconds | accuracy: 0.0% | summed loss: 4222.107421875 | avg loss: 0.013009713031351566


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 30 | took 197.29919242858887 seconds | accuracy: 0.0% | summed loss: 4215.85546875 | avg loss: 0.012990448623895645


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 31 | took 196.76988339424133 seconds | accuracy: 0.0% | summed loss: 4207.13818359375 | avg loss: 0.01296358834952116


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 32 | took 196.87943363189697 seconds | accuracy: 0.0% | summed loss: 4204.03515625 | avg loss: 0.012954026460647583


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 33 | took 197.66310858726501 seconds | accuracy: 0.0% | summed loss: 4200.78662109375 | avg loss: 0.012944016605615616


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 34 | took 197.76801681518555 seconds | accuracy: 0.0% | summed loss: 4194.1416015625 | avg loss: 0.012923541478812695


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 35 | took 198.85902571678162 seconds | accuracy: 0.0% | summed loss: 4197.607421875 | avg loss: 0.01293422095477581


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 36 | took 198.85053277015686 seconds | accuracy: 0.0% | summed loss: 4184.537109375 | avg loss: 0.012893946841359138


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 37 | took 196.97609901428223 seconds | accuracy: 0.0% | summed loss: 4177.7744140625 | avg loss: 0.012873108498752117


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 38 | took 197.7860369682312 seconds | accuracy: 0.0% | summed loss: 4177.26611328125 | avg loss: 0.012871542014181614


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 39 | took 197.83114790916443 seconds | accuracy: 0.0% | summed loss: 4172.21875 | avg loss: 0.01285598985850811


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 40 | took 197.4448320865631 seconds | accuracy: 0.0% | summed loss: 4168.41796875 | avg loss: 0.01284427847713232


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 41 | took 197.51915907859802 seconds | accuracy: 0.0% | summed loss: 4164.83349609375 | avg loss: 0.01283323299139738


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 42 | took 197.95931720733643 seconds | accuracy: 0.0% | summed loss: 4159.82861328125 | avg loss: 0.012817811220884323


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 43 | took 197.34106922149658 seconds | accuracy: 0.0% | summed loss: 4158.81640625 | avg loss: 0.01281469315290451


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 44 | took 196.96903657913208 seconds | accuracy: 0.0% | summed loss: 4158.13037109375 | avg loss: 0.012812579050660133


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 45 | took 197.699036359787 seconds | accuracy: 0.0% | summed loss: 4143.9931640625 | avg loss: 0.012769017368555069


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 46 | took 197.20813059806824 seconds | accuracy: 0.0% | summed loss: 4144.81103515625 | avg loss: 0.012771537527441978


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 47 | took 197.41102170944214 seconds | accuracy: 0.0% | summed loss: 4139.40771484375 | avg loss: 0.012754888273775578


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 48 | took 197.37067890167236 seconds | accuracy: 0.0% | summed loss: 4139.2958984375 | avg loss: 0.01275454368442297


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 49 | took 197.63027691841125 seconds | accuracy: 0.0% | summed loss: 4137.22802734375 | avg loss: 0.01274817157536745


HBox(children=(FloatProgress(value=0.0, max=10141.0), HTML(value='')))


Epoch 50 | took 357.60401678085327 seconds | accuracy: 28.95% | summed loss: 4130.20751953125 | avg loss: 0.012726538814604282
Took 10034.125909805298 seconds


In [22]:
steam_eval_dataset = SteamDataset(df[train_num:])
steam_eval_data_loader = data.DataLoader(steam_eval_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

In [23]:
# evaluate
correct = 0
eval_loss = 0
with torch.no_grad():
    for i, (test_X, test_Y) in enumerate(steam_eval_data_loader):
        preds = attention_model(test_X)
        for idx, each_pred in enumerate(preds):
            if each_pred[0] >= 0.5 and test_Y[idx] == 0:
                correct += 1
            elif each_pred[0] < 0.5 and test_Y[idx] == 1:
                correct += 1
        loss = loss_function(preds, test_Y)
        eval_loss += loss
        
        
print ("Eval accuracy: {}".format(correct / len(steam_eval_dataset)))
print ("Eval summed loss: {} | avg loss: {}".format(eval_loss, eval_loss / len(steam_eval_dataset)))



Eval accuracy: 0.8517637488599108
Eval summed loss: 1157.548095703125 | avg loss: 0.01426711492240429


In [24]:
PATH = 'models/word200_date4-01_batch30_epoch500.pt'
torch.save(attention_model.state_dict(), PATH)