In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
df = pd.read_csv("data/cleaned_steam_data_3-29.csv", encoding='utf8', index_col=0)

In [3]:
df.head()

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,title_Battlefleet Gothic: Armada 2,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,0,...,33572,15335,18484,25557,18484,12429,8541,25557,14122,23610
1,0,0,184,0,1,yes.,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30552
2,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,0,...,14122,39490,27271,39764,28167,8875,1494,40182,6951,39745
7,295,219,71,0,1,I have never been told to kill myself more tha...,0,0,0,0,...,0,0,0,0,0,34155,14517,19363,40032,14122
9,380,271,414,0,1,if you think cs go is toxic try this game,0,0,0,0,...,0,0,0,0,10818,2776,9421,19118,3218,14122


In [4]:
title_columns = [x for x in df.columns.tolist() if x.startswith('title_')]
drop_cols = ['funny', 'is_early_access_review', 'recommendation', 'review', 'cleaned_reviews', 'hour_played', 'Year', 'Month', 'Day']
drop_cols += title_columns

df.drop(drop_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,helpful,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,4,0,0,0,0,0,0,0,0,0,...,33572,15335,18484,25557,18484,12429,8541,25557,14122,23610
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30552
2,0,0,0,0,0,0,0,0,0,0,...,14122,39490,27271,39764,28167,8875,1494,40182,6951,39745
7,219,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,34155,14517,19363,40032,14122
9,271,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10818,2776,9421,19118,3218,14122


In [5]:
# convert to classification problem --> turn helpful to 0s 1s
new_helpful = []
num_positive = 0
for val in df.helpful.tolist():
    if val > 0:
        new_helpful.append(1)
        num_positive += 1
    else:
        new_helpful.append(0)
df.drop(['helpful'], axis=1, inplace=True)
df['helpful'] = new_helpful
df.head()

Unnamed: 0,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,encoded_10,...,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194,helpful
0,0,0,0,0,0,0,0,0,0,0,...,15335,18484,25557,18484,12429,8541,25557,14122,23610,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,30552,0
2,0,0,0,0,0,0,0,0,0,0,...,39490,27271,39764,28167,8875,1494,40182,6951,39745,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,34155,14517,19363,40032,14122,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,10818,2776,9421,19118,3218,14122,1


In [26]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 41248 # should ideally just transport this from prev
EMBED_DIM = 128
LSTM_DIM = 64

In [27]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
    
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim 
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [28]:
# build pytorch model
DROPOUT = 0.1
BATCH_SIZE = 16

class Attention_Net(nn.Module):
    def __init__(self):
        super(Attention_Net, self).__init__()
        
        # define architecture
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
#         self.embedding_dropout = nn.Dropout2d(DROPOUT) # take this out potentially
        
        self.lstm = nn.LSTM(EMBED_DIM, LSTM_DIM, bidirectional=True, batch_first=True)
        
        # attention layer
        self.attention_layer = Attention(LSTM_DIM * 2, MAX_SEQ_LEN)
        
        self.linear = nn.Linear(LSTM_DIM * 2, 2)
        
        self.softmax = nn.Softmax()
        
        
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
        h_lstm, _ = self.lstm(h_embedding)
        h_lstm_atten = self.attention_layer(h_lstm)
        out = self.linear(h_lstm_atten)
        softmax_out = self.softmax(out)
        return softmax_out
            
        

In [29]:
class SteamDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).cuda()
        labels = data['helpful'].tolist()
        
        one_hot_labels = []
        for val in labels:
            if val == 0:
                one_hot_labels.append([1, 0])
#                 one_hot_labels.append([0])
            else:
                one_hot_labels.append([0, 1])
#                 one_hot_labels.append([1])
        self.one_hot_labels = torch.tensor(np.array(one_hot_labels)).squeeze().type(torch.FloatTensor).cuda() # change to longtensor if using custom loss
        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'
        
        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [30]:
train_num = int(0.8 * len(df))
steam_dataset = SteamDataset(df[:train_num])

In [31]:
steam_data_loader = data.DataLoader(steam_dataset, batch_size=16, num_workers=0, drop_last=True)
steam_data_loader

<torch.utils.data.dataloader.DataLoader at 0x1b8048cc9c8>

In [32]:
num_negative = len(df) - num_positive
print ('positive examples = ', num_positive)
print ('negative examples = ', num_negative)

weights = torch.tensor([1/(num_negative / len(df)), 1 / (num_positive / len(df))]).cuda()
weights

positive examples =  32126
negative examples =  373543


tensor([ 1.0860, 12.6274], device='cuda:0')

In [33]:
attention_model = Attention_Net().cuda()
# loss_function = nn.CrossEntropyLoss(weight=weights)
# loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(attention_model.parameters(), lr=0.001)

In [34]:
def weighted_binary_cross_entropy(output, target, weights=None):
        
    if weights is not None:
        assert len(weights) == 2
        
        loss = weights[1] * (target * torch.log(output)) + \
               weights[0] * ((1 - target) * torch.log(1 - output))
    else:
        loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)

    return torch.neg(torch.mean(loss))

In [None]:
# training loop
EPOCHS = 5
checkpoint_num = 10000
start = time.time()

running_loss = 0
for i in range(EPOCHS):
    print ("On epoch: {}".format(i+1))
    idx = 0
    second_start = time.time()
    for train_X, train_Y in steam_data_loader:
        if (idx+1) % checkpoint_num == 0:
            print ('For {} train batches | took {} seconds | loss: {}'
                   .format(checkpoint_num, time.time() - second_start, running_loss / (checkpoint_num*BATCH_SIZE)))
            second_start = time.time()
            running_loss = 0
            
        attention_model.zero_grad()
        pred_y = attention_model(train_X)       

        loss = weighted_binary_cross_entropy(pred_y, train_Y, weights)
        # print ('pred_y = ', pred_y)
        # print ('train_Y = ', train_Y)
        # loss = loss_function(pred_y, train_Y)
        loss.backward()
        optimizer.step()
        running_loss += loss
        idx+=1
        
print ("Took {} seconds".format(time.time() - start))
        

On epoch: 1




For 10000 train batches | took 153.12880277633667 seconds | loss: 0.10620594024658203
For 10000 train batches | took 156.99644255638123 seconds | loss: 0.08813270181417465
On epoch: 2
For 10000 train batches | took 170.56140208244324 seconds | loss: 0.10770267248153687


In [153]:
# evaluate
with torch.no_grad():
    
    preds = attention_model(train_X[0])
    print ('predicted helpful = ', preds)
    print ('actual helpful = ', train_Y[0])

predicted helpful =  tensor([[0.6429]])
actual helpful =  tensor([4.])


In [49]:
torch.cuda.is_available()

True

In [51]:
torch.cuda.current_device()

0

In [50]:
torch.cuda.get_device_name(0)

'GeForce GTX 1060 with Max-Q Design'

In [56]:
# Returns the current GPU memory usage by 
# tensors in bytes for a given device
torch.cuda.memory_allocated()


631242752

In [57]:
# Returns the current GPU memory managed by the
# caching allocator in bytes for a given device
torch.cuda.memory_cached()

631242752

In [59]:
next(attention_model.parameters()).is_cuda

True