In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import sys

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
print (torch.cuda.is_available())
print (torch.cuda.current_device())
print (torch.cuda.get_device_name(0))
print (torch.cuda.memory_allocated())
print (torch.cuda.memory_cached())

True
0
GeForce GTX 970
0
0


In [3]:
df = pd.read_csv("data/cleaned_steam_data_4-15_15Kwords.csv", encoding='utf8', index_col=0)

In [4]:
title_columns = [x for x in df.columns.tolist() if x.startswith('title_')]
drop_cols = ['funny', 'is_early_access_review', 'helpful', 'review', 'cleaned_reviews', 'hour_played', 'Year', 'Month', 'Day']
drop_cols += title_columns

df.drop(drop_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,recommendation,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,1,0,0,0,0,0,0,0,0,0,...,5883,8055,5987,3547,5987,5030,3547,3547,11106,5315
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9665
2,1,0,0,0,0,0,0,0,0,0,...,11106,3090,13779,7175,7891,1064,3380,1917,5409,3118
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,13408,11246,9544,6270,11106
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11132,13289,12336,8729,12154,11106


# Grid Search Parameters

In [5]:
LEARNING_RATES_LIST = [0.0001, 0.0005, 0.00001]
EPOCHS_LIST = [50]
BATCH_SIZES_LIST = [64]
LSTM_DIMS_LIST = [64, 128]
EMBED_DIMS_LIST = [100]

**Storage for best model**

In [6]:
best_model = None
highest_acc = 0

In [7]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 400000# 14845 # should ideally just transport this from prev

In [8]:
class Attention_Net(nn.Module):
    def __init__(self, EMBED_DIM, LSTM_DIM, VOCAB_SIZE, BATCH_SIZE):
        super(Attention_Net, self).__init__()
        

        # define architecture
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM) # add pretrained embeding

        self.lstm = nn.LSTM(EMBED_DIM, 
                            LSTM_DIM, 
                            bidirectional=True,
                            dropout=0.2,
                            batch_first=True)

        # attention layer
#         self.attention_layer = Attention(LSTM_DIM * 2, MAX_SEQ_LEN)
        # try tanh

        self.linear = nn.Linear(LSTM_DIM*2, 2)

    def forward(self, x):
        embedding = self.embedding(x)
        embedding = torch.squeeze(torch.unsqueeze(embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
        lstm_out, (hidden, cell) = self.lstm(embedding)
#         attention = self.attention_layer(lstm_out)

        out = self.linear(lstm_out[:, -1, :])
        return out

In [9]:
class SteamDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).type(torch.LongTensor).cuda()
        labels = data['recommendation'].tolist()

        self.one_hot_labels = torch.tensor(np.array(labels)).squeeze().type(torch.LongTensor).cuda() # change to longtensor if using custom loss


    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'

        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [None]:
# build pytorch model
DROPOUT = 0.1
for LEARNING_RATE in LEARNING_RATES_LIST:
    for EPOCHS in EPOCHS_LIST:
        for EMBED_DIM in EMBED_DIMS_LIST:
            for LSTM_DIM in LSTM_DIMS_LIST:
                for BATCH_SIZE in BATCH_SIZES_LIST:

                    train_num = int(0.8 * len(df))
                    steam_dataset = SteamDataset(df[:train_num])
                    steam_data_loader = data.DataLoader(steam_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True, shuffle=True)

                    PATH = 'models/amzn_date4-20_gridsearch_batch64_epoch50_lstm64_lr0.0001.pt'
                    attention_model = Attention_Net(EMBED_DIM, LSTM_DIM, VOCAB_SIZE, BATCH_SIZE).cuda()
                    attention_model.load_state_dict(torch.load(PATH))

                    loss_function = nn.CrossEntropyLoss()
                    optimizer = optim.Adam(attention_model.parameters(), lr=LEARNING_RATE) # even lower for transfer learning

                    # training loop
                    start = time.time()

                    for i in range(EPOCHS):
                        second_start = time.time()
                        running_loss = 0
                        correct = 0
                        attention_model.train()

                        with tqdm(total=len(steam_data_loader), file=sys.stdout) as pbar:
                            for idx, (train_X, train_Y) in enumerate(steam_data_loader):

                                optimizer.zero_grad()

                                pred_y = attention_model(train_X) 
                                loss = loss_function(pred_y, train_Y)
                                loss.backward()
                                optimizer.step()
                                running_loss += loss

                                # calc accuracy
                                pred1_mask = pred_y[:, 1] > 0.5
                                masked_trainY_1 = train_Y[pred1_mask]
                                masked_trainY_0 = train_Y[~pred1_mask]
                                ones_predicted_correct = torch.sum(masked_trainY_1)
                                zeros_predicted_correct = torch.sum(masked_trainY_0)
                                correct += ones_predicted_correct.add(zeros_predicted_correct)
                                correct_ = correct.cpu().numpy()

                                # update progress bar
                                pbar.set_description('ep{} | loss: {} | acc: {}%'.format(i+1, torch.round(running_loss), round(correct_ / ((idx+1) * BATCH_SIZE)*100, 1)))
                                pbar.update(1)
                                tqdm._instances.clear()



                        print ('Epoch {} | took {} seconds | summed loss: {} | avg loss: {}'
                                       .format(i+1, time.time() - second_start, running_loss, running_loss / (len(steam_data_loader) * BATCH_SIZE)))

                    print ("Took {} seconds".format(time.time() - start))

                    print (attention_model)

                    steam_eval_dataset = SteamDataset(df[train_num:])
                    steam_eval_data_loader = data.DataLoader(steam_eval_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

                    # evaluate
                    correct = 0
                    eval_loss = 0
                    attention_model.eval()
                    with torch.no_grad():
                        for i, (test_X, test_Y) in enumerate(steam_eval_data_loader):
                            preds = attention_model(test_X).squeeze()
                            preds = torch.nn.functional.softmax(preds)
                            for idx, each_pred in enumerate(preds):
                                if each_pred[0] >= 0.5 and test_Y[idx] == 0:
                                    correct += 1
                                elif each_pred[0] < 0.5 and test_Y[idx] == 1:
                                    correct += 1
                            loss = loss_function(preds, test_Y)
                            eval_loss += loss
                            
                    accuracy = correct / len(amzn_eval_dataset)
                    if accuracy > highest_acc:
                        highest_acc = accuracy
                        best_model = attention_model.state_dict()
                        best_batch = BATCH_SIZE
                        best_lstm_dim = LSTM_DIM
                        best_epochs = EPOCHS
                        best_lr = LEARNING_RATE


                    print ("Eval accuracy: {}".format(correct / len(steam_eval_dataset)))
                    print ("Eval summed loss: {} | avg loss: {}".format(eval_loss, eval_loss / len(steam_eval_dataset)))

  "num_layers={}".format(dropout, num_layers))


HBox(children=(IntProgress(value=0, max=5070), HTML(value='')))

In [None]:
PATH = 'models/transfer_learning_grid_search_date4-21_batch' + str(best_batch) + '_epoch' + str(best_epochs) + '_lstm' + str(best_lstm_dim) + '_lr' + str(best_lr) + '.pt'
print(highest_acc)
torch.save(best_model, PATH)