In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import sys

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
print (torch.cuda.is_available())
print (torch.cuda.current_device())
print (torch.cuda.get_device_name(0))
print (torch.cuda.memory_allocated())
print (torch.cuda.memory_cached())

True
0
GeForce GTX 1060 with Max-Q Design
0
0


In [3]:
df = pd.read_csv("data/cleaned_amzn_data_4-15_10Kwords.csv", encoding='utf8', index_col=0)

In [4]:
drop_cols = ['review', 'cleaned_reviews']

try:
    df.drop(drop_cols, axis=1, inplace=True)
except:
    print ("Probably dropped already")
df = df.rename(columns={'overall': 'recommendation'})
df.head()

Unnamed: 0,recommendation,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,0,0,0,0,0,0,0,0,0,0,...,4059,9289,8594,9289,4934,7474,3382,652,2097,2876
1,1,0,0,0,0,0,0,0,0,0,...,3340,8561,9289,214,5126,6257,2827,6823,1256,8798
2,0,0,0,0,0,0,0,0,0,0,...,1745,5242,506,2434,7599,8764,5242,7146,6949,3506
4,1,0,0,0,0,0,0,0,0,0,...,7514,5853,5815,9606,595,8561,243,2076,2734,9289
5,1,0,0,0,0,0,0,0,0,0,...,8375,3595,1356,2298,8561,7502,2298,1329,6555,6758


In [5]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 10746 # should ideally just transport this from prev
EMBED_DIM = 32
LSTM_DIM = 16

In [6]:
# build pytorch model
DROPOUT = 0.1
BATCH_SIZE = 64

class Attention_Net(nn.Module):
    def __init__(self):
        super(Attention_Net, self).__init__()
        
        # Number of input features is 12.
        self.layer_1 = nn.Linear(MAX_SEQ_LEN, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
        self.sig = nn.Sigmoid()
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return self.sig(x)
#         # define architecture
# #         self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        
# #         self.lstm = nn.LSTM(EMBED_DIM, 
# #                             LSTM_DIM, 
# #                             bidirectional=True,
# #                             dropout=0.2,
# #                             batch_first=True)
#         self.first_lin = nn.Linear(MAX_SEQ_LEN, 64)
#         self.leakyrelu = nn.LeakyReLU()
#         self.linear = nn.Linear(64, 1)
# #         self.linear = nn.Linear(LSTM_DIM * 2, 1)
#         self.activation = nn.Sigmoid()
        
        
        
#     def forward(self, x):
#         out = self.first_lin(x)
#         out = self.leakyrelu(out)
#         out = self.linear(out)
#         out = self.activation(out)
#         return out
# #         h_embedding = self.embedding(x)
# #         h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
# #         lstm_out, _ = self.lstm(h_embedding)
        
# #         relu = self.leakyrelu(lstm_out[:, -1, :])
# #         out = self.linear(relu)
# #         activated_out = self.activation(out)
# #         return activated_out

In [7]:
class AmznDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).type(torch.FloatTensor).cuda()
        labels = data['recommendation'].tolist()
        
        self.one_hot_labels = torch.tensor(np.array(labels)).squeeze().type(torch.FloatTensor).cuda() # change to longtensor if using custom loss
        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'
        
        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [8]:
train_num = int(0.8 * len(df))
amzn_dataset = AmznDataset(df[:train_num])
amzn_data_loader = data.DataLoader(amzn_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True, shuffle=True)
amzn_data_loader

<torch.utils.data.dataloader.DataLoader at 0x23996fbca48>

In [9]:
attention_model = Attention_Net().cuda()
# loss_function = nn.CrossEntropyLoss(weight=weights)
# loss_function = nn.CrossEntropyLoss()
loss_function = nn.BCELoss().cuda()
optimizer = optim.SGD(attention_model.parameters(), lr=0.01)

In [11]:
# training loop
EPOCHS = 5
start = time.time()

for i in range(EPOCHS):
    second_start = time.time()
    running_loss = 0
    correct = 0
    
    
    with tqdm(total=len(amzn_data_loader), file=sys.stdout) as pbar:
        for idx, (train_X, train_Y) in enumerate(amzn_data_loader):
            attention_model.train()
            optimizer.zero_grad()
            
            
            pred_y = attention_model(train_X)  
#             print ('pred_y = ', pred_y)
            loss = loss_function(pred_y, train_Y)
            loss.backward()
            optimizer.step()
            
            running_loss += loss

            pbar.set_description('ep{} | loss: {}'.format(i+1, torch.round(running_loss)))


            pbar.update(1)
            tqdm._instances.clear()
            
            
    
    

    print ('Epoch {} | took {} seconds | summed loss: {} | avg loss: {}'
                   .format(i+1, time.time() - second_start, running_loss, running_loss / (len(amzn_data_loader) * BATCH_SIZE)))

print ("Took {} seconds".format(time.time() - start))

HBox(children=(FloatProgress(value=0.0, max=2422.0), HTML(value='')))


Epoch 1 | took 18.701964855194092 seconds | summed loss: 1348.1436767578125 | avg loss: 0.008697252720594406


HBox(children=(FloatProgress(value=0.0, max=2422.0), HTML(value='')))


Epoch 2 | took 18.79474139213562 seconds | summed loss: 1328.5531005859375 | avg loss: 0.008570868521928787


HBox(children=(FloatProgress(value=0.0, max=2422.0), HTML(value='')))


Epoch 3 | took 18.585352897644043 seconds | summed loss: 1327.464111328125 | avg loss: 0.008563842624425888


HBox(children=(FloatProgress(value=0.0, max=2422.0), HTML(value='')))


Epoch 4 | took 18.67605948448181 seconds | summed loss: 1327.1907958984375 | avg loss: 0.008562079630792141


HBox(children=(FloatProgress(value=0.0, max=2422.0), HTML(value='')))


Epoch 5 | took 18.85761046409607 seconds | summed loss: 1326.9930419921875 | avg loss: 0.008560803718864918
Took 93.62165307998657 seconds


In [12]:
amzn_eval_dataset = AmznDataset(df[train_num:])
amzn_eval_data_loader = data.DataLoader(amzn_eval_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

In [14]:
# evaluate
correct = 0
eval_loss = 0
with torch.no_grad():
    for i, (test_X, test_Y) in enumerate(amzn_eval_data_loader):
        preds = attention_model(test_X)
#         print ("preds = ", preds)
#         print ("test_Y = ", test_Y)
        for idx, each_pred in enumerate(preds):
            if each_pred[0] >= 0.5 and test_Y[idx] == 1:
                correct += 1
            elif each_pred[0] < 0.5 and test_Y[idx] == 0:
                correct += 1
        loss = loss_function(preds, test_Y)
        eval_loss += loss
        
        
print ("Eval accuracy: {}".format(correct / len(amzn_eval_dataset)))
print ("Eval summed loss: {} | avg loss: {}".format(eval_loss, eval_loss / len(amzn_eval_dataset)))

Eval accuracy: 0.7792687017779268
Eval summed loss: 315.85833740234375 | avg loss: 0.008150552399456501


In [None]:
PATH = 'models/amzn_date4-15_batch32_epoch5.pt'
torch.save(attention_model.state_dict(), PATH)

In [73]:
np.sum(df.recommendation.tolist()) / len(df)

0.7620507421399227

In [None]:
# 77.8% eval acc -- batch=128, lr=0.0001
# 77.8% eval acc -- batch=128, lr=0.001
# 77.9% eval acc -- batch=64, lr=0.001, embed_dim=16, lstm_dim=16