In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm.notebook import tqdm
import sys

# gensim for pretrained embedding
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile


# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils import data
from torch.autograd import Variable

# torchtext
import torchtext.vocab as vocab


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
print (torch.cuda.is_available())
print (torch.cuda.current_device())
print (torch.cuda.get_device_name(0))
print (torch.cuda.memory_allocated())
print (torch.cuda.memory_cached())

True
0
GeForce GTX 1060 with Max-Q Design
0
0


In [3]:
df = pd.read_csv("data/cleaned_amzn_data_4-15_10Kwords.csv", encoding='utf8', index_col=0)

In [4]:
drop_cols = ['review', 'cleaned_reviews']

try:
    df.drop(drop_cols, axis=1, inplace=True)
except:
    print ("Probably dropped already")
df = df.rename(columns={'overall': 'recommendation'})
df.head()

Unnamed: 0,recommendation,encoded_1,encoded_2,encoded_3,encoded_4,encoded_5,encoded_6,encoded_7,encoded_8,encoded_9,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,0,0,0,0,0,0,0,0,0,0,...,4059,9289,8594,9289,4934,7474,3382,652,2097,2876
1,1,0,0,0,0,0,0,0,0,0,...,3340,8561,9289,214,5126,6257,2827,6823,1256,8798
2,0,0,0,0,0,0,0,0,0,0,...,1745,5242,506,2434,7599,8764,5242,7146,6949,3506
4,1,0,0,0,0,0,0,0,0,0,...,7514,5853,5815,9606,595,8561,243,2076,2734,9289
5,1,0,0,0,0,0,0,0,0,0,...,8375,3595,1356,2298,8561,7502,2298,1329,6555,6758


# Load pretrained embedding

In [5]:
# google word2vec embedding #
# embed_path = 'data/GoogleNews-vectors-negative300.bin.gz'
# word2vec = KeyedVectors.load_word2vec_format(embed_path, binary=True)
# weights = word2vec.wv.vectors
# weights


# glove embedding #
glove_input_file = 'data/glove.6b/glove.6B.100d.txt'
word2vec_output_file = 'data/glove_to_word2vec.txt'

try:
    pretrained_embedding = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
except:
    print ("Converting word2vec file. If this fails, please download the glove.6b.100d file")
    glove2word2vec(glove_input_file, word2vec_output_file)

In [6]:
weights = pretrained_embedding.wv.vectors
pretrained_embedding.wv.vectors.shape

  """Entry point for launching an IPython kernel.
  


(400000, 100)

In [7]:
MAX_SEQ_LEN = len(df.columns.tolist())-1
VOCAB_SIZE = 14845 # 10746 - but need to use max(amzn_vocab, steam_vocab)
EMBED_DIM = 100
LSTM_DIM = 64

In [38]:
# only need 2-3 lines for attention
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
    
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim 
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [8]:
# build pytorch model
DROPOUT = 0.1
BATCH_SIZE = 128

class Attention_Net(nn.Module):
    def __init__(self):
        super(Attention_Net, self).__init__()
        
        # define architecture
        # self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM) # add pretrained embeding
        weights_ = Variable(torch.from_numpy(weights))
        print (weights_.size())
        self.embedding = nn.Embedding.from_pretrained(weights_)
    
        self.lstm = nn.LSTM(EMBED_DIM, 
                            LSTM_DIM, 
                            bidirectional=True,
                            dropout=0.2,
                            batch_first=True)
        
        # attention layer
#         self.attention_layer = Attention(LSTM_DIM * 2, MAX_SEQ_LEN)
        # try tanh

        self.linear = nn.Linear(LSTM_DIM*2, 2)
        
    def forward(self, x):
        embedding = self.embedding(x)
        embedding = torch.squeeze(torch.unsqueeze(embedding, 0)).view(BATCH_SIZE, MAX_SEQ_LEN, -1)
        lstm_out, (hidden, cell) = self.lstm(embedding)
#         attention = self.attention_layer(lstm_out)
        
        out = self.linear(lstm_out[:, -1, :])
        return out

In [9]:
class AmznDataset(data.Dataset):
    def __init__(self, data):
        #'Initialization'
        self.data = data
        text_cols = [x for x in df.columns.tolist() if x.startswith("encoded")]
        self.train = torch.tensor(data[text_cols].values).type(torch.LongTensor).cuda()
        labels = data['recommendation'].tolist()
        
        self.one_hot_labels = torch.tensor(np.array(labels)).squeeze().type(torch.LongTensor).cuda() # change to longtensor if using custom loss
        

    def __len__(self):
        #'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        #'Generates one sample of data'
        
        # Load data and get label
        X = self.train[index]
        Y = self.one_hot_labels[index]
        return X, Y

In [10]:
train_num = int(0.8 * len(df))
amzn_dataset = AmznDataset(df[:train_num])
amzn_data_loader = data.DataLoader(amzn_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True, shuffle=True)
amzn_data_loader

<torch.utils.data.dataloader.DataLoader at 0x2568283cd88>

In [11]:
attention_model = Attention_Net().cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(attention_model.parameters(), lr=0.0001) # even lower for transfer learning

torch.Size([400000, 100])


  "num_layers={}".format(dropout, num_layers))


In [12]:
# training loop
EPOCHS = 10
start = time.time()

for i in range(EPOCHS):
    second_start = time.time()
    running_loss = 0
    correct = 0
    attention_model.train()
    
    with tqdm(total=len(amzn_data_loader), file=sys.stdout) as pbar:
        for idx, (train_X, train_Y) in enumerate(amzn_data_loader):
            
            optimizer.zero_grad()

            pred_y = attention_model(train_X) 
            loss = loss_function(pred_y, train_Y)
            loss.backward()
            optimizer.step()
            running_loss += loss

            # calc accuracy
            pred1_mask = pred_y[:, 1] > 0.5
            masked_trainY_1 = train_Y[pred1_mask]
            masked_trainY_0 = train_Y[~pred1_mask]
            ones_predicted_correct = torch.sum(masked_trainY_1)
            zeros_predicted_correct = torch.sum(masked_trainY_0)
            correct += ones_predicted_correct.add(zeros_predicted_correct)
            correct_ = correct.cpu().numpy()
            
            # update progress bar
            pbar.set_description('ep{} | loss: {} | acc: {}%'.format(i+1, torch.round(running_loss), round(correct_ / ((idx+1) * BATCH_SIZE)*100, 1)))
            pbar.update(1)
            tqdm._instances.clear()
            
            

    print ('Epoch {} | took {} seconds | summed loss: {} | avg loss: {}'
                   .format(i+1, time.time() - second_start, running_loss, running_loss / (len(amzn_data_loader) * BATCH_SIZE)))

print ("Took {} seconds".format(time.time() - start))

HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 1 | took 28.01642632484436 seconds | summed loss: 657.1602172851562 | avg loss: 0.004239524714648724


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 2 | took 27.78093409538269 seconds | summed loss: 606.8274536132812 | avg loss: 0.003914813976734877


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 3 | took 27.57372522354126 seconds | summed loss: 576.8763427734375 | avg loss: 0.0037215908523648977


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 4 | took 28.220147609710693 seconds | summed loss: 551.7191772460938 | avg loss: 0.003559294855222106


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 5 | took 28.911523818969727 seconds | summed loss: 531.7894897460938 | avg loss: 0.0034307229798287153


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 6 | took 29.00606060028076 seconds | summed loss: 516.321044921875 | avg loss: 0.003330931765958667


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 7 | took 29.039900302886963 seconds | summed loss: 503.4665222167969 | avg loss: 0.0032480035442858934


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 8 | took 29.103739023208618 seconds | summed loss: 492.67901611328125 | avg loss: 0.003178410232067108


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 9 | took 29.23008894920349 seconds | summed loss: 484.40185546875 | avg loss: 0.0031250121537595987


HBox(children=(FloatProgress(value=0.0, max=1211.0), HTML(value='')))


Epoch 10 | took 29.237423181533813 seconds | summed loss: 476.58685302734375 | avg loss: 0.0030745952390134335
Took 286.12692427635193 seconds


In [13]:
print (attention_model)

Attention_Net(
  (embedding): Embedding(400000, 100)
  (lstm): LSTM(100, 64, batch_first=True, dropout=0.2, bidirectional=True)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)


In [14]:
amzn_eval_dataset = AmznDataset(df[train_num:])
amzn_eval_data_loader = data.DataLoader(amzn_eval_dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

In [15]:
# evaluate
correct = 0
eval_loss = 0
attention_model.eval()
with torch.no_grad():
    for i, (test_X, test_Y) in enumerate(amzn_eval_data_loader):
        preds = attention_model(test_X).squeeze()
        preds = torch.nn.functional.softmax(preds)
        for idx, each_pred in enumerate(preds):
            if each_pred[0] >= 0.5 and test_Y[idx] == 0:
                correct += 1
            elif each_pred[0] < 0.5 and test_Y[idx] == 1:
                correct += 1
        loss = loss_function(preds, test_Y)
        eval_loss += loss
        
        
print ("Eval accuracy: {}".format(correct / len(amzn_eval_dataset)))
print ("Eval summed loss: {} | avg loss: {}".format(eval_loss, eval_loss / len(amzn_eval_dataset)))

  


Eval accuracy: 0.833922535029546
Eval summed loss: 148.5374755859375 | avg loss: 0.003832928603515029


In [16]:
PATH = 'models/amzn_date4-16_batch128_epoch10_acc83_lstm64_pretrainedembedding_noattention.pt'
torch.save(attention_model.state_dict(), PATH)

In [73]:
np.sum(df.recommendation.tolist()) / len(df)

0.7620507421399227

In [None]:
# 77.8% eval acc -- batch=128, lr=0.0001
# 77.8% eval acc -- batch=128, lr=0.001
# 77.9% eval acc -- batch=64, lr=0.001, embed_dim=16, lstm_dim=16
# 85.9% eval acc -- batch=128, lr=0.0001, embed=128, lstm_dim=64
# 86.1% eval acc -- batch=128, lr=0.0001, embed=128, lstm_dim=64, + attention

# scratchpaper

In [15]:
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)

input

tensor([-1.5980,  0.9168,  0.8958], requires_grad=True)

In [16]:
target

tensor([1., 0., 1.])