In [1]:
import json


# Load the review dump (one JSON object per line) and keep only the fields
# listed in `keys`. Records that lack any of those fields are dropped.
reviews = []
keys = ['reviewerID', 'reviewerName', 'asin', 'reviewText', 'overall']
total_lines = 0

# Stream the file instead of calling readlines(): the raw lines no longer have
# to sit in memory next to the parsed dicts. The encoding is spelled out so the
# result does not depend on the platform's default codec.
with open("data/Movies_and_TV_5.json", encoding="utf-8") as f:
    for line in f:
        total_lines += 1
        record = json.loads(line)
        try:
            reviews.append({x: record[x] for x in keys})
        except KeyError:
            # At least one required field is missing -> not a useful review.
            continue

print("original reviews count: {}".format(total_lines))
print("useful reviews count: {}".format(len(reviews)))
print(reviews[:5])

original reviews count: 3410019
useful reviews count: 3408356
[{'reviewerID': 'A2M1CU2IRZG0K9', 'reviewerName': 'Terri', 'asin': '0005089549', 'reviewText': "So sorry I didn't purchase this years ago when it first came out!!  This is very good and entertaining!  We absolutely loved it and anticipate seeing it repeatedly.  We actually wore out the cassette years back, so we also purchased this same product on cd.  Best purchase we made out of all!  Would purchase on dvd if we could find one.", 'overall': 5.0}, {'reviewerID': 'AFTUJYISOFHY6', 'reviewerName': 'Melissa D. Abercrombie', 'asin': '0005089549', 'reviewText': 'Believe me when I tell you that you will receive a blessing watching this video of the Cathedral Quartet.  They bring back most of the singers that were ever in their group and it is a really great VHS.', 'overall': 5.0}, {'reviewerID': 'A3JVF9Y53BEOGC', 'reviewerName': 'Anthony Thompson', 'asin': '000503860X', 'reviewText': "I have seen X live many times, both in the ear

In [2]:
import pandas as pd


# Turn the list of filtered review dicts into a DataFrame: one row per review,
# one column per retained field.
df = pd.DataFrame(reviews)
# Plain-text dump of the frame as a quick sanity check of the load step.
print(df)

             reviewerID            reviewerName        asin  \
0        A2M1CU2IRZG0K9                   Terri  0005089549   
1         AFTUJYISOFHY6  Melissa D. Abercrombie  0005089549   
2        A3JVF9Y53BEOGC        Anthony Thompson  000503860X   
3        A12VPEOEZS1KTC                JadeRain  000503860X   
4         ATLZNVLYKP9AZ               T. Fisher  000503860X   
...                 ...                     ...         ...   
3408351  A2MFARH7H7FSY1             Ali A. Diba  B01HJ1INB0   
3408352   AY532XZ00UTKN         Amazon Customer  B01HJ1INB0   
3408353  A2VWQXWF1GO2ZE              Noam Eitan  B01HJ1INB0   
3408354  A22AB77OD50U4M         Amazon Customer  B01HJ1INB0   
3408355   ATGEMV4Z0GPOH                   Peter  B01HJ1INB0   

                                                reviewText  overall  
0        So sorry I didn't purchase this years ago when...      5.0  
1        Believe me when I tell you that you will recei...      5.0  
2        I have seen X live many 

In [10]:
# Dataset-level statistics: number of distinct reviewer ids, reviewer names and
# products (asin), plus the average number of reviews per reviewer / product.
total_reviews = len(df)

reviewer_id_count = df['reviewerID'].nunique()
reviewer_name_count = df['reviewerName'].nunique()
asin_count = df['asin'].nunique()
review_per_reviewer = total_reviews / reviewer_id_count
review_per_asin = total_reviews / asin_count

print("reviewerID unique count: {}".format(reviewer_id_count))
print("reviewerName unique count: {}".format(reviewer_name_count))
print("asin unique count: {}".format(asin_count))
print("reviews per reviewerID: {}".format(review_per_reviewer))
print("reviews per asin: {}".format(review_per_asin))

# Reviews rated 3.0 or lower.
low_rating_mask = df['overall'] <= 3.0
neg_reviews = df[low_rating_mask]
print(neg_reviews)

reviewerID unique count: 297483
reviewerName unique count: 234659
asin unique count: 60175
reviews per reviewerID: 11.457313527159535
reviews per asin: 56.640731200664725
             reviewerID         reviewerName        asin  \
8        A3139J3877Y61F     SingingButterfly  0005419263   
16       A38KRRY00H5TEY                Trish  0005092663   
26       A17TPT3FWAE5T1        Matthew Spady  0005019281   
39       A101IGU6UDKW3X             DorothyZ  0005019281   
45       A340KTL9KUGYB7        Mike Donnelly  0005019281   
...                 ...                  ...         ...   
3408305   A2K3ZJWLF0V4K                  Rae  B01HH2QNY0   
3408307  A1MMM9GD3YUFSN         Kelly Starks  B01HH2QNY0   
3408312  A1TO2Q0I8LNSAR  KC from the Midwest  B01HH20HHE   
3408314  A1OIBTGGI47NCO                   SD  B01HH7KC60   
3408339  A1BTBXPOUHQA9H          christina88  B01HIQ3TO2   

                                                reviewText  overall  
8                  Good songs. The DVD

In [5]:
# the parameters below should be the same as in imdb_lstm.ipynb
import numpy as np

# Vocabulary size and the fixed sequence length used for every review.
n_vocab = 5000
max_review_length = 200

# Keep only the first `n_vocab` entries of the vocabulary file (one word per line).
with open("data/aclimdb/imdb.vocab", 'r') as f:
    vocab_lines = f.read().splitlines()
vocab = vocab_lines[:n_vocab]

# Word ids start at 1; id 0 is left free and is what pad_text uses as filler.
vocab_to_int = {}
for position, token in enumerate(vocab, start=1):
    vocab_to_int[token] = position

# Reverse lookup: id -> word.
int_to_vocab = {index: token for token, index in vocab_to_int.items()}

def get_encode(word):
    """Look up ``word`` in ``vocab_to_int``.

    Returns the word's integer id when it is in the vocabulary, and ``np.nan``
    as an "unknown word" marker otherwise.
    """
    return vocab_to_int.get(word, np.nan)

# NOTE(review): `preprocessed_x` is not defined in any cell visible in this
# notebook view -- presumably it came from a removed or out-of-order cell.
# Confirm where it is built, otherwise Restart & Run All stops here.
#
# Map each tokenised review to vocabulary ids, discard out-of-vocabulary words
# (marked with NaN by get_encode) and keep at most `max_review_length` ids.
encoded_x = [
    [code for code in map(get_encode, tokens) if not np.isnan(code)][:max_review_length]
    for tokens in preprocessed_x
]
print(encoded_x[0][:10])

def pad_text(encoded_reviews, seq_length):
    """Force every encoded review to exactly ``seq_length`` entries.

    Args:
        encoded_reviews: iterable of lists of integer word ids.
        seq_length: target length of every row.

    Returns:
        ``np.ndarray`` built from the resized rows. Rows that are too long keep
        their first ``seq_length`` ids; rows that are too short get zeros
        prepended.
    """
    def _fit(ids):
        shortfall = seq_length - len(ids)
        if shortfall > 0:
            return [0] * shortfall + ids
        return ids[:seq_length]

    return np.array([_fit(ids) for ids in encoded_reviews])


# Bring every encoded review to max_review_length ids.
padded_x = pad_text(encoded_x, max_review_length)

# Binary target derived from the star rating: 1 when `overall` exceeds 3.8,
# 0 otherwise.
is_positive = df['overall'] > 3.8
y = np.where(is_positive, 1, 0)
print(y.shape)
print(y)

from torch.utils.data import TensorDataset, DataLoader
from torch import IntTensor
import torch


# Also read as a global by SentimentLstm.forward and by the evaluation loop.
batch_size = 1

# Token ids as int64 (what nn.Embedding is fed), labels as float32 (what
# BCELoss is fed).
features = torch.tensor(padded_x, dtype=torch.int64)
targets = torch.tensor(y, dtype=torch.float32)
data = TensorDataset(features, targets)

# shuffle=False keeps batches in DataFrame order; the evaluation loop walks
# `df` with a running row index and relies on that ordering.
loader = DataLoader(data, shuffle=False, batch_size=batch_size)

[769, 855, 4331, 151, 575, 366, 45, 49, 432, 412]
(3408356,)
[1 1 1 ... 1 1 1]


In [6]:
# the model should be the same as imdb_lstm.ipynb
from torch import nn

n_embedding = 200 # embedding vector size
n_hidden = 200    # size of the LSTM hidden state
n_layers = 1      # number of stacked LSTM layers
n_output = 1      # one sigmoid unit: probability of positive sentiment

p_drop = 0.5      # dropout probability

class SentimentLstm(nn.Module):
    """Embedding -> LSTM -> dropout -> linear -> sigmoid sentiment classifier.

    Layer sizes are read from the module-level hyper-parameters (``n_vocab``,
    ``n_embedding``, ``n_hidden``, ``n_layers``, ``n_output``, ``p_drop``).
    """

    def __init__(self):
        super().__init__()

        # n_vocab + 1 rows: word ids start at 1 and id 0 is the padding value.
        self.embedding = nn.Embedding(n_vocab+1, n_embedding)
        # nn.LSTM applies its `dropout` only between stacked layers, so with a
        # single layer a non-zero value does nothing except raise a warning.
        lstm_dropout = p_drop if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(n_embedding, n_hidden, n_layers, batch_first=True, dropout=lstm_dropout)
        self.dropout = nn.Dropout(p_drop)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Score a batch of encoded reviews.

        Args:
            inputs: integer tensor of word ids, batch dimension first.

        Returns:
            Tuple ``(outputs, hiddens)``: ``outputs`` holds one sigmoid score
            per review, taken from the last time step; ``hiddens`` is the
            state tuple returned by the LSTM.
        """
        embedded = self.embedding(inputs)
        outputs, hiddens = self.lstm(embedded)
        outputs = self.dropout(outputs)
        outputs = outputs.contiguous().view(-1, n_hidden)
        outputs = self.fc(outputs)
        outputs = self.sigmoid(outputs)
        # Use the real batch size of this call rather than the global
        # `batch_size`: a batch of any other size (e.g. the final partial one)
        # would otherwise be reshaped incorrectly or fail.
        outputs = outputs.view(inputs.size(0), -1)
        outputs = outputs[:, -1]

        return outputs, hiddens


In [8]:
import torch
import numpy as np

# Score every review with the pre-trained model, write one JSON line per review
# (original fields plus the predicted sentiment) and report loss / accuracy
# against the rating-derived labels.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.BCELoss()

# SECURITY NOTE: "lstm.model" is a pickled full model object; unpickling can
# execute arbitrary code, so only load a file you produced yourself.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which refuses full-model pickles -- pass
# weights_only=False there if loading fails.
# map_location + .to(device) keep the model on the same device as the inputs,
# wherever the checkpoint was saved.
model = torch.load("lstm.model", map_location=device)
model.to(device)
model.eval()

losses = []
num_correct = 0
index = 0  # row of `df` matching the next prediction (loader is unshuffled)

# no_grad: pure inference, so no autograd graph is needed.
with open("amazon.json", 'w') as f, torch.no_grad():
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, _ = model(inputs)

        # Flatten both to 1-D so BCELoss always sees matching shapes; a bare
        # squeeze() turns a batch of one into a 0-d tensor.
        output = output.view(-1)
        labels = labels.float().view(-1)

        loss = criterion(output, labels)
        losses.append(loss.item())

        preds = torch.round(output)
        num_correct += int(preds.eq(labels).sum().item())

        # Move predictions to the CPU before converting them to Python ints,
        # and iterate over the real batch length, not the global batch_size.
        for sentiment in preds.int().cpu().tolist():
            row = df.iloc[index]
            d = {'reviewerID': row['reviewerID'], 'reviewerName': row['reviewerName'], 'asin': row['asin'], 'reviewText': row['reviewText'], 'overall': row['overall'], 'sentiment': sentiment}
            f.write(json.dumps(d) + "\n")
            index += 1


print("Loss: {:.4f}".format(np.mean(losses)))
print("Accuracy: {:.2f}".format(num_correct/len(loader.dataset)))

Loss: 0.8927
Accuracy: 0.80
