In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data
data = pd.read_csv("1429_1.csv")
data.columns = ['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer',
       'reviews_date', 'reviews_dateAdded', 'reviews_dateSeen',
       'reviews_didPurchase', 'reviews_doRecommend', 'reviews_id',
       'reviews_numHelpful', 'reviews_rating', 'reviews_sourceURLs',
       'reviews_text', 'reviews_title', 'reviews_userCity',
       'reviews_userProvince', 'reviews_username']
data = data.dropna(subset=["reviews_text"])
reviews = data["reviews_text"].tolist()
reviews = [i.lower() for i in reviews]

In [3]:
# BERTokenizer to transform the reivews
from transformers import BertTokenizer
BT = BertTokenizer.from_pretrained('bert-base-uncased')

newData = []
for rev in reviews:
    newData.append(BT.convert_tokens_to_ids(rev.split()))

I1212 01:00:43.796275 4397680064 file_utils.py:39] PyTorch version 1.3.0.post2 available.
I1212 01:00:48.509127 4397680064 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/xianglu/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [4]:
# padding function to make the tokenizer same length
def padding(text, length):
    
    res = []
    
    for review in text:
        tlength = len(review)
        diff = length - tlength
        if diff == 0.0:
            res.append(review)
        if diff < 0.0:
            res.append(review[:length])
        if diff > 0.0:
            pad = list(np.zeros(diff))
            res.append(pad + review)
        
    res = np.asarray(res,dtype=int)    
    return res

In [5]:
# generate the sentiment for review ratings.
def sentiment_generation(rating):
    if rating>4.0:
        return "Positive"
    elif rating<3.0:
        return "Negative"
    else:
        return "Neutral"
    
data["Sentiment_generated"] = data.apply(lambda x: sentiment_generation(x.reviews_rating), axis=1)

In [6]:
# Create feature vectors
features2 = padding(newData, 30) 

## Sentiment label
labels = data["Sentiment_generated"].tolist() 

## Encode label
encoded_labels = [1.0 if label =='Positive' else 0.0 for label in labels]
encoded_labels = np.array(encoded_labels)

In [7]:
#Split the data into training, validation and testing set
import numpy as np
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(features2, encoded_labels, test_size=0.2, random_state=1)
test_x, val_x, test_y, val_y = train_test_split(test_x, test_y,test_size=0.5, random_state=1)

In [8]:
print("Length of training x: ", len(train_x))
print("Length of training y: ", len(train_y))
print("Length of testing x: ", len(test_x))
print("Length of testing y: ", len(test_y))
print("Length of eval x: ", len(val_x))
print("Length of eval y: ", len(val_y))

Length of training x:  27727
Length of training y:  27727
Length of testing x:  3466
Length of testing y:  3466
Length of eval x:  3466
Length of eval y:  3466


In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))


train_loader = DataLoader(train_data, shuffle=True, batch_size=30)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=30)
test_loader = DataLoader(test_data, shuffle=True, batch_size=30)

## Full DataSet
train_dataiter = iter(train_loader)
test_dataiter = iter(test_loader)
validate_dataiter = iter(valid_loader)

## Smaller DataSet

# Tensor datasets
train_data_v = TensorDataset(torch.from_numpy(train_x[:800]), torch.from_numpy(train_y[:800]))
valid_data_v = TensorDataset(torch.from_numpy(val_x[:100]), torch.from_numpy(val_y[:100]))
test_data_v = TensorDataset(torch.from_numpy(test_x[:100]), torch.from_numpy(test_y[:100]))

train_loader_v = DataLoader(train_data_v, shuffle=True, batch_size=30)
valid_loader_v = DataLoader(valid_data_v, shuffle=True, batch_size=30)
test_loader_v = DataLoader(test_data_v, shuffle=True, batch_size=30)

In [10]:

from transformers import BertTokenizer, BertModel

Bmodel = BertModel.from_pretrained('bert-base-uncased')

I1212 01:00:51.684047 4397680064 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/xianglu/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1212 01:00:51.687937 4397680064 configuration_utils.py:169] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I121

In [11]:
# Multi-layer Bidirectional GRU Model
import torch.nn as nn

class BERT_GRU(nn.Module):
    
    def __init__(self,hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()
        
        # Multi-layer 
        self.rnn = nn.GRU(768, hidden_dim, num_layers = n_layers, bidirectional = True, batch_first = True)

        self.FC = nn.Linear(hidden_dim + hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
       
        self.relu = nn.ReLU()
        
        self.sig = nn.Sigmoid()
        
        self.hidden_dim = hidden_dim
        
        self.output_dim = output_dim
        
    def forward(self, x):
                
        
        out, hidden = self.rnn(Bmodel(x)[0])
        
        first = hidden[-1,:,:]
        
        # Set the dimensions for bidirectional modification
        to_cat = torch.cat((first,first), dim = 1)
        hidden = self.dropout(to_cat)
                
        output = self.FC(hidden)
        
        return output

In [51]:
# Grid Search
hidden_dim = [32,64,128,256,512]
dropout = [0.1,0.15,0.2,0.25,0.3]


In [15]:
import torch.optim as optim

# Set optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()


In [16]:
def calculate(preds, y):
    
    Newpreds = torch.round(torch.sigmoid(preds))
    d = (Newpreds == y).float()  
    result = d.sum() / len(d)
    
    
    return result


In [55]:
# Grid Search to find the best model
k = 0
z = 0
for i in range(9):
    print("Hidden Dimension: ",hidden_dim[k],"    Dropout Rate:",dropout[z])
   
    model = BERT_GRU(hidden_dim[k], 1, 2, dropout[z])
    if k == z:
        k += 1
    else:
        z +=1
    tr = iter(train_loader_v)
    te = iter(test_loader_v)
    v = iter(valid_loader_v)
    
    
    ##########################################################################################
    
    Tloss = 0
    Taccuracy = 0
    
    model.train()
    
    # iterations
    for i,j in tr:
        
        optimizer.zero_grad()
     
        predictions = model(i).squeeze(1)
        
        loss = criterion(predictions, j)
        
        accuracy = calculate(predictions, j)

        loss.backward()
        
        optimizer.step()   
        
        Tloss += loss.item()
        Taccuracy += accuracy.item()

    value1 = Tloss / len(tr)
    value2 = Taccuracy / len(tr)
    
    ##########################################################################################
    
    Tloss = 0
    Taccuracy = 0
    
    model.eval()
 
    # iterations
    with torch.no_grad():
        for i,j in v:
        
            predictions = model(i).squeeze(1)
            
            loss = criterion(predictions, j)
            
            accuracy = calculate(predictions, j)
            
            Tloss += loss.item()
            Taccuracy += accuracy.item()
            
    valid1 = Tloss / len(v)
    valid2 = Taccuracy / len(v)       
    
    
    #########################################################################################
      
    print("Train Loss: ",value1,"     Train Accuracy:",value2)
 
    
    print("Valid Loss: ",valid1,"     Valid Accuracy:",valid2)

    print('\t')

Hidden Dimension:  32     Dropout Rate: 0.1
Train Loss:  0.7549700871265183      Train Accuracy: 0.27500000409781933
Valid Loss:  0.7175967765040696      Valid Accuracy: 0.3583333417773247
	
Hidden Dimension:  64     Dropout Rate: 0.1
Train Loss:  0.6248903599878153      Train Accuracy: 0.7749999910593033
Valid Loss:  0.6608168303966522      Valid Accuracy: 0.6916666626930237
	
Hidden Dimension:  64     Dropout Rate: 0.15
Train Loss:  0.7080828769132494      Train Accuracy: 0.45000001043081284
Valid Loss:  0.6926709280659755      Valid Accuracy: 0.5583333447575569
	
Hidden Dimension:  128     Dropout Rate: 0.15
Train Loss:  0.6355923667550087      Train Accuracy: 0.75
Valid Loss:  0.6508804433668653      Valid Accuracy: 0.6833333373069763
	
Hidden Dimension:  128     Dropout Rate: 0.2
Train Loss:  0.7345867486981055      Train Accuracy: 0.3500000052154064
Valid Loss:  0.7188207088038325      Valid Accuracy: 0.3500000089406967
	
Hidden Dimension:  256     Dropout Rate: 0.2
Train Loss:  

In [17]:
# Compare to single layer model
for i in range(1):
    print("Number of Layer: 1" )
   
    model = BERT_GRU( 64, 1, 1, 0.1)
    
    tr = iter(train_loader_v)
    te = iter(test_loader_v)
    v = iter(valid_loader_v)
    
    
    ##########################################################################################
    
    Tloss = 0
    Taccuracy = 0
    
    model.train()
    
    # iterations
    for i,j in tr:
        
        optimizer.zero_grad()
     
        predictions = model(i).squeeze(1)
        
        loss = criterion(predictions, j)
        
        accuracy = calculate(predictions, j)

        loss.backward()
        
        optimizer.step()   
        
        Tloss += loss.item()
        Taccuracy += accuracy.item()

    value1 = Tloss / len(tr)
    value2 = Taccuracy / len(tr)
    
    ##########################################################################################
    
    Tloss = 0
    Taccuracy = 0
    
    model.eval()
 
    # iterations
    with torch.no_grad():
        for i,j in v:
        
            predictions = model(i).squeeze(1)
            
            loss = criterion(predictions, j)
            
            accuracy = calculate(predictions, j)
            
            Tloss += loss.item()
            Taccuracy += accuracy.item()
            
    valid1 = Tloss / len(v)
    valid2 = Taccuracy / len(v)       
    
    
    #########################################################################################
      
    print("Train Loss: ",value1,"     Train Accuracy:",value2)
 
    
    print("Valid Loss: ",valid1,"     Valid Accuracy:",valid2)

    print('\t')

Number of Layer: 1
Train Loss:  0.6868575837187194      Train Accuracy: 0.5641975391794134
Valid Loss:  0.6925353186825911      Valid Accuracy: 0.5166666731238365
	


In [82]:
model = BERT_GRU( 64, 1, 2, 0.1)

In [142]:
# Model Evaulation and Analysis
v = iter(test_loader_v)

total_true = 0 
true_positive = 0 
pred_positive = 0 
act_positive = 0 
false_positive = 0 
false_negative = 0 


for batch,batch2 in v:
    

    predictions = model(batch).squeeze(1)
    
    rounded_preds = torch.round(torch.sigmoid(predictions))
      
    ## total number of correct classifications
    total_true += (rounded_preds == batch2).float().sum()
    
    ## correctly classify positive sentiments
    true_positive += (rounded_preds * batch2 == 1).float().sum()
    
    ## all positive sentiments based on predictions
    pred_positive += (rounded_preds == 1).float().sum()
    
    ## actual number of positive sentiments
    act_positive += (batch2 == 1).float().sum()
    
    ## false positive: actually 0, but get 1
    false_positive += (rounded_preds - batch2 == 1).float().sum()
    
    ## false negative: actually 1, but get 0 
    false_negative += (batch2 - rounded_preds == 1).float().sum()

    
precision = true_positive/(true_positive+false_positive)
recall = true_positive/(true_positive+false_negative)

In [143]:
precision

tensor(0.6714)

In [144]:
recall

tensor(0.6912)