In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
device

device(type='cuda')

In [3]:
path = '/content/drive/MyDrive/fake reviews dataset.csv'

df = pd.read_csv(path)
print(df.shape)
df.head()

(40432, 4)


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [4]:
y = df['label']

# 진짜면 1 가짜면 0
def to_one_zero(x):
    if x == 'CG':
      return 0
    elif x == 'OR':
      return 1
y = y.apply(to_one_zero)
# to tensor
y = torch.tensor(y.values)
y

tensor([0, 0, 0,  ..., 1, 0, 1])

In [5]:
# tokenize

from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text_'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review]# if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
for i in range(5):
    print(corpus[i])

love thi well made sturdi and veri comfort i love it veri pretti
love it a great upgrad from the origin i ve had mine for a coupl of year
thi pillow save my back i love the look and feel of thi pillow
miss inform on how to use it but it is a great product for the price i
veri nice set good qualiti we have had the set for two month now and have not been


In [7]:
# build vocab dictionary

import os
from collections import Counter

vocabs = [vocab for seq in corpus for vocab in seq.split()]
# vocabs = ['love', 'well', 'made', 'sturdi', 'comfort', 'love', 'pretti', 'love', 'great', ... ]  (corpus[:5] 했을때 기준)

# Count words in the whole Corpus
vocab_count = Counter(vocabs)
# vocab_count = Counter({'love': 4, 'great': 2, 'pillow': 2, 'set': 2, 'well': 1, 'made': 1, ... ] (corpus[:5] 했을때 기준)

# ordering words by their counts
vocab_count = vocab_count.most_common(len(vocab_count))

vocab_to_int = {word : index+2 for index, (word, count) in enumerate(vocab_count)}
vocab_to_int.update({'__PADDING__': 0}) # index 0 for padding
vocab_to_int.update({'__UNKNOWN__': 1}) # index 1 for unknown word such as broken character

# 가장 빈도수 높은 단어(어근)순
print(vocab_to_int)



In [24]:
from torch.autograd import Variable

# Tokenize & Vectorize sequences
vectorized_seqs = []
for seq in corpus: 
  vectorized_seqs.append([vocab_to_int.get(word,1) for word in seq.split()])

# Save the lengths of sequences
seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))

# Add padding(0)
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
  seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

seq_lengths[seq_lengths==0] = seq_lengths[seq_lengths==0] + 1
# sequence = vectorized list of words in review
# sequence의 개수 = 리뷰의 갯수
print(seq_lengths.max())
print(seq_lengths.min())
print(seq_tensor[0])
print(seq_lengths[0])

tensor(496)
tensor(1)
tensor([ 22,   9,  38, 107, 226,   5,  18, 130,   3,  22,   6,  18, 172,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,

In [9]:
# Build Data loader

In [10]:
# 시퀸스 수가 각각 다르기 때문에 custom data loader가 필요, 쓸데없는거 학습시키지 않기 위함, 0 padding 같은거

In [11]:
import torch.utils.data.sampler as splr # for iterable data loader

class CustomDataLoader(object):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)

In [12]:
# train, validation, test data loaders

In [13]:
# shuffle data
shuffled_idx = torch.randperm(y.shape[0])

seq_tensor = seq_tensor[shuffled_idx]
seq_lenghts = seq_lengths[shuffled_idx]
label = y[shuffled_idx]

# divide data into 3 sets
PCT_TRAIN = 0.7 # 70% of data will be train set 
PCT_VALID = 0.2 # 20% of data will be validation set
# The rest of data will be test set

length = len(label)
train_seq_tensor = seq_tensor[:int(length*PCT_TRAIN)] 
train_seq_lengths = seq_lengths[:int(length*PCT_TRAIN)]
train_label = label[:int(length*PCT_TRAIN)]

valid_seq_tensor = seq_tensor[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_seq_lengths = seq_lengths[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_label = label[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))]

test_seq_tensor = seq_tensor[int(length*(PCT_TRAIN+PCT_VALID)):]
test_seq_lengths = seq_lengths[int(length*(PCT_TRAIN+PCT_VALID)):]
test_label = label[int(length*(PCT_TRAIN+PCT_VALID)):]

print(train_seq_tensor.shape)
print(valid_seq_tensor.shape)
print(test_seq_tensor.shape)

# Instantiate data loaders
batch_size = 80
train_loader = CustomDataLoader(train_seq_tensor, train_seq_lengths, train_label, batch_size)
valid_loader = CustomDataLoader(valid_seq_tensor, valid_seq_lengths, valid_label, batch_size)
test_loader = CustomDataLoader(test_seq_tensor, test_seq_lengths, test_label, batch_size)

torch.Size([28302, 496])
torch.Size([8086, 496])
torch.Size([4044, 496])


In [14]:
# Model

# 1. Embedding
# 2. Pack the sequences (get rid of paddings)
# 3. LSTM
# 4. Unpack the sequences (recover paddings)
# 5. Fully Connected Layer
# 6. Sigmoid Activation

In [15]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class net_LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, n_layers,\
                 drop_lstm=0.1, drop_out = 0.1):

        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_lstm, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_out)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, seq_lengths):

        # embeddings
        embedded_seq_tensor = self.embedding(x)
                
        # pack, remove pads
        packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
        
        # lstm
        packed_output, (ht, ct) = self.lstm(packed_input, None)
          # If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero

        # unpack, recover padded sequence
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
       
        # collect the last output in each batch
        last_idxs = (input_sizes - 1).to(device) # last_idxs = input_sizes - torch.ones_like(input_sizes)
        output = torch.gather(output, 1, last_idxs.view(-1, 1).unsqueeze(2).repeat(1, 1, self.hidden_dim)).squeeze() # [batch_size, hidden_dim]
        
        # dropout and fully-connected layer
        output = self.dropout(output)
        output = self.fc(output).squeeze()
               
        # sigmoid function
        output = self.sig(output)
        
        return output

In [16]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_to_int)
embedding_dim = 100 # int(vocab_size ** 0.25) # 15
hidden_dim = 15
output_size = 1
n_layers = 2
net = net_LSTM(vocab_size, embedding_dim, hidden_dim, output_size, n_layers, \
                 0.2, 0.2)
net = net.to(device)
print(net)

net_LSTM(
  (embedding): Embedding(26441, 100)
  (lstm): LSTM(100, 15, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [17]:
# loss and optimization functions
criterion = nn.BCELoss()

lr=0.03
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode = 'min', 
                                                      factor = 0.5,
                                                      patience = 2)

In [18]:
# Train & Validate

In [19]:
import numpy as np

# training params

epochs = 6 

counter = 0
print_every = 10
clip=5 # gradient clipping


net.train()
# train for some number of epochs
val_losses = []
for e in range(epochs):
  
    scheduler.step(e)

    for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
        counter += 1
               
        seq_tensor = seq_tensor.to(device)
        seq_tensor_lengths = seq_tensor_lengths.to(device)
        label = label.to(device)
 
        # get the output from the model
        output = net(seq_tensor, seq_tensor_lengths)
    
        # get the loss and backprop
        loss = criterion(output, label.float())
        optimizer.zero_grad() 
        loss.backward()
        
        # prevent the exploding gradient
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            
            val_losses_in_itr = []
            sums = []
            sizes = []
            
            net.eval()
            
            for seq_tensor, seq_tensor_lengths, label in iter(valid_loader):

                seq_tensor = seq_tensor.to(device)
                seq_tensor_lengths = seq_tensor_lengths.to(device)
                label = label.to(device)
                output = net(seq_tensor, seq_tensor_lengths)
                
                # losses
                val_loss = criterion(output, label.float())     
                val_losses_in_itr.append(val_loss.item())
                
                # accuracy
                binary_output = (output >= 0.5).short() # short(): torch.int16
                right_or_not = torch.eq(binary_output, label)
                sums.append(torch.sum(right_or_not).float().item())
                sizes.append(right_or_not.shape[0])
            
            accuracy = sum(sums) / sum(sizes)
            
            net.train()
            print("Epoch: {:2d}/{:2d}\t".format(e+1, epochs),
                  "Steps: {:3d}\t".format(counter),
                  "Loss: {:.6f}\t".format(loss.item()),
                  "Val Loss: {:.6f}\t".format(np.mean(val_losses_in_itr)),
                  "Accuracy: {:.3f}".format(accuracy))

Epoch:  1/ 6	 Steps:  10	 Loss: 0.672791	 Val Loss: 0.684251	 Accuracy: 0.567
Epoch:  1/ 6	 Steps:  20	 Loss: 0.698805	 Val Loss: 0.655038	 Accuracy: 0.604
Epoch:  1/ 6	 Steps:  30	 Loss: 0.635641	 Val Loss: 0.652692	 Accuracy: 0.623
Epoch:  1/ 6	 Steps:  40	 Loss: 0.637525	 Val Loss: 0.635390	 Accuracy: 0.638
Epoch:  1/ 6	 Steps:  50	 Loss: 0.551635	 Val Loss: 0.594987	 Accuracy: 0.692
Epoch:  1/ 6	 Steps:  60	 Loss: 0.536466	 Val Loss: 0.531017	 Accuracy: 0.753
Epoch:  1/ 6	 Steps:  70	 Loss: 0.582798	 Val Loss: 0.536601	 Accuracy: 0.746
Epoch:  1/ 6	 Steps:  80	 Loss: 0.709269	 Val Loss: 0.597238	 Accuracy: 0.707
Epoch:  1/ 6	 Steps:  90	 Loss: 0.432677	 Val Loss: 0.500793	 Accuracy: 0.782
Epoch:  1/ 6	 Steps: 100	 Loss: 0.440712	 Val Loss: 0.466917	 Accuracy: 0.794
Epoch:  1/ 6	 Steps: 110	 Loss: 0.548889	 Val Loss: 0.470999	 Accuracy: 0.797
Epoch:  1/ 6	 Steps: 120	 Loss: 0.574950	 Val Loss: 0.561690	 Accuracy: 0.716
Epoch:  1/ 6	 Steps: 130	 Loss: 0.536581	 Val Loss: 0.488550	 Ac

In [20]:
# Test

In [23]:
test_losses = []
sums = []
sizes = []

net.eval()

test_losses = []

for seq_tensor, seq_tensor_lengths, label in iter(test_loader):

    seq_tensor = seq_tensor.to(device)
    seq_tensor_lengths = seq_tensor_lengths.to(device)
    label = label.to(device)
    output = net(seq_tensor, seq_tensor_lengths)

    # losses
    test_loss = criterion(output, label.float())     
    test_losses.append(test_loss.item())

    # accuracy
    binary_output = (output >= 0.5).short() # short(): torch.int16
    right_or_not = torch.eq(binary_output, label)
    sums.append(torch.sum(right_or_not).float().item())
    sizes.append(right_or_not.shape[0])

accuracy = np.sum(sums) / np.sum(sizes)
print("Test Loss: {:.6f}\t".format(np.mean(test_losses)),
      "Accuracy: {:.3f}".format(accuracy))

Test Loss: 0.264796	 Accuracy: 0.895
