In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
# Root folder of the review dataset, relative to the notebook's working directory.
dataset_path = 'vispamdetection_dataset'

# Review table; note the CSV sits inside a folder that is itself named "reviews.csv".
data = pd.read_csv(f'{dataset_path}/reviews.csv/reviews.csv')
# Vocabulary file; orient='index' makes every top-level JSON key a row label.
vocab = pd.read_json('vocab/vocab.json', orient='index')


In [4]:
# Preview the first rows of the review table to check which columns were loaded.
data.head()

Unnamed: 0,link,rating,comment,label,spam_label
0,https://shopee.vn/Ví-nam-đựng-Card-Feasty-da-n...,5,Tuyệt vời cho các e bé nghe đáng kể vs anh đi ...,1,3
1,https://shopee.vn/Ví-nam-đựng-Card-Feasty-da-n...,5,Chiến dịch tiêm vaccine Covid-19 toàn dân bắt ...,1,3
2,https://shopee.vn/Ví-nam-đựng-Card-Feasty-da-n...,5,Nsnwnwnxnwkkxmmxmsmwmsmsxnndnwnxnsnwnznn ncn1m...,1,3
3,https://shopee.vn/Ví-nam-đựng-Card-Feasty-da-n...,2,"Chất lượng sản phẩm ko như quảng cáo, kich thu...",0,0
4,https://shopee.vn/Ví-nam-đựng-Card-Feasty-da-n...,5,"Giao hàng nhanh. Shop đóng gói cẩn thận, thế n...",0,0


In [5]:
# Preview the vocabulary frame: tokens are the row labels, the single column 0 holds the ids.
vocab.head()

Unnamed: 0,0
a,1
A,2
a-ba-giua,3
a-ba-toa,4
a bàng,5


In [6]:
# Small five-row slice of the reviews, used for quick experiments in the cells below.
test_dataset = data.head(5)
# Plain {token: id} dictionary taken from the vocabulary frame's column labelled 0.
vocab_dict = vocab[0].to_dict()

In [8]:
def encode_sentence(s, vocab):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased and split on whitespace. Each resulting token
    is looked up in ``vocab``; tokens that are not present are encoded as 0.

    Args:
        s: Sentence to encode (a string).
        vocab: Mapping with a ``get`` method, from token to id.

    Returns:
        List with one id per whitespace-separated token, in sentence order.
    """
    token_ids = []
    for token in s.lower().split():
        token_ids.append(vocab.get(token, 0))
    return token_ids

# Sanity check: encode the first sample comment; ids of 0 mark tokens missing from the vocabulary.
encode_sentence(test_dataset['comment'][0], vocab_dict)

[67253,
 70515,
 12407,
 7842,
 24599,
 4375,
 42710,
 20367,
 31911,
 0,
 626,
 21898,
 1024,
 63864,
 68450,
 42883,
 21810,
 65213,
 38985,
 3535,
 132,
 24647,
 41248,
 45700,
 7842,
 2904,
 65128,
 38335,
 33418,
 9382,
 60104,
 21740,
 62693,
 64313,
 30495,
 25148,
 24154,
 34762,
 33418,
 59574,
 60165,
 16277,
 3133,
 27122,
 14268,
 66406,
 64653,
 27898,
 29585,
 30750,
 68339,
 28780]

In [12]:
class RNNModel(nn.Module):
    """Text classifier: embedding -> single-layer vanilla RNN -> linear head -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the RNN hidden state.
        output_size: Number of outputs produced by the linear head.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return sigmoid-activated scores computed from the final hidden state.

        Args:
            x: Integer token ids. The RNN is built with batch_first=True, so a
                batched input is presumably (batch, seq_len) -- TODO confirm
                against the data pipeline.

        Returns:
            Tensor of values in (0, 1); (batch, output_size) for batched input.
        """
        embedded = self.embedding(x)
        # The second element returned by nn.RNN is the final hidden state.
        final_hidden = self.rnn(embedded)[1]
        # Drop the leading num_layers axis (size 1 for this single-layer RNN).
        scores = self.fc(final_hidden.squeeze(0))
        return scores.sigmoid()

In [13]:
class LSTMModel(nn.Module):
    """Text classifier: embedding -> single-layer LSTM -> linear head -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden state.
        output_size: Number of outputs produced by the linear head.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return sigmoid-activated scores computed from the final hidden state.

        Args:
            x: Integer token ids. The LSTM is built with batch_first=True, so a
                batched input is presumably (batch, seq_len) -- TODO confirm
                against the data pipeline.

        Returns:
            Tensor of values in (0, 1); (batch, output_size) for batched input.
        """
        embedded = self.embedding(x)
        # nn.LSTM returns (outputs, (h_n, c_n)); only the final hidden state h_n is used.
        state = self.lstm(embedded)[1]
        final_hidden = state[0]
        # Drop the leading num_layers axis (size 1 for this single-layer LSTM).
        scores = self.fc(final_hidden.squeeze(0))
        return scores.sigmoid()

In [14]:
class GRUModel(nn.Module):
    """Text classifier: embedding -> single-layer GRU -> linear head -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the GRU hidden state.
        output_size: Number of outputs produced by the linear head.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return sigmoid-activated scores computed from the final hidden state.

        Args:
            x: Integer token ids. The GRU is built with batch_first=True, so a
                batched input is presumably (batch, seq_len) -- TODO confirm
                against the data pipeline.

        Returns:
            Tensor of values in (0, 1); (batch, output_size) for batched input.
        """
        embedded = self.embedding(x)
        # The second element returned by nn.GRU is the final hidden state.
        final_hidden = self.gru(embedded)[1]
        # Drop the leading num_layers axis (size 1 for this single-layer GRU).
        scores = self.fc(final_hidden.squeeze(0))
        return scores.sigmoid()

In [None]:
def _align_to_labels(outputs, labels):
    """Give model outputs the same shape as their labels before computing the loss.

    A bare ``outputs.squeeze()`` also removes the batch axis when a batch holds
    a single sample, turning a (1, 1) output into a 0-d tensor that no longer
    matches (1,)-shaped labels. Reshaping to the labels' shape keeps the batch
    axis. When the element counts differ, the previous ``squeeze()`` behaviour
    is kept so the criterion reports the mismatch as before.
    """
    if outputs.numel() == labels.numel():
        return outputs.reshape(labels.shape)
    return outputs.squeeze()


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Weights are updated only when ``optimizer`` is given; otherwise the model
    is put in eval mode and gradients are disabled. Batches are counted while
    iterating, so loaders without ``__len__`` work, and an empty loader yields
    ``nan`` instead of a division by zero.
    """
    training = optimizer is not None
    model.train(training)  # model.train(False) is equivalent to model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(_align_to_labels(outputs, labels), labels.float())
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            n_batches += 1
    return total_loss / n_batches if n_batches else float("nan")


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10, device="cpu"):
    """Train ``model`` and print the mean train/validation loss after every epoch.

    Args:
        model: Module to train; it is moved to ``device`` first.
        train_loader: Iterable of ``(inputs, labels)`` batches used for weight updates.
        val_loader: Iterable of ``(inputs, labels)`` batches used for evaluation only.
        criterion: Loss callable invoked as ``criterion(outputs, labels.float())``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of full passes over ``train_loader``.
        device: Device on which the model and the batches are placed.

    Returns:
        None. Progress is reported through ``print`` only.
    """
    model = model.to(device)
    for epoch in range(epochs):
        train_loss = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss = _run_epoch(model, val_loader, criterion, device)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

In [17]:
# Model dimensions shared by all three architectures.
# NOTE(review): the +1 reserves id 0, which encode_sentence returns for unknown tokens.
# This assumes vocabulary ids run contiguously from 1 to len(vocab_dict) -- confirm.
vocab_size = 1 + len(vocab_dict)
embed_dim = 100   # width of each word embedding
hidden_dim = 128  # width of the recurrent hidden state
output_dim = 1    # a single output for binary classification

# Instantiate the three variants with identical dimensions, in RNN, LSTM, GRU order.
model_dims = (vocab_size, embed_dim, hidden_dim, output_dim)
rnn_model, lstm_model, gru_model = (
    model_cls(*model_dims) for model_cls in (RNNModel, LSTMModel, GRUModel)
)


In [18]:
# Binary cross-entropy on probabilities, shared by all three models.
criterion = nn.BCELoss()

# One independent Adam optimizer per model, all using the same step size.
learning_rate = 0.001
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=learning_rate)
