In [1]:
import torch
import torch.nn as nn


from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random
import torch.optim.lr_scheduler as lr_scheduler
import math
import nltk.data
import nltk
from sentence_transformers import SentenceTransformer # embedding câu

class LSTMModel(nn.Module):
    def __init__(self, embedding_dim = 384, hidden_dim = 128, output_dim = 1, dropout = 0.2, numlayers = 1, bidirectional = False):
        super(LSTMModel, self).__init__()
        self.num_layers = numlayers
        self.D = 2 if bidirectional else 1

        self.hidden_dim = hidden_dim # có giá trị tự do
        self.embedding_dim = embedding_dim # chiều của embedding, vd: [1,2,3,...300]: 1 embedding có kích thước là 300
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,batch_first=True, num_layers = numlayers, dropout = dropout, bidirectional=bidirectional) # đầu vào của LSTM là có kích thước embedding và đầu ra có kích thước hidden, xây dựng 1 mô hình LSTM
        # input_size = embedding_dim; hidden_size, num_layer
        # tuning ở ngay trên
        self.fc = nn.Linear(self.D*hidden_dim, output_dim) # 1 fully conected để làm đầu ra



    def forward(self, inputs):

        '''
        đầu vào input lần lượt là: batch_size, sequence_length, embedding_dim
        '''
        # inputs = torch.nn.utils.rnn.pack_padded_sequence(inputs, )
        batch_size = inputs.batch_sizes[0].item()#input ở đây chính là 1 batch mà chúng ta cho vào, và tập huấn luyện của chúng ta chứa những inputs này

        hidden = self.init_hidden(batch_size)
        lstm_out, _ = self.lstm(inputs, hidden)

        # Giải nén lstm_out
        padded_outputs, _ = pad_packed_sequence(lstm_out, batch_first=True)


        padded_outputs = padded_outputs[:, -1, :]

        output = self.fc(padded_outputs)
        return output
    def init_hidden(self, batch_size):
        return (torch.zeros(self.D * self.num_layers, batch_size, self.hidden_dim),# gồm 1 phần chứa batch_size 'phần', 'phần' chứa hidden_dim units
                torch.zeros(self.D * self.num_layers, batch_size, self.hidden_dim))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def text2embedding(model_embedding, sentence):
    model = model_embedding
    # Sentences are encoded by calling model.encode()
    embedding = model.encode(sentence)

    return torch.Tensor(embedding) if len(embedding.shape) >= 2 else torch.Tensor([embedding])

In [3]:
model_embedding = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def predict(model,model_embedding,text, speed = 1):
    tokenizer = nltk.data.load('english.pickle')
    tokens = tokenizer.tokenize(text)

    max_sequence_length = math.ceil(len(tokens) / speed)

    indices = random.sample(range(len(tokens)), max_sequence_length)
    indices.sort()
    tokens = [tokens[i] for i in indices]

    new_data = text2embedding(model_embedding,tokens)
    new_data = new_data.unsqueeze(0)

    sequence_lengths = [len(seq) for seq in new_data]

    new_data = pack_padded_sequence(new_data, sequence_lengths, batch_first=True)

    model.eval()
    with torch.no_grad():
        inputs = new_data
        outputs = model(inputs)#gọi hàm forward

        proba_label = torch.sigmoid(outputs)
        predict_label = (torch.sigmoid(outputs) > 0.5).float()


    return proba_label.item(), 'positive' if predict_label == 1 else 'negative'

    

In [4]:
path_best_parameter_model = 'best_params_model.pkl'
model = LSTMModel()
model.load_state_dict(torch.load(path_best_parameter_model))





<All keys matched successfully>

In [5]:
text = '''
As someone who relies on multiple electronic devices daily, I was in need of a reliable and affordable power strip. The AmazonBasics 6-Outlet, 200 Joule Surge Protector Power Strip has proven to be a great solution, providing functionality and peace of mind at an attractive price point.

The AmazonBasics power strip offers six outlets, which is more than enough to accommodate my various devices, including my computer, monitor, speakers, and phone charger. The 2-foot cord is not the longest, but it has been sufficient for my needs and helps to minimize cable clutter.

The 200 Joule surge protection rating provides a basic level of protection for my devices, safeguarding them against power surges and spikes. While it may not be the highest level of protection available, it is suitable for everyday use and offers reassurance that my valuable electronics are secure.

The power strip's design is simple and unobtrusive, making it easy to blend into any room or workspace. The white color and slim profile do not draw attention, allowing it to integrate seamlessly into my setup.

One minor drawback is the lack of USB ports for charging devices directly, but this is not a deal-breaker considering the budget-friendly price and the primary purpose of the power strip.

Overall, the AmazonBasics 6-Outlet, 200 Joule Surge Protector Power Strip is an excellent value, offering reliable functionality and basic surge protection at an affordable price. For those seeking a simple and effective solution for managing multiple devices, this power strip is a solid choice.
'''

In [6]:
predict(model, model_embedding, text)

(0.9763833284378052, 'positive')

In [18]:
import pandas as pd
import numpy as np
import joblib

In [19]:
data_val = pd.read_csv('data/data_val.csv')
data_val.head()

Unnamed: 0,User,Number of star,Date,Type of Product,Review Title,Review Content,Image in Review,Label
0,Green Mario,2,01/02/2023,10ft - Black,Nope.,Probably the worst-quality thing I've ever bou...,1,Negative
1,Ann,5,01/10/2024,Mercury White,"Pretty, simple, and great quality!",I own this mouse in both colors now and use it...,1,Positive
2,Matthew DeClue,4,04/19/2023,Black - Arctis 5,Great sound and durable,"3rd set I've owned, not sure I'd buy a differe...",0,Positive
3,Brenda,1,02/06/2023,10ft - Black,It stopped working!,The plug and cord both stopped working. I've o...,1,Negative
4,Ivy,5,03/12/2024,Classic Black,Best mouse,It had the best connection and it had good RGB...,0,Positive


In [20]:
X_val = data_val['Review Title'] + ' ' + data_val['Review Content']
y_val = (data_val['Label'] == 'Positive').astype(int)

In [21]:
X_val.head()

0    Nope. Probably the worst-quality thing I've ev...
1    Pretty, simple, and great quality! I own this ...
2    Great sound and durable 3rd set I've owned, no...
3    It stopped working! The plug and cord both sto...
4    Best mouse It had the best connection and it h...
dtype: object

In [22]:
y_val.head()

0    0
1    1
2    1
3    0
4    1
Name: Label, dtype: int32

In [23]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [24]:
df = pd.DataFrame()

In [25]:
tfidf_svc_model = joblib.load('model/sam_model.pkl')
y_pred = tfidf_svc_model.predict(X_val)

acc1 = accuracy_score(y_true=y_val, y_pred=y_pred)
rec1 = recall_score(y_true=y_val, y_pred=y_pred)
pre1 = precision_score(y_true=y_val, y_pred=y_pred)
f11 = f1_score(y_true=y_val, y_pred=y_pred)

df['Tfidf - SVC'] = np.array([acc1, rec1, pre1, f11])

acc1, rec1, pre1, f11

(0.696078431372549, 0.8225806451612904, 0.7183098591549296, 0.7669172932330828)

In [26]:
y_pred = []
for comment in X_val:
    proba, label = predict(model, model_embedding, text)
    if label == 'positive':
        label = 1
    else:
        label = 0
    y_pred.append(label)
y_pred = np.array(y_pred)

acc2 = accuracy_score(y_true=y_val, y_pred=y_pred)
rec2 = recall_score(y_true=y_val, y_pred=y_pred)
pre2 = precision_score(y_true=y_val, y_pred=y_pred)
f12 = f1_score(y_true=y_val, y_pred=y_pred)

df['LSTM'] = np.array([acc2, rec2, pre2, f12])

acc2, rec2, pre2, f12

(0.6078431372549019, 1.0, 0.6078431372549019, 0.7560975609756097)

In [None]:
df.to_csv('result compare tfidf-svc model with LSTM model.csv', index=False)

In [15]:
data_test = pd.read_csv('data/data_test.csv')
data_test.head()

Unnamed: 0,User,Number of star,Date,Type of Product,Review Title,Review Content,Image in Review,Label
0,Cameron Germano,3,01/04/2023,Mercury White,You Get What You Pay For,"This mouse is a great value, but it is priced ...",0,Positive
1,Prophet Paladin,3,21/04/2023,10ft - Silver,not 100w,Only charges up to 20W (with a 50W charger),1,Negative
2,D.r. M.,4,30/01/2024,10ft - Black,Dual type connections,Bought this to replace an a to c cable that qu...,1,Positive
3,Gindjurra,5,10/30/2023,Classic Black,The best all around basic gaming mouse I've ev...,"Like the headline says - and to expand on it, ...",0,Positive
4,James Seedorff,2,11/09/2021,10ft - Black,"Feels like bait and switch, not 100w capable",PROS: Long USB C to USB C cable.CONS: Does not...,1,Negative


In [16]:
X_test = data_test['Review Title'] + ' ' + data_test['Review Content']
y_test = (data_test['Label'] == 'Positive').astype(int)

In [17]:
y_pred = []
for comment in X_test:
    proba, label = predict(model, model_embedding, text)
    if label == 'positive':
        label = 1
    else:
        label = 0
    y_pred.append(label)
y_pred = np.array(y_pred)

acc2 = accuracy_score(y_true=y_test, y_pred=y_pred)
rec2 = recall_score(y_true=y_test, y_pred=y_pred)
pre2 = precision_score(y_true=y_test, y_pred=y_pred)
f12 = f1_score(y_true=y_test, y_pred=y_pred)

acc2, rec2, pre2, f12

(0.6080402010050251, 1.0, 0.6080402010050251, 0.75625)