# Review Classification
#### In this project I am going to classify reviews in to positive and negetive reviews. I am going to use Embedding, LSTM and a fully connected layer at the end. For sake of learning I am not going to use any pretrained model. 

##### I am running on RTX 4070 gpu. At first I check how many unique words are in my dataset so I choose between word level and char level. I could use pretrained ones but I want to do it all by myself.  

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
from sklearn.model_selection import train_test_split
nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\panah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
file_path = 'Datasets/dataset.csv'

df = pd.read_csv(file_path)

print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [76]:
all_reviews = ' '.join(df['review'].astype(str).tolist())

tokens = word_tokenize(all_reviews.lower()) 

translator = str.maketrans('', '', string.punctuation)

tokens = [word.translate(translator) for word in tokens]

stop_words = set(stopwords.words('english'))

filtered_words = [word for word in tokens if word.isalpha() and word not in stop_words]

unique_words = set(filtered_words)

num_unique_words = len(unique_words)

print(f"Number of unique words: {num_unique_words}")

Number of unique words: 133264


In [81]:
class TextDataset(Dataset):
    def __init__(self, file_path, vocab_size=133000, max_length=100):
        self.df = pd.read_csv(file_path)
        self.max_length = max_length
        self.tokenizer = word_tokenize
        self.stop_words = set(stopwords.words('english'))
        self.translator = str.maketrans('', '', string.punctuation)
        self.vocab_size = vocab_size

        # Prepare the tokenizer and word index
        all_reviews = ' '.join(self.df['review'].astype(str).tolist()).lower()
        tokens = word_tokenize(all_reviews)
        tokens = [word.translate(self.translator) for word in tokens]
        tokens = [word for word in tokens if word.isalpha() and word not in self.stop_words]
        
        # Create word index
        unique_words = set(tokens)
        self.word_to_idx = {word: idx+1 for idx, word in enumerate(unique_words)}  # +1 to reserve 0 for padding
        self.word_to_idx['<PAD>'] = 0  # Add padding token

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        review = self.df.loc[idx, 'review']
        label = self.df.loc[idx, 'sentiment']
        tokens = self.tokenizer(review.lower())
        tokens = [word.translate(self.translator) for word in tokens]
        tokens = [word for word in tokens if word.isalpha() and word not in self.stop_words]
        
        # Pad or truncate tokens to max_length
        if len(tokens) < self.max_length:
            tokens.extend(['<PAD>'] * (self.max_length - len(tokens)))
        else:
            tokens = tokens[:self.max_length]
        
        text_indices = [self.word_to_idx.get(word, 0) for word in tokens]
        
        return {
            'text': torch.tensor(text_indices, dtype=torch.long),
            'label': torch.tensor(1 if label == 'positive' else 0, dtype=torch.float)
        }

In [82]:
dataset = TextDataset(file_path=file_path)

# Define batch size and create DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [96]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)  # +1 to account for padding token
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden = hidden[-1]  # Select the hidden state from the last LSTM layer
        output = torch.sigmoid(self.fc(hidden))
        return output

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Model parameters
embedding_dim = 100
hidden_dim = 64
output_dim = 1  # Binary classification

model = SimpleModel(num_unique_words, embedding_dim, hidden_dim, output_dim)
model = model.to(device)  # Send model to GPU

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Using device: cuda


In [97]:
epochs = 10
evaluate_every = 20  # Evaluate the model every 20 batches

for epoch in range(epochs):
    total_loss = 0
    model.train()  # Set model to training mode
    eval = False
    for batch_idx, batch in enumerate(dataloader):
        if batch_idx  > 1300 and eval == False:
            model.eval()  # Set model to evaluation mode         
            total_eval_loss = 0
            with torch.no_grad():
                for eval_idx ,eval_batch in enumerate(dataloader):  # Use the same dataloader for evaluation
                    if eval_idx > 1300:
                        eval_texts = eval_batch['text'].to(device)
                        eval_labels = eval_batch['label'].to(device)
                        eval_outputs = model(eval_texts).squeeze()
                        eval_loss = criterion(eval_outputs, eval_labels)
                        total_eval_loss += eval_loss.item()
            
            avg_eval_loss = total_eval_loss / (len(dataloader) - 1300)
            print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(dataloader)}, Evaluation Loss: {avg_eval_loss}")
            
            model.train()  # Set model back to training mode after evaluation
            eval = True
        else:
            texts = batch['text'].to(device)  # Send data to GPU
            labels = batch['label'].to(device)  # Send labels to GPU
            
            optimizer.zero_grad()
            outputs = model(texts).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Evaluate the model every 'evaluate_every' batches

    
    avg_loss = total_loss / (len(dataloader) - 263)
    print(f"Epoch {epoch+1}/{epochs}, Average Train Loss: {avg_loss}")


Epoch 1/10, Batch 1302/1563, Evaluation Loss: 0.7163463253032119
Epoch 1/10, Average Train Loss: 0.7855103508325724
Epoch 2/10, Batch 1302/1563, Evaluation Loss: 0.5600296687263953
Epoch 2/10, Average Train Loss: 0.6138906921560948
Epoch 3/10, Batch 1302/1563, Evaluation Loss: 0.3275364033396253
Epoch 3/10, Average Train Loss: 0.425521829065222
Epoch 4/10, Batch 1302/1563, Evaluation Loss: 0.2299263049129524
Epoch 4/10, Average Train Loss: 0.3018931974184055
Epoch 5/10, Batch 1302/1563, Evaluation Loss: 0.1786921857201101
Epoch 5/10, Average Train Loss: 0.2212680392554746
Epoch 6/10, Batch 1302/1563, Evaluation Loss: 0.12337760397803194
Epoch 6/10, Average Train Loss: 0.16283794107202154
Epoch 7/10, Batch 1302/1563, Evaluation Loss: 0.10005254101914711
Epoch 7/10, Average Train Loss: 0.11825593493771382
Epoch 8/10, Batch 1302/1563, Evaluation Loss: 0.06522865805371236
Epoch 8/10, Average Train Loss: 0.08612569265533239
Epoch 9/10, Batch 1302/1563, Evaluation Loss: 0.04645322074940138
E

In [98]:
torch.save(model.state_dict(), 'text_model_last_version.pth')

In [127]:
def preprocess_input(sentence, word_to_idx, max_length=100):
    # Tokenize, remove punctuation and stop words, and pad/truncate the sentence
    tokenizer = word_tokenize
    stop_words = set(stopwords.words('english'))
    translator = str.maketrans('', '', string.punctuation)
    
    tokens = tokenizer(sentence.lower())
    tokens = [word.translate(translator) for word in tokens]
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    # Pad or truncate tokens to max_length
    if len(tokens) < max_length:
        tokens.extend(['<PAD>'] * (max_length - len(tokens)))
    else:
        tokens = tokens[:max_length]
    
    text_indices = [word_to_idx.get(word, 0) for word in tokens]
    
    return torch.tensor(text_indices, dtype=torch.long).unsqueeze(0)  # Add batch dimension

def predict_sentiment(model, sentence, word_to_idx, device, max_length=100):
    model.eval()  # Set model to evaluation mode
    preprocessed_sentence = preprocess_input(sentence, word_to_idx, max_length).to(device)
    
    with torch.no_grad():
        output = model(preprocessed_sentence).squeeze()
        prediction = torch.round(output).item()
        
    return "positive" if prediction == 1 else "negative"

# Example usage
# Load the trained model
model = SimpleModel(len(dataset.word_to_idx), embedding_dim, hidden_dim, output_dim)
model.load_state_dict(torch.load('text_model.pth'))
model = model.to(device)

# Make a prediction
sentences = ["This product is amazing!", "not interesting", "I Enjoyed iT", "It was suprisingly Boring","not good", "I can say i want to watch it again", 'incredible', 'hate']
for sentence in sentences:
    prediction = predict_sentiment(model, sentence, dataset.word_to_idx, device)
    print(f"The sentence is: {sentence} : {prediction}")

The sentence is: This product is amazing! : positive
The sentence is: not interesting : negative
The sentence is: I Enjoyed iT : positive
The sentence is: It was suprisingly Boring : negative
The sentence is: not good : negative
The sentence is: I can say i want to watch it again : positive
The sentence is: incredible : positive
The sentence is: hate : negative


### Status: Done! the model is perfoming pretty well in prediction