In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [14]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()  # Corrected line
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [15]:
# Define preprocessing functions
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [11]:
# Create a vocabulary and encode the reviews
all_words = [word.lower() for word in movie_reviews.words()]
word_counts = Counter(all_words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_to_index = {word: i for i, word in enumerate(vocab)}

def encode_review(text, vocab_to_index, sequence_length):
    tokens = preprocess_text(text)
    encoded_review = []
    for word in tokens:
        if word in vocab_to_index:
            encoded_review.append(vocab_to_index[word])
    padding = [0] * (sequence_length - len(encoded_review))
    return padding + encoded_review

In [31]:
# Hyperparameters (You may need to adjust these based on experimentation)
input_size = 1
hidden_size = 256
output_size = 2
num_layers = 2
sequence_length = 200
learning_rate = 0.001
num_epochs = 10
batch_size = 64

In [32]:
# Define a weighted loss function
class_weight = torch.tensor([1.0, 1.0])  # Adjust class weights based on data balance
criterion = nn.CrossEntropyLoss(weight=class_weight)

In [34]:
# Create the model and define the loss and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# Prepare the training data
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review = movie_reviews.raw(fileid)
        encoded_review = encode_review(review, vocab_to_index, sequence_length)
        label = 1 if category == 'pos' else 0
        reviews.append((encoded_review, label))

In [36]:
# Shuffle the data and split into training and testing sets
random.shuffle(reviews)
split_ratio = 0.8
train_size = int(len(reviews) * split_ratio)
train_data = reviews[:train_size]
test_data = reviews[train_size:]

In [37]:
def pad_sequence(sequence, sequence_length):
    if len(sequence) < sequence_length:
        padded_sequence = [0] * (sequence_length - len(sequence)) + sequence
    else:
        padded_sequence = sequence[-sequence_length:]
    return padded_sequence

In [38]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for i in range(0, len(train_data), batch_size):
        batch = train_data[i:i+batch_size]
        X = [pad_sequence(item[0], sequence_length) for item in batch]
        y = [item[1] for item in batch]

        X = torch.tensor(X, dtype=torch.float32).view(-1, sequence_length, input_size).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)

        outputs = model(X)
        optimizer.zero_grad()
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == y).sum().item()
        total_samples += len(y)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Accuracy: {total_correct/total_samples:.4f}')

Epoch [1/10], Loss: 17.4907, Accuracy: 0.5112
Epoch [2/10], Loss: 17.3687, Accuracy: 0.4981
Epoch [3/10], Loss: 17.3551, Accuracy: 0.5000
Epoch [4/10], Loss: 17.2822, Accuracy: 0.5256
Epoch [5/10], Loss: 17.1840, Accuracy: 0.5450
Epoch [6/10], Loss: 17.1015, Accuracy: 0.5487
Epoch [7/10], Loss: 17.0983, Accuracy: 0.5587
Epoch [8/10], Loss: 17.1184, Accuracy: 0.5406
Epoch [9/10], Loss: 17.0568, Accuracy: 0.5675
Epoch [10/10], Loss: 16.9727, Accuracy: 0.5681


In [39]:
# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(0, len(test_data), batch_size):
        batch = test_data[i:i+batch_size]
        X = [pad_sequence(item[0], sequence_length) for item in batch]
        y = [item[1] for item in batch]

        X = torch.tensor(X, dtype=torch.float32).view(-1, sequence_length, input_size).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)

        outputs = model(X)
        _, predicted = torch.max(outputs.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 57.00%


In [40]:
# User input prediction
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        encoded_review = encode_review(text, vocab_to_index, sequence_length)
        X = torch.tensor(encoded_review, dtype=torch.float32).view(1, sequence_length, input_size).to(device)
        output = model(X)
        _, predicted = torch.max(output.data, 1)
        sentiment = "Positive" if predicted.item() == 1 else "Negative"
        return sentiment

In [45]:
user_input = input("Enter a movie review: ")
predicted_sentiment = predict_sentiment(user_input)
print(f"Predicted sentiment: {predicted_sentiment}")

Enter a movie review: i love you
Predicted sentiment: Positive
