# Building a Sentiment Analysis Model with LSTM on a Small Text Dataset
The goal is to build a sentiment analysis model using a Long Short-Term Memory (LSTM) network to classify the sentiment of text data (e.g., reviews, tweets, or comments) as positive, negative. The model will learn to capture sequential dependencies in text and make accurate predictions based on the context and tone of the input.

# 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim


# 2: Data Preprocessing

## Step1:Loading the Dataset

In [2]:
# Example dataset
import pandas as pd

import pandas as pd

# Balanced dataset with positive and negative reviews
data = {
    "review": [
        # Positive reviews
        "The movie was absolutely amazing! A masterpiece.",
        "Fantastic performances by the entire cast.",
        "Loved the story and the cinematography. A must-watch!",
        "The action scenes were thrilling and well-executed.",
        "An emotional and heartwarming experience.",
        "Brilliant direction and an outstanding soundtrack.",
        "A gripping storyline that kept me on the edge of my seat.",
        "One of the best movies I've ever seen!",
        "The characters were relatable and well-developed.",
        "A perfect blend of humor and drama.",
        
        # Negative reviews
        "The movie was a complete waste of time.",
        "Terrible acting and a poorly written script.",
        "The story was boring and predictable.",
        "I couldn’t relate to any of the characters.",
        "The pacing was slow and the plot lacked depth.",
        "Way too many plot holes, very disappointing.",
        "The humor felt forced and awkward.",
        "It was overhyped and did not live up to expectations.",
        "The ending was rushed and unsatisfying.",
        "Poor direction and lackluster performances."
    ],
    "sentiment": [
        # Sentiments for positive reviews
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        # Sentiments for negative reviews
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]  # 1 = Positive, 0 = Negative
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)



                                               review  sentiment
0    The movie was absolutely amazing! A masterpiece.          1
1          Fantastic performances by the entire cast.          1
2   Loved the story and the cinematography. A must...          1
3   The action scenes were thrilling and well-exec...          1
4           An emotional and heartwarming experience.          1
5   Brilliant direction and an outstanding soundtr...          1
6   A gripping storyline that kept me on the edge ...          1
7              One of the best movies I've ever seen!          1
8   The characters were relatable and well-developed.          1
9                 A perfect blend of humor and drama.          1
10            The movie was a complete waste of time.          0
11       Terrible acting and a poorly written script.          0
12              The story was boring and predictable.          0
13        I couldn’t relate to any of the characters.          0
14     The pacing was slo

## Step2:Tokenization

In [3]:
# Tokenize function using regex
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

# Apply the tokenization function
df['tokens'] = df['review'].apply(tokenize)
df['tokens']


0     [the, movie, was, absolutely, amazing, a, mast...
1      [fantastic, performances, by, the, entire, cast]
2     [loved, the, story, and, the, cinematography, ...
3     [the, action, scenes, were, thrilling, and, we...
4        [an, emotional, and, heartwarming, experience]
5     [brilliant, direction, and, an, outstanding, s...
6     [a, gripping, storyline, that, kept, me, on, t...
7       [one, of, the, best, movies, i, ve, ever, seen]
8     [the, characters, were, relatable, and, well, ...
9            [a, perfect, blend, of, humor, and, drama]
10      [the, movie, was, a, complete, waste, of, time]
11    [terrible, acting, and, a, poorly, written, sc...
12          [the, story, was, boring, and, predictable]
13    [i, couldn, t, relate, to, any, of, the, chara...
14    [the, pacing, was, slow, and, the, plot, lacke...
15    [way, too, many, plot, holes, very, disappoint...
16             [the, humor, felt, forced, and, awkward]
17    [it, was, overhyped, and, did, not, live, 

## Step 3:Train Word2Vec on tokenized sentences

In [4]:
# Train Word2Vec on tokenized sentences
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=50, window=3, min_count=1)
print(word2vec_model)


Word2Vec<vocab=97, vector_size=50, alpha=0.025>


## Step 4:Convert sentences into fixed-length vectors

In [5]:
# Convert sentences into fixed-length vectors
def sentence_to_vectors(tokens, model, max_len=5):
    
    vectors = []
    for word in tokens:
        if word in model.wv:  # Check if the word is in the model
            vectors.append(model.wv[word])  # Append the word's vector
    # Pad or truncate to max_len
    vectors = vectors[:max_len] + [[0] * model.vector_size] * (max_len - len(vectors))
    return np.array(vectors)

# Convert reviews to vectors
max_len = 5  # Fixed input length
X = np.array([sentence_to_vectors(tokens, word2vec_model, max_len) for tokens in df['tokens']])
y = np.array(df['sentiment'])
#df['vectors'] = df['tokens'].apply(lambda tokens: sentence_to_vectors(tokens, word2vec_model, max_len))

# Display the DataFrame with vectors
#print(df[['review', 'tokens', 'vectors']])



## Step 5 Converting numpy arrays to tensors

In [7]:
# Convert data to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)
print(X_tensor)
print(y_tensor)

tensor([[[-0.0011,  0.0005,  0.0102,  ...,  0.0192,  0.0100,  0.0185],
         [ 0.0028, -0.0052, -0.0142,  ...,  0.0010,  0.0164, -0.0141],
         [ 0.0156, -0.0190, -0.0004,  ..., -0.0048, -0.0190,  0.0090],
         [ 0.0036,  0.0141,  0.0059,  ..., -0.0067,  0.0032,  0.0032],
         [ 0.0148,  0.0200,  0.0177,  ..., -0.0037,  0.0072, -0.0141]],

        [[-0.0104, -0.0148, -0.0058,  ..., -0.0054,  0.0077,  0.0007],
         [ 0.0163, -0.0089,  0.0180,  ..., -0.0059,  0.0183,  0.0017],
         [ 0.0113,  0.0110,  0.0037,  ..., -0.0176,  0.0069,  0.0042],
         [-0.0011,  0.0005,  0.0102,  ...,  0.0192,  0.0100,  0.0185],
         [-0.0188, -0.0099, -0.0194,  ...,  0.0146,  0.0109,  0.0185]],

        [[ 0.0122, -0.0135,  0.0014,  ..., -0.0033, -0.0189, -0.0052],
         [-0.0011,  0.0005,  0.0102,  ...,  0.0192,  0.0100,  0.0185],
         [-0.0165,  0.0186, -0.0004,  ..., -0.0048, -0.0063, -0.0047],
         [-0.0163,  0.0091, -0.0083,  ..., -0.0141,  0.0018,  0.0128],
  

# 3.LSTM Model Building

## Step 1:Defining the LSTM Model

In [8]:
# Define the LSTM model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)  # Get the hidden state from LSTM
        out = self.fc(hidden[-1])     # Pass hidden state through a fully connected layer
        return out


## Step2:Initialize the model parameters

In [14]:
# Initialize the model
input_size = 50  # Size of the word vector
hidden_size = 32
output_size = 2  # Binary classification (positive/negative)as we  are working with one-hot encoded labels.

## Step3:Initialize the model,loss and optimizer

In [15]:
model = SimpleLSTM(input_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

## Step 4:Training the Model

In [16]:
# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Reset gradients

    # Forward pass
    outputs = model(X_tensor)
    # Calculate loss
    loss = criterion(outputs, y_tensor)  

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Epoch 1/50, Loss: 0.6939
Epoch 2/50, Loss: 0.6931
Epoch 3/50, Loss: 0.6920
Epoch 4/50, Loss: 0.6909
Epoch 5/50, Loss: 0.6901
Epoch 6/50, Loss: 0.6892
Epoch 7/50, Loss: 0.6877
Epoch 8/50, Loss: 0.6860
Epoch 9/50, Loss: 0.6840
Epoch 10/50, Loss: 0.6815
Epoch 11/50, Loss: 0.6781
Epoch 12/50, Loss: 0.6738
Epoch 13/50, Loss: 0.6684
Epoch 14/50, Loss: 0.6607
Epoch 15/50, Loss: 0.6512
Epoch 16/50, Loss: 0.6377
Epoch 17/50, Loss: 0.6213
Epoch 18/50, Loss: 0.5988
Epoch 19/50, Loss: 0.5695
Epoch 20/50, Loss: 0.5329
Epoch 21/50, Loss: 0.5153
Epoch 22/50, Loss: 0.5274
Epoch 23/50, Loss: 0.4242
Epoch 24/50, Loss: 0.4788
Epoch 25/50, Loss: 0.3620
Epoch 26/50, Loss: 0.4155
Epoch 27/50, Loss: 0.2993
Epoch 28/50, Loss: 0.3544
Epoch 29/50, Loss: 0.2409
Epoch 30/50, Loss: 0.2973
Epoch 31/50, Loss: 0.1879
Epoch 32/50, Loss: 0.2361
Epoch 33/50, Loss: 0.1436
Epoch 34/50, Loss: 0.1892
Epoch 35/50, Loss: 0.1033
Epoch 36/50, Loss: 0.1367
Epoch 37/50, Loss: 0.0801
Epoch 38/50, Loss: 0.1070
Epoch 39/50, Loss: 0.

[NVSHARE][WARN]: Couldn't open file /var/run/secrets/kubernetes.io/serviceaccount/namespace to read Pod namespace
[NVSHARE][INFO]: Successfully initialized nvshare GPU
[NVSHARE][INFO]: Client ID = 90c91f0715813fef


# 4 Model Evaluation

## Step 1:Prediction Function

In [14]:
# Prediction function
def predict_sentiment(review, model, word2vec_model, max_len=5):
    # Tokenize the input review
    tokens = tokenize(review)

    # Convert tokens to vectors
    vectors = sentence_to_vectors(tokens, word2vec_model, max_len)

    # Convert to PyTorch tensor and add batch dimension
    input_tensor = torch.tensor(vectors, dtype=torch.float32).unsqueeze(0)  # Shape: (1, max_len, input_size)

    # Set the model to evaluation mode
    model.eval()

    # Make prediction
    with torch.no_grad():
        output = model(input_tensor)
        predicted = torch.argmax(output, dim=1)  # Get the index of the maximum value (class)

    return predicted.item()


# 5:Sample Predictions

In [16]:
# Sample reviews for prediction
sample_reviews = [
    "This movie was great and very entertaining!",
    "waste",
    "it was boring.",
    "one of the best movies."
]

# Predict and print sentiments for sample reviews
for review in sample_reviews:
    sentiment = predict_sentiment(review, model, word2vec_model)
    sentiment_label = "Positive" if sentiment == 1 else "Negative"
    print(f"Review: \"{review}\" => Sentiment: {sentiment_label}")


Review: "This movie was great and very entertaining!" => Sentiment: Negative
Review: "waste" => Sentiment: Negative
Review: "it was boring." => Sentiment: Negative
Review: "one of the best movies." => Sentiment: Positive
