In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from collections import defaultdict


In [4]:

# 1) Load datasets: IMDB (movie reviews) and Amazon Polarity (product reviews)
print("Loading IMDB dataset...")
imdb_dataset = load_dataset("imdb")
print("Loading Amazon Polarity dataset...")
amazon_dataset = load_dataset("amazon_polarity")


Loading IMDB dataset...
Loading Amazon Polarity dataset...


In [5]:

# Convert to DataFrames
imdb_train_df = pd.DataFrame({'text': imdb_dataset['train']['text'], 'label': imdb_dataset['train']['label']})
imdb_test_df = pd.DataFrame({'text': imdb_dataset['test']['text'], 'label': imdb_dataset['test']['label']})

amazon_train_df = pd.DataFrame({'text': amazon_dataset['train']['content'], 'label': amazon_dataset['train']['label']})
amazon_test_df = pd.DataFrame({'text': amazon_dataset['test']['content'], 'label': amazon_dataset['test']['label']})


In [6]:

# Combine the datasets (smaller subset for 8GB RAM)
print("Combining datasets...")
train_df = pd.concat([
    imdb_train_df.sample(2500, random_state=42),  # Subset of 2,500 IMDB reviews
    amazon_train_df.sample(2500, random_state=42)  # Subset of 2,500 Amazon reviews
], ignore_index=True)

test_df = pd.concat([
    imdb_test_df.sample(1000, random_state=42),  # Subset of 1,000 IMDB reviews
    amazon_test_df.sample(1000, random_state=42)  # Subset of 1,000 Amazon reviews
], ignore_index=True)


Combining datasets...


In [7]:

# 2) Load pre-trained GloVe embeddings
print("Loading GloVe embeddings...")
embedding_dim = 300  # Using 300-dimensional GloVe embeddings
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")


Loading GloVe embeddings...
Loaded 400000 word vectors.


In [8]:

# 3) Function to preprocess text and convert to embeddings
def text_to_embedding(text, embeddings_index, embedding_dim):
    # Simple preprocessing: lowercase, remove punctuation, split into words
    text = re.sub(r'[^\w\s]', '', text.lower())
    words = text.split()
    
    # Get embeddings for each word and average them
    embeddings = []
    for word in words:
        if word in embeddings_index:
            embeddings.append(embeddings_index[word])
    
    # If no words found in embeddings, return a zero vector
    if not embeddings:
        return np.zeros(embedding_dim)
    
    # Average the embeddings
    embeddings = np.array(embeddings)
    return np.mean(embeddings, axis=0)


In [9]:

# 4) Convert all texts to embeddings
print("Converting training texts to embeddings...")
X_train = np.array([text_to_embedding(text, embeddings_index, embedding_dim) for text in train_df['text']])
y_train = train_df['label'].values

print("Converting test texts to embeddings...")
X_test = np.array([text_to_embedding(text, embeddings_index, embedding_dim) for text in test_df['text']])
y_test = test_df['label'].values


Converting training texts to embeddings...
Converting test texts to embeddings...


In [10]:

# 5) Define a simple neural network for sentiment classification
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(SentimentClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x


In [11]:

# 6) Train the model
input_dim = embedding_dim  # 300 (from GloVe embeddings)
hidden_dim = 128  # Smaller hidden layer
model = SentimentClassifier(input_dim, hidden_dim)


In [12]:

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)


In [13]:

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [14]:

# Training loop
num_epochs = 10
batch_size = 64  # Smaller batch size to reduce memory usage
print("Training the model...")
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_preds = (test_outputs >= 0.5).float()
        accuracy = accuracy_score(y_test, test_preds.numpy())
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.4f}")


Training the model...
Epoch 1/10, Loss: 0.5277, Test Accuracy: 0.6970
Epoch 2/10, Loss: 0.4572, Test Accuracy: 0.7400
Epoch 3/10, Loss: 0.3987, Test Accuracy: 0.7705
Epoch 4/10, Loss: 0.3435, Test Accuracy: 0.7795
Epoch 5/10, Loss: 0.3011, Test Accuracy: 0.7820
Epoch 6/10, Loss: 0.2686, Test Accuracy: 0.7870
Epoch 7/10, Loss: 0.2424, Test Accuracy: 0.7865
Epoch 8/10, Loss: 0.2210, Test Accuracy: 0.7880
Epoch 9/10, Loss: 0.2025, Test Accuracy: 0.7895
Epoch 10/10, Loss: 0.1872, Test Accuracy: 0.7910


In [15]:

# 7) Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_preds = (test_outputs >= 0.5).float().numpy()
    test_probs = test_outputs.numpy()


In [16]:

# Print performance metrics
accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds)
recall = recall_score(y_test, test_preds)
f1 = f1_score(y_test, test_preds)
print("\nFinal Test Set Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")



Final Test Set Metrics:
Accuracy:  0.7910
Precision: 0.8135
Recall:    0.7513
F1-score:  0.7812


In [17]:

# 8) Function to predict sentiment on new text
def predict_sentiment(text, model, embeddings_index, embedding_dim):
    # Convert text to embedding
    embedding = text_to_embedding(text, embeddings_index, embedding_dim)
    embedding_tensor = torch.tensor(embedding, dtype=torch.float32).view(1, -1)
    
    # Predict sentiment
    model.eval()
    with torch.no_grad():
        output = model(embedding_tensor)
        prob = output.item()
        pred = 1 if prob >= 0.5 else 0
        sentiment = "Positive" if pred == 1 else "Negative"
    return sentiment, prob


In [18]:
torch.save(model.state_dict(), 'sentiment_classifier.pth')

In [19]:

# 9) Test on a variety of example texts
example_texts = [
    "I love this movie, it’s amazing!",  # Movie-related
    "This phone is terrible, it keeps crashing.",  # Product review
    "I had an amazing day at the park with my friends!",  # Casual talk
    "The lecture was boring and unhelpful.",  # Feedback
    "I’m so excited for the weekend, it’s going to be great!",  # Random thought
    "The food at this restaurant was disappointing and overpriced.",  # Restaurant review
    "I really enjoyed the concert last night, the music was fantastic!",  # Event
    "My new laptop is super fast and easy to use.",  # Product review
    "The weather today is awful, I hate this rain!",  # Weather
    "I’m feeling so happy after talking to my best friend."  # Emotion
]

print("\nTesting the model on a variety of example texts:")
for text in example_texts:
    sentiment, prob = predict_sentiment(text, model, embeddings_index, embedding_dim)
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")



Testing the model on a variety of example texts:
Text: I love this movie, it’s amazing!
Sentiment: Positive, Probability: 0.9991

Text: This phone is terrible, it keeps crashing.
Sentiment: Negative, Probability: 0.0456

Text: I had an amazing day at the park with my friends!
Sentiment: Positive, Probability: 0.9977

Text: The lecture was boring and unhelpful.
Sentiment: Negative, Probability: 0.0012

Text: I’m so excited for the weekend, it’s going to be great!
Sentiment: Positive, Probability: 0.9856

Text: The food at this restaurant was disappointing and overpriced.
Sentiment: Negative, Probability: 0.0059

Text: I really enjoyed the concert last night, the music was fantastic!
Sentiment: Positive, Probability: 0.9981

Text: My new laptop is super fast and easy to use.
Sentiment: Positive, Probability: 0.9945

Text: The weather today is awful, I hate this rain!
Sentiment: Negative, Probability: 0.0684

Text: I’m feeling so happy after talking to my best friend.
Sentiment: Positive

In [21]:

# 10) Interactive sentiment prediction
print("Enter your own text to predict its sentiment (or type 'exit' to stop):")
while True:
    text = input("Text: ")
    if text.lower() == 'exit':
        break
    sentiment, prob = predict_sentiment(text, model, embeddings_index, embedding_dim)
    print(f"Sentiment: {sentiment}, Probability: {prob:.4f}")

Enter your own text to predict its sentiment (or type 'exit' to stop):
Sentiment: Positive, Probability: 0.9426
Sentiment: Positive, Probability: 0.9426
Sentiment: Positive, Probability: 0.9938
