In [138]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [139]:
from nltk.corpus import twitter_samples

In [140]:
# Read the text of the positively and negatively labelled tweet files from
# NLTK's twitter_samples corpus.
postive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")

# Report the size of each class and of the combined corpus.
for caption, count in (
    ("Number of positive tweets: ", len(postive_tweets)),
    ("Number of negative tweets: ", len(negative_tweets)),
    ("Total number of tweets: ", len(postive_tweets) + len(negative_tweets)),
):
    print(caption, count)

Number of positive tweets:  5000
Number of negative tweets:  5000
Total number of tweets:  10000


In [141]:
# Positive tweets come first, then negative ones; labels follow the same order
# (1 = positive, 0 = negative) so tweets[i] and labels[i] stay aligned.
tweets = [*postive_tweets, *negative_tweets]
labels = [1 for _ in postive_tweets] + [0 for _ in negative_tweets]

In [168]:
from htwgnlp.preprocessing import TweetProcessor
from pytorch_nlp.dataset import SentimentDataset

# The dataset receives the processor's process_tweet method and a maximum
# sequence length of 128; how it applies them is defined in
# pytorch_nlp.dataset (not visible here).
tweet_processor = TweetProcessor()
dataset = SentimentDataset(
    tweets=tweets, labels=labels,
    processor=tweet_processor.process_tweet, max_length=128,
)

# Peek at the first sample to sanity-check the encoding.
dataset[0]

(tensor([2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(1))

In [169]:
import torch
from torch.utils.data import DataLoader

# Fixed seed so the 80/20 partition and the shuffling order of the training
# loader are the same every time this cell is executed; without it the test
# set (and therefore the reported accuracy) changes from run to run.
SEED = 42
split_rng = torch.Generator().manual_seed(SEED)

train_size = int(0.8 * len(dataset))
# Derive the test size by subtraction so the two parts always sum to len(dataset).
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_rng
)

print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}")

# Only the training loader shuffles; evaluation order does not affect accuracy.
train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, generator=split_rng
)
test_loader = DataLoader(test_dataset, batch_size=2)

Train Size: 8000, Test Size: 2000


In [173]:
from pytorch_nlp.model import SentimentModel
from torchinfo import summary

# One embedding row per vocabulary entry of the dataset, 64 dimensions each.
model = SentimentModel(embedding_dim=64, vocab_size=len(dataset.vocab))

# Dry run with an integer-typed input of shape (2, 128) to list layer output
# shapes and parameter counts.
summary(model, dtypes=[torch.long], input_size=(2, 128))

Layer (type:depth-idx)                   Output Shape              Param #
SentimentModel                           [2, 1]                    --
├─Embedding: 1-1                         [2, 128, 64]              672,448
├─Linear: 1-2                            [2, 1]                    65
├─Sigmoid: 1-3                           [2, 1]                    --
Total params: 672,513
Trainable params: 672,513
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.35
Input size (MB): 0.00
Forward/backward pass size (MB): 0.13
Params size (MB): 2.69
Estimated Total Size (MB): 2.82

In [151]:
import torch.nn as nn
import torch.optim as optim

# Optimisation settings.
num_epochs = 10
learning_rate = 1e-3

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Binary cross-entropy on probabilities (the model ends in a sigmoid layer).
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Move the parameters to the chosen device; the returned module is the cell output.
model.to(device)

SentimentModel(
  (embedding): Embedding(10507, 64, padding_idx=0)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [152]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch must unpack as ``(input_ids, labels)``. Labels are cast to
    float before being passed to ``criterion``, and the trailing dimension of
    the model output is removed so predictions and labels have the same shape.
    The mean batch loss is printed after each epoch.

    Args:
        model: Module called as ``model(input_ids)``.
        train_loader: Sized iterable of ``(input_ids, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the update.
        device: Device each batch is moved to before the forward pass.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: The mean batch loss of each epoch, in epoch order.

    Raises:
        ZeroDivisionError: If ``train_loader`` has length 0.
    """
    model.train()  # Set the model to training mode
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        for input_ids, labels in train_loader:
            input_ids = input_ids.to(device)
            labels = labels.float().to(device)  # BCELoss needs float targets

            # Forward pass
            outputs = model(input_ids)
            # squeeze(-1) removes only the trailing singleton dimension. A bare
            # squeeze() would also remove the batch dimension when a batch
            # holds a single sample, producing a 0-d tensor whose shape no
            # longer matches ``labels`` and making the loss raise.
            loss = criterion(outputs.squeeze(-1), labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / len(train_loader)
        epoch_losses.append(mean_loss)

        # Print epoch summary
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}")

    return epoch_losses

In [153]:
def evaluate_model(model, test_loader, device, threshold=0.5):
    """Compute and print the classification accuracy of ``model``.

    The model is switched to evaluation mode (and left in it), gradients are
    disabled, and every output greater than or equal to ``threshold`` is
    counted as class 1.

    Args:
        model: Module called as ``model(input_ids)``.
        test_loader: Iterable of ``(input_ids, labels)`` batches.
        device: Device each batch is moved to before the forward pass.
        threshold: Decision boundary applied to the model outputs. Defaults
            to 0.5.

    Returns:
        float: Fraction of samples whose thresholded prediction equals the label.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, labels in test_loader:
            input_ids = input_ids.to(device)
            labels = labels.float().to(device)

            # Forward pass
            outputs = model(input_ids)
            # Remove only the trailing dimension so the batch dimension
            # survives even for a single-sample batch.
            predictions = (outputs.squeeze(-1) >= threshold).float()

            # Update accuracy
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [154]:
# Fit the classifier on the training loader for the configured number of epochs.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
)

# Score the trained model on the test loader; the returned accuracy is the
# cell's displayed output.
evaluate_model(model=model, test_loader=test_loader, device=device)

Epoch [1/10], Loss: 0.3396
Epoch [2/10], Loss: 0.0333
Epoch [3/10], Loss: 0.0118
Epoch [4/10], Loss: 0.0075
Epoch [5/10], Loss: 0.0058
Epoch [6/10], Loss: 0.0048
Epoch [7/10], Loss: 0.0041
Epoch [8/10], Loss: 0.0036
Epoch [9/10], Loss: 0.0032
Epoch [10/10], Loss: 0.0029
Test Accuracy: 0.9975


0.9975