In [65]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
from nltk.corpus import twitter_samples

In [67]:
postive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")

print("Number of positive tweets: ", len(postive_tweets))
print("Number of negative tweets: ", len(negative_tweets))
print("Total number of tweets: ", len(postive_tweets) + len(negative_tweets))

Number of positive tweets:  5000
Number of negative tweets:  5000
Total number of tweets:  10000


In [68]:
tweets = postive_tweets + negative_tweets
labels = [1] * len(postive_tweets) + [0] * len(negative_tweets)

In [69]:
from htwgnlp.preprocessing import TweetProcessor

tweet_processor = TweetProcessor()

In [70]:
def build_vocab(tweets):
    """
    Dynamically build a vocabulary from the dataset.

    Args:
        tweets (list of str): List of tweet strings.

    Returns:
        dict: Token-to-index mapping.
    """
    from collections import Counter

    # Tokenize all tweets and count token frequencies
    counter = Counter()
    for tweet in tweets:
        tokens = tweet_processor.process_tweet(tweet)
        counter.update(tokens)

    # Assign a unique index to each token, starting from 2
    # Reserve 0 for <PAD> and 1 for <UNK>
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token in counter:
        vocab[token] = len(vocab)

    return vocab


vocab = build_vocab(tweets)

In [71]:
vocab

{'<PAD>': 0,
 '<UNK>': 1,
 'followfriday': 2,
 'top': 3,
 'engag': 4,
 'member': 5,
 'commun': 6,
 'week': 7,
 ':)': 8,
 'hey': 9,
 'jame': 10,
 'odd': 11,
 ':/': 12,
 'pleas': 13,
 'call': 14,
 'contact': 15,
 'centr': 16,
 '02392441234': 17,
 'abl': 18,
 'assist': 19,
 'mani': 20,
 'thank': 21,
 'listen': 22,
 'last': 23,
 'night': 24,
 'bleed': 25,
 'amaz': 26,
 'track': 27,
 'scotland': 28,
 'congrat': 29,
 'yeaaah': 30,
 'yipppi': 31,
 'accnt': 32,
 'verifi': 33,
 'rqst': 34,
 'succeed': 35,
 'got': 36,
 'blue': 37,
 'tick': 38,
 'mark': 39,
 'fb': 40,
 'profil': 41,
 '15': 42,
 'day': 43,
 'one': 44,
 'irresist': 45,
 'flipkartfashionfriday': 46,
 'like': 47,
 'keep': 48,
 'love': 49,
 'custom': 50,
 'wait': 51,
 'long': 52,
 'hope': 53,
 'enjoy': 54,
 'happi': 55,
 'friday': 56,
 'lwwf': 57,
 'second': 58,
 'thought': 59,
 '’': 60,
 'enough': 61,
 'time': 62,
 'dd': 63,
 'new': 64,
 'short': 65,
 'enter': 66,
 'system': 67,
 'sheep': 68,
 'must': 69,
 'buy': 70,
 'jgh': 71,
 'go

In [None]:
from pytorch_nlp.dataset import SentimentDataset

dataset = SentimentDataset(
    tweets=tweets,
    labels=labels,
    vocab=vocab,
    processor=tweet_processor.process_tweet,
    max_length=128,
)

In [91]:
dataset[0]

(tensor([2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(1))

In [86]:
import torch

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

print(f"Train Size: {len(train_dataset)}, Test Size: {len(test_dataset)}")

Train Size: 8000, Test Size: 2000


In [90]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

In [None]:
from pytorch_nlp.model import SentimentModel

model = SentimentModel(vocab_size=len(vocab), embedding_dim=64)

In [134]:
from torchinfo import summary

# Print the model summary
summary(
    model, input_size=(2, 128), dtypes=[torch.long]
)  # Input size includes batch dimension

Layer (type:depth-idx)                   Output Shape              Param #
SentimentModel                           [2, 1]                    --
├─Embedding: 1-1                         [2, 128, 64]              672,448
├─Linear: 1-2                            [2, 1]                    65
├─Sigmoid: 1-3                           [2, 1]                    --
Total params: 672,513
Trainable params: 672,513
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.35
Input size (MB): 0.00
Forward/backward pass size (MB): 0.13
Params size (MB): 2.69
Estimated Total Size (MB): 2.82

In [136]:
import torch.nn as nn
import torch.optim as optim

learning_rate = 0.001
num_epochs = 10

criterion = nn.BCELoss()  # Binary cross-entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

SentimentModel(
  (embedding): Embedding(10507, 64, padding_idx=0)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    model.train()  # Set the model to training mode
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in train_loader:
            # Unpack the batch
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.float().to(device)  # Convert labels to float for BCELoss

            # Forward pass
            outputs = model(input_ids)
            loss = criterion(outputs.squeeze(), labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Print epoch summary
        print(
            f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}"
        )

In [None]:
def evaluate_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            # Unpack the batch
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.float().to(device)

            # Forward pass
            outputs = model(input_ids)
            predictions = (outputs.squeeze() >= 0.5).float()  # Threshold at 0.5

            # Update accuracy
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [None]:
# Train the model
train_model(model, train_loader, criterion, optimizer, device, num_epochs)

# Evaluate the model
evaluate_model(model, test_loader, device)