In [92]:
import torchtext
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator


In [97]:
train_dataset, test_dataset  = torchtext.datasets.AG_NEWS()

# Load and preprocess the dataset
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for (_, text) in train_dataset:
    counter.update(tokenizer(text))
vocab = Vocab(counter, min_freq=1, specials=['<unk>'])



In [124]:

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator


# Define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits


In [133]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader

# Define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hyperparameters
vocab_size = 20000
embed_dim = 100
hidden_dim = 128
num_classes = 4
batch_size = 64
epochs = 10
learning_rate = 0.001

# Load and preprocess the dataset

tokenizer = get_tokenizer('basic_english')
train_dataset, test_dataset = AG_NEWS(split=('train', 'test'))
#train_dataset, test_dataset  = torchtext.datasets.AG_NEWS()


counter = Counter()
for (_, text) in train_dataset:
    counter.update(tokenizer(text))
vocab = Vocab(counter, min_freq=1, specials=['<unk>'])
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x) - 1  # Adjust labels to start from 0

# Convert datasets to a list of tuples
train_data = [(label_pipeline(label), text_pipeline(text)) for (label, text) in train_dataset]
test_data = [(label_pipeline(label), text_pipeline(text)) for (label, text) in test_dataset]

# Sort the data by text lengths in ascending order
train_data.sort(key=lambda x: len(x[1]))
test_data.sort(key=lambda x: len(x[1]))

# Create batches with equal-sized sequences
def create_batches(data, batch_size):
    batches = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        labels, texts = zip(*batch)
        sequences = [torch.tensor(seq) for seq in texts]
        padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
        batches.append((torch.tensor(labels), padded_sequences))
    return batches

# Create the data loaders
train_batches = create_batches(train_data, batch_size)
test_batches = create_batches(test_data, batch_size)

# Initialize the model
model = RNNClassifier(len(vocab), embed_dim, hidden_dim, num_classes).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    for batch, (labels, inputs) in enumerate(train_batches):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            print(f"Epoch: {epoch+1}/{epochs}, Batch: {batch+1}/{len(train_batches)}, Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, inputs in test_batches:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch: {epoch+1}/{epochs}, Test Accuracy: {accuracy:.2f}%")

# Save the trained model
torch.save(model.state_dict(), 'rnn_classifier.pt')

Epoch: 1/10, Test Accuracy: 25.01%
Epoch: 2/10, Test Accuracy: 25.01%
Epoch: 3/10, Test Accuracy: 25.01%
Epoch: 4/10, Test Accuracy: 25.01%
Epoch: 5/10, Test Accuracy: 25.01%
Epoch: 6/10, Test Accuracy: 25.01%
Epoch: 7/10, Test Accuracy: 25.01%
Epoch: 8/10, Test Accuracy: 25.01%
Epoch: 9/10, Test Accuracy: 25.01%
Epoch: 10/10, Test Accuracy: 25.01%


In [137]:
# Load the saved model
model = RNNClassifier(len(vocab), embed_dim, hidden_dim, num_classes)
model.load_state_dict(torch.load('rnn_classifier.pt'))
model.eval()

# Sample input text
#input_text = "I really enjoyed the movie. The plot was engaging and the acting was superb."
input_text = "He is really rude"

# Tokenize the input text
tokenized_text = text_pipeline(input_text)

# Convert the tokenized text into a tensor
input_tensor = torch.tensor([tokenized_text])

# Make predictions
with torch.no_grad():
    output = model(input_tensor)
    _, predicted = torch.max(output.data, 1)
    class_index = predicted.item()

# Interpret the results
class_labels = ["Class A", "Class B", "Class C", "Class D"]
predicted_class = class_labels[class_index]

print(f"Input Text: {input_text}")
print(f"Predicted Class: {predicted_class}")


Input Text: He is really rude
Predicted Class: Class A
