# if it is not installed in the dockerfile, you'll need to install torchtext via the line below then restart the kernel

In [None]:
! pip install torchtext

In [1]:
from collections import Counter

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

import torch
import torch.nn as nn
import torch.nn.functional as F

# Create Dataset

In [2]:
# Sample statements
republican_statements = [
    "We need stronger border security.",
    "Lower taxes can stimulate the economy.",
    "It's essential to protect gun rights.",
    "Government regulations often hinder businesses.",
    "Healthcare should be market-driven."
]

democrat_statements = [
    "Climate change actions must be a priority.",
    "We need universal healthcare coverage.",
    "Raising the minimum wage is essential.",
    "Government should play a role in reducing income inequality.",
    "Support for public education should be increased."
]

# Labels: 0 for Republican, 1 for Democrat
statements = republican_statements + democrat_statements
labels = [0] * 5 + [1] * 5

# Preprocess Text

In [3]:
# Tokenization
tokenizer = get_tokenizer('basic_english')
tokenized_statements = [tokenizer(statement) for statement in statements]

# Building vocabulary
counter = Counter()
for statement in tokenized_statements:
    counter.update(statement)
vocab = Vocab(counter)

# Numericalizing tokens
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# Model Definition

In [4]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_dim, dropout_rate):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Additional hidden layers
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text).mean(0)
        x = F.relu(self.fc1(embedded))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Parameters
vocab_size = len(vocab)
embed_dim = 64
num_class = 2
hidden_dim = 128  # Example size of hidden dimension
dropout_rate = 0.2  # Example dropout rate

# Model instance
model = TextClassifier(vocab_size, embed_dim, num_class, hidden_dim, dropout_rate)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Train the Model

In [6]:
# Training loop
for epoch in range(10):
    total_loss = 0
    for statement, label in zip(statements, labels):
        # Preparing data
        text = torch.tensor(text_pipeline(statement), dtype=torch.int64)
        label = torch.tensor([label], dtype=torch.int64)
        
        # Forward pass
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output.unsqueeze(0), label)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f'Epoch {epoch}, Loss: {total_loss/len(statements)}')

Epoch 0, Loss: 0.7008172452449799
Epoch 1, Loss: 0.7018247842788696
Epoch 2, Loss: 0.6961971521377563
Epoch 3, Loss: 0.6932121634483337
Epoch 4, Loss: 0.7003879606723785
Epoch 5, Loss: 0.6935909330844879
Epoch 6, Loss: 0.696438866853714
Epoch 7, Loss: 0.6941162049770355
Epoch 8, Loss: 0.7073514223098755
Epoch 9, Loss: 0.6941845178604126


# Test The Model

In [7]:
def predict(statement, model, vocab, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(statement), dtype=torch.int64)
        output = model(text)
        print("Output tensor shape:", output.shape)  # Add this line to check the shape of the output tensor
        if len(output.shape) == 1:  # If output is 1-dimensional
            return output.argmax(0).item()  # Use argmax(0)
        else:
            return output.argmax(1).item()

In [8]:
# Test
new_statement = "Government intervention is necessary for fair markets."
print("Prediction:", "Democrat" if predict(new_statement, model, vocab, text_pipeline) == 1 else "Republican")


Output tensor shape: torch.Size([2])
Prediction: Democrat
