# if it is not installed in the dockerfile, you'll need to install torchtext via the line below then restart the kernel

In [None]:
! pip install torchtext

In [3]:
from collections import Counter

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

import torch
import torch.nn as nn
import torch.nn.functional as F

# Create Dataset

In [4]:
# Sample statements
republican_statements = [
    "We need stronger border security.",
    "Lower taxes can stimulate the economy.",
    "It's essential to protect gun rights.",
    "Government regulations often hinder businesses.",
    "Healthcare should be market-driven."
]

democrat_statements = [
    "Climate change actions must be a priority.",
    "We need universal healthcare coverage.",
    "Raising the minimum wage is essential.",
    "Government should play a role in reducing income inequality.",
    "Support for public education should be increased."
]

# Labels: 0 for Republican, 1 for Democrat
statements = republican_statements + democrat_statements
labels = [0] * 5 + [1] * 5

# Preprocess Text

In [5]:
# Tokenization
tokenizer = get_tokenizer('basic_english')
tokenized_statements = [tokenizer(statement) for statement in statements]

# Building vocabulary
counter = Counter()
for statement in tokenized_statements:
    counter.update(statement)
vocab = Vocab(counter)

# Numericalizing tokens
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# Model Definition

In [6]:
class TextClassifier(nn.Module):
    def __init__(self):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(5000, 64) # vocab size, embedding dimensions
        # Additional hidden layers
        self.fc1 = nn.Linear(64, 128)
        self.dropout = nn.Dropout(.2)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        embedded = self.embedding(text).mean(0)
        x = F.relu(self.fc1(embedded))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))

# Model instance
model = TextClassifier()

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Train the Model

In [9]:
# Training loop
for epoch in range(10):
    total_loss = 0
    for statement, label in zip(statements, labels):
        # Preparing data
        text = torch.tensor(text_pipeline(statement), dtype=torch.int64)
        label = torch.tensor([label], dtype=torch.int64)
        
        # Forward pass
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output.squeeze(), label)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f'Epoch {epoch}, Loss: {total_loss/len(statements)}')

ValueError: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])) is deprecated. Please ensure they have the same size.

# Test The Model

In [None]:
def predict(statement, model, vocab, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(statement), dtype=torch.int64)
        output = model(text)
        print("Output tensor shape:", output.shape)  # Add this line to check the shape of the output tensor
        if len(output.shape) == 1:  # If output is 1-dimensional
            return output.argmax(0).item()  # Use argmax(0)
        else:
            return output.argmax(1).item()

In [None]:
# Test
new_statement = "Government intervention is necessary for fair markets."
print("Prediction:", "Democrat" if predict(new_statement, model, vocab, text_pipeline) == 1 else "Republican")
