In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from tqdm import tqdm

1. Topic and sequence classification with RoBERTa

In [9]:
# Read CSV File
df = pd.read_csv("/Users/gresasmolica/Downloads/processed_data_congress.csv")

# Flatten the token lists into a single string
df['flattext'] = df['processed_text_v2'].apply(lambda tokens: ' '.join(tokens))

# Encode the text
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
df['encoded'] = df['flattext'].apply(lambda text: tokenizer.encode(text, max_length=512, truncation=True, padding='max_length'))

# Encode the labels (Congress type: House, Senate, Joint)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['congress'])

# Split into train and test
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [12]:

class CongressDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])
    
# Create the dataloaders
train_dataset = CongressDataset(train_df['encoded'].tolist(), train_df['label_encoded'].tolist())
val_dataset = CongressDataset(val_df['encoded'].tolist(), val_df['label_encoded'].tolist())

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Define Roberta-based model for sequence classification
class RoBERTaSequenceModel(nn.Module):
    def __init__(self, num_labels):
        super(RoBERTaSequenceModel, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels, hidden_dropout_prob=0.2)

    def forward(self, input_ids, attention_mask):
        return self.roberta(input_ids, attention_mask=attention_mask)['logits']

# Define the model for sequence classification
sequence_model = RoBERTaSequenceModel(num_labels=len(label_encoder.classes_))

# Define training parameters for sequence classification
sequence_optimizer = AdamW(sequence_model.parameters(), lr=1e-6, weight_decay=1e-5)
sequence_criterion = nn.CrossEntropyLoss()

# Move the model to device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
sequence_model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTaSequenceModel(
  (roberta): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                

In [13]:
# Training loop for sequence classification
num_epochs = 10
for epoch in range(num_epochs):
    sequence_model.train()
    total_loss = 0

    for inputs, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs, labels = inputs.to(device), labels.to(device)

        sequence_optimizer.zero_grad()
        outputs = sequence_model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
        loss = sequence_criterion(outputs, labels)
        loss.backward()
        sequence_optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Sequence Classification Training Loss: {average_loss}')

Epoch 1/10: 100%|██████████| 6/6 [01:06<00:00, 11.15s/it]


Sequence Classification Training Loss: 1.1497634847958882


Epoch 2/10: 100%|██████████| 6/6 [01:02<00:00, 10.36s/it]


Sequence Classification Training Loss: 1.1172093749046326


Epoch 3/10: 100%|██████████| 6/6 [01:10<00:00, 11.73s/it]


Sequence Classification Training Loss: 1.1260404586791992


Epoch 4/10: 100%|██████████| 6/6 [01:12<00:00, 12.02s/it]


Sequence Classification Training Loss: 1.128164529800415


Epoch 5/10: 100%|██████████| 6/6 [01:30<00:00, 15.07s/it]


Sequence Classification Training Loss: 1.1453304489453633


Epoch 6/10: 100%|██████████| 6/6 [01:04<00:00, 10.77s/it]


Sequence Classification Training Loss: 1.1035723288853962


Epoch 7/10: 100%|██████████| 6/6 [01:10<00:00, 11.83s/it]


Sequence Classification Training Loss: 1.1307255625724792


Epoch 8/10: 100%|██████████| 6/6 [01:08<00:00, 11.35s/it]


Sequence Classification Training Loss: 1.094250003496806


Epoch 9/10: 100%|██████████| 6/6 [01:12<00:00, 12.05s/it]


Sequence Classification Training Loss: 1.0888781150182087


Epoch 10/10: 100%|██████████| 6/6 [01:17<00:00, 12.99s/it]

Sequence Classification Training Loss: 1.0846728682518005





In [14]:
# Evaluate the sequence classification model
sequence_model.eval()
val_loss = 0
correct = 0

with torch.no_grad():
    for inputs, labels in tqdm(val_dataloader, desc=f'Sequence Classification Validation'):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = sequence_model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
        loss = sequence_criterion(outputs, labels)
        val_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()

average_val_loss = val_loss / len(val_dataloader)
accuracy = correct / len(val_dataset)
print(f'Sequence Classification Validation Loss: {average_val_loss}, Accuracy: {accuracy}')

Sequence Classification Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Sequence Classification Validation: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]

Sequence Classification Validation Loss: 1.0791288614273071, Accuracy: 0.5454545454545454





In [15]:
# Define Roberta-based model for topic classification
class RoBERTaTopicModel(nn.Module):
    def __init__(self, num_labels):
        super(RoBERTaTopicModel, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels, hidden_dropout_prob=0.2)

    def forward(self, input_ids, attention_mask):
        return self.roberta(input_ids, attention_mask=attention_mask)['logits']

# Define the model for topic classification
topic_model = RoBERTaTopicModel(num_labels=len(label_encoder.classes_))

# Define training parameters for topic classification
topic_optimizer = AdamW(topic_model.parameters(), lr=1e-6, weight_decay=1e-5)
topic_criterion = nn.CrossEntropyLoss()

# Move the model to device
topic_model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTaTopicModel(
  (roberta): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (de

In [16]:
# Training loop for topic classification
for epoch in range(num_epochs):
    topic_model.train()
    total_loss = 0

    for inputs, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs, labels = inputs.to(device), labels.to(device)

        topic_optimizer.zero_grad()
        outputs = topic_model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
        loss = topic_criterion(outputs, labels)
        loss.backward()
        topic_optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Topic Classification Training Loss: {average_loss}')

Epoch 1/10: 100%|██████████| 6/6 [01:08<00:00, 11.39s/it]


Topic Classification Training Loss: 1.102862278620402


Epoch 2/10: 100%|██████████| 6/6 [01:13<00:00, 12.32s/it]


Topic Classification Training Loss: 1.0826696157455444


Epoch 3/10: 100%|██████████| 6/6 [01:04<00:00, 10.74s/it]


Topic Classification Training Loss: 1.111359675725301


Epoch 4/10: 100%|██████████| 6/6 [01:06<00:00, 11.16s/it]


Topic Classification Training Loss: 1.0332926511764526


Epoch 5/10: 100%|██████████| 6/6 [01:03<00:00, 10.66s/it]


Topic Classification Training Loss: 1.0380191306273143


Epoch 6/10: 100%|██████████| 6/6 [01:04<00:00, 10.73s/it]


Topic Classification Training Loss: 1.0147189696629841


Epoch 7/10: 100%|██████████| 6/6 [01:14<00:00, 12.41s/it]


Topic Classification Training Loss: 1.03283029794693


Epoch 8/10: 100%|██████████| 6/6 [01:05<00:00, 10.90s/it]


Topic Classification Training Loss: 1.0269252459208171


Epoch 9/10: 100%|██████████| 6/6 [01:07<00:00, 11.24s/it]


Topic Classification Training Loss: 1.0650557279586792


Epoch 10/10: 100%|██████████| 6/6 [01:05<00:00, 10.89s/it]

Topic Classification Training Loss: 1.00411061445872





In [18]:
# Evaluate the topic classification model
topic_model.eval()
val_loss = 0
correct = 0

with torch.no_grad():
    for inputs, labels in tqdm(val_dataloader, desc=f'Topic Classification Validation'):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = topic_model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
        loss = topic_criterion(outputs, labels)
        val_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()

average_val_loss = val_loss / len(val_dataloader)
accuracy = correct / len(val_dataset)
print(f'Topic Classification Validation Loss: {average_val_loss}, Accuracy: {accuracy}')

Topic Classification Validation: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]

Topic Classification Validation Loss: 1.0457175374031067, Accuracy: 0.36363636363636365



