In [1]:
# load libraries

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection  import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Read the cleaned, combined bill dataset from disk.
df = pd.read_csv("../datasets/combined.csv")

# Tokenizer for the Longformer checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

# Encode every topic string as an integer class id and store it in a new
# 'label' column; `le` is kept so ids can be mapped back to topic names.
le = LabelEncoder()
le.fit(df['topic'])
df['label'] = le.transform(df['topic'])


In [30]:
# Sorted list of the distinct class ids in the 'label' column, as plain Python ints.
x = sorted(int(label_id) for label_id in df['label'].unique())

# Number of distinct classes (shown as the cell output).
len(x)

199

In [34]:

# Run configuration and hyper-parameters.
model_name = "allenai/longformer-base-4096"
# Output width of the topic head; must match the number of distinct values
# in df['label'].
num_classes = 199
# Use the GPU when one is available and fall back to the CPU otherwise, so
# later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Every sample is padded/truncated to this many tokens by BillDataset.
max_length = 4096
learning_rate = 2e-5
batch_size = 2
num_epochs = 4

In [35]:
class BillDataset(Dataset):
    """Map-style dataset yielding one tokenized bill per index.

    `data` is indexed positionally with `.iloc` and must provide the columns
    'title', 'summary', 'label' and 'nominate_mid_1'. Text is tokenized
    lazily in `__getitem__` (not up front) so that truncation and padding
    are applied per sample.
    """

    def __init__(self, data, tokenizer, max_length=4096):
        """Store the frame and tokenizer and cache the two target columns as lists."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = list(data["label"])
        self.stances = list(data["nominate_mid_1"])

    def __len__(self):
        """Number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row `idx` and return its model inputs and targets.

        Returns a dict with 'input_ids' and 'attention_mask' (the tokenizer
        output with its leading batch dimension removed), 'label' (long
        tensor) and 'stance' (float tensor).
        """
        row = self.data.iloc[idx]
        text = f"Title: {row['title']} Summary: {row['summary']}"
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        sample = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        sample['stance'] = torch.tensor(self.stances[idx], dtype=torch.float)
        return sample

    def get_title(self, idx):
        """Return the title of row `idx` as a string."""
        row = self.data.iloc[idx]
        return str(row['title'])
    
   

In [36]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
(
    train_titles, val_titles,
    train_inputs, val_inputs,
    train_topics, val_topics,
    train_stances, val_stances,
) = train_test_split(
    df['title'].to_list(),
    df['summary'].to_list(),
    df['label'].to_list(),
    df['nominate_mid_1'],
    test_size=0.2,
    random_state=42,
)

# Reassemble each split into a DataFrame with the column names BillDataset
# reads, then replace the shuffled index with a fresh 0..n-1 range.
train_data = pd.DataFrame({
    'title': train_titles,
    'summary': train_inputs,
    'label': train_topics,
    'nominate_mid_1': train_stances,
}).reset_index(drop=True)

val_data = pd.DataFrame({
    'title': val_titles,
    'summary': val_inputs,
    'label': val_topics,
    'nominate_mid_1': val_stances,
}).reset_index(drop=True)


In [39]:
# Dataset/loader pair over the full, unsplit frame (default max_length).
dataset = BillDataset(data=df, tokenizer=tokenizer)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Per-split datasets.
train_dataset = BillDataset(train_data, tokenizer, max_length)
val_dataset = BillDataset(val_data, tokenizer, max_length)

# Training batches are reshuffled each epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# NOTE(review): despite its name this loader wraps the TRAINING dataset,
# so it is not a held-out test set - confirm that this is intended.
test_loader = DataLoader(train_dataset, shuffle=True, batch_size=2)

In [None]:
# Count the training samples whose stance target is exactly 0.0.
#
# The targets are read from the dataset's cached `stances` list instead of
# iterating the dataset itself: indexing the dataset tokenizes (and pads)
# every sample, which is wasted work when only the target is inspected.
# Casting to float32 mirrors the dtype __getitem__ uses for 'stance', so the
# comparison gives the same result as checking each item's 'stance' tensor.
stance_targets = torch.tensor(train_dataset.stances, dtype=torch.float)
num = int((stance_targets == 0.0).sum().item())

print(f"Number of items with stance 0.0: {num}")


Number of items with stance 0.0: 1961


: 

'To amend chapter 44 of title 18, United States Code, to prohibit the distribution of 3D printer plans for the printing of firearms, and for other purposes.'

In [9]:
class PolitcalModel(nn.Module):
    """Pretrained encoder with a topic-classification head and a stance-regression head.

    Both heads read the same pooled representation: the mean of the
    encoder's last hidden states over the real (non-padding) tokens.
    The class name keeps its original spelling so existing references
    continue to work.

    Args:
        model_name: Checkpoint identifier passed to `AutoModel.from_pretrained`.
        num_classes: Number of topic classes (output width of the topic head).
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        # Pretrained encoder backbone.
        self.model = AutoModel.from_pretrained(model_name)
        # Dropout applied to the pooled representation before both heads.
        self.dropout = nn.Dropout(p=0.2)
        hidden_size = self.model.config.hidden_size
        # Classification head: hidden dim -> num_classes topic logits.
        self.topic_head = nn.Linear(hidden_size, num_classes)
        # Regression head: hidden dim -> 1 stance value.
        self.stance_head = nn.Linear(hidden_size, 1)

    @staticmethod
    def _masked_mean(last_hidden_state, attention_mask):
        """Average hidden states over the sequence axis, ignoring masked-out positions.

        Args:
            last_hidden_state: Tensor of shape (batch, seq_len, hidden).
            attention_mask: Tensor of shape (batch, seq_len), non-zero for
                real tokens and zero for padding, or None.

        Returns:
            Tensor of shape (batch, hidden).
        """
        if attention_mask is None:
            # Without a mask every position counts as a real token.
            return last_hidden_state.mean(dim=1)
        weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * weights).sum(dim=1)
        # Clamp so a fully masked row divides by 1 instead of 0.
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / token_counts

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return `(topic_logits, stance_pred)`.

        Returns:
            topic_logits: (batch, num_classes) unnormalised class scores.
            stance_pred: (batch, 1) regression output.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # A plain mean over the sequence axis would also average the padding
        # positions, so pool with the attention mask as weights instead.
        pooled = self._masked_mean(outputs.last_hidden_state, attention_mask)
        pooled = self.dropout(pooled)
        topic_logits = self.topic_head(pooled)
        stance_pred = self.stance_head(pooled)
        return topic_logits, stance_pred

In [104]:
# Build the model and everything bound to its parameters in ONE cell.
# The optimizer holds references to the parameters of the model instance it
# was created from, so re-creating the model without also re-creating the
# optimizer would leave the optimizer updating the old instance's weights.
model = PolitcalModel(model_name, num_classes).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.3 once the monitored value (the validation
# loss passed to `scheduler.step`) has not improved for more than one epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=1)

# Topic is a multi-class classification target; stance is a regression target.
topic_loss_fn = nn.CrossEntropyLoss()
stance_loss_fn = nn.MSELoss()


In [101]:
def train(model, data_loader, optimizer, device):
    """Run one training epoch over `data_loader`.

    Each batch must be a dict with 'input_ids', 'attention_mask', 'label'
    and 'stance' tensors. The optimised objective is the unweighted sum of
    the topic loss and the stance loss, computed with the notebook-level
    `topic_loss_fn` and `stance_loss_fn`.

    Args:
        model: Module called as `model(input_ids, attention_mask)` and
            returning `(topic_logits, stance_pred)`.
        data_loader: Iterable of batches.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean of the per-batch losses, or NaN if no batch was seen.
    """
    model.train()
    running_loss = 0.0
    steps_done = 0
    progress = tqdm(data_loader, desc='Training', dynamic_ncols=True, leave=True)
    for step, batch in enumerate(progress, start=1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        stances = batch['stance'].to(device)

        topic_logits, stance_pred = model(input_ids, attention_mask)

        topic_loss = topic_loss_fn(topic_logits, labels)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when a batch holds a single sample,
        # leaving a 0-d prediction against a 1-d target.
        stance_loss = stance_loss_fn(stance_pred.squeeze(-1), stances)

        loss = topic_loss + stance_loss
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        running_loss += step_loss
        steps_done = step

        if step % 100 == 0:
            print(f"Step {step}, Loss: {step_loss:.4f}")

    return running_loss / steps_done if steps_done else float("nan")

        
        


In [100]:
def validate(model, data_loader, device):
    """Evaluate `model` on `data_loader` without updating its weights.

    Each batch must be a dict with 'input_ids', 'attention_mask', 'label'
    and 'stance' tensors. Losses are computed with the notebook-level
    `topic_loss_fn` and `stance_loss_fn`.

    Args:
        model: Module called as `model(input_ids, attention_mask)` and
            returning `(topic_logits, stance_pred)`.
        data_loader: Iterable of batches.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: `(avg_loss, accuracy, avg_mae)`, each averaged over samples:
        combined topic + stance loss, topic accuracy, and mean absolute
        stance error. All three are NaN if the loader yielded no samples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    total_mae = 0.0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Validating', dynamic_ncols=True, leave=True):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            stances = batch['stance'].to(device)
            batch_samples = labels.size(0)

            topic_logits, stance_pred = model(input_ids, attention_mask)
            # Drop only the trailing size-1 feature axis so a single-sample
            # batch keeps its batch axis and still lines up with `stances`.
            stance_pred = stance_pred.squeeze(-1)

            topic_preds = topic_logits.argmax(dim=1)
            total_correct += (topic_preds == labels).sum().item()
            total_samples += batch_samples
            total_mae += torch.abs(stance_pred - stances).sum().item()

            batch_loss = topic_loss_fn(topic_logits, labels) + stance_loss_fn(stance_pred, stances)
            # Weight each batch by its size so a smaller final batch does not
            # count as much as a full one, matching how accuracy and MAE are
            # averaged. Assumes both loss functions return a per-batch mean
            # (the default reduction) - revisit if the reduction is changed.
            total_loss += batch_loss.item() * batch_samples

    if total_samples == 0:
        # Metrics over zero samples are undefined; avoid dividing by zero.
        undefined = float("nan")
        return undefined, undefined, undefined

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    avg_mae = total_mae / total_samples

    return avg_loss, accuracy, avg_mae


In [None]:
# Lowest validation loss seen so far; any finite first result beats it.
best_val_loss = float('inf')

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/ {num_epochs}")

    # One pass over the training data, then score the held-out split.
    train(model, train_loader, optimizer, device)
    validation_loss, val_acc, val_mae = validate(model, val_loader, device)
    print(f"Val Loss: {validation_loss:.4f} | Accuracy: {val_acc:.4f} | MAE: {val_mae:.4f}")

    # Keep a separate copy of the weights whenever validation loss improves.
    if validation_loss < best_val_loss:
        best_val_loss = validation_loss
        torch.save(model.state_dict(), "/content/drive/MyDrive/datasets/best_model.pt")

    # Rolling checkpoint, overwritten every epoch, with the optimizer state
    # and the latest validation metrics alongside the weights.
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            accuracy_class=val_acc,
            mae=val_mae,
        ),
        "checkpoint.pth",
    )

    # The plateau scheduler decides on a learning-rate cut from the validation loss.
    scheduler.step(validation_loss)
    # Hand cached, unused GPU blocks back to the driver between epochs.
    torch.cuda.empty_cache()

# Weights after the last epoch, regardless of whether they were the best.
torch.save(model.state_dict(), '/content/drive/MyDrive/datasets/final_model.pth')


Epoch 1/ 1


Training:   0%|          | 0/589 [00:03<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 720.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 18.23 GiB is allocated by PyTorch, and 211.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [79]:
# Smoke test: run one batch through the model in eval mode and look at the
# resulting stance loss.
model.eval()
batch = next(iter(test_loader))

# Move every field of the batch onto `device`.
input_ids, attention_mask, labels = (
    batch[field].to(device) for field in ('input_ids', 'attention_mask', 'label')
)
stances = batch['stance'].to(device).float()

# Forward pass only; gradients are not needed for this check.
with torch.no_grad():
    topic_logits, stance_pred = model(input_ids, attention_mask)

topic_loss = topic_loss_fn(topic_logits, labels)
stance_loss = stance_loss_fn(stance_pred.squeeze(), stances)

# Shown as the cell output.
stance_loss

tensor(0.3498, device='cuda:0')