## Politics inference: emotion prediction for political speeches

In [None]:
import pandas as pd
import torch 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the political speeches to be labelled; the `text` column is what gets
# tokenized later, the remaining columns are speaker/debate metadata.
speeches_csv = "../data/politics_texts.csv"
dataset = pd.read_csv(speeches_csv)
dataset

Unnamed: 0,text,speaker_party,speaker_role,speaker_name,debate_title,date,year
0,"Mr President, it was unexpected, but it is a ...",PPE,MEP,Joseph Daul,Election of the President of the European Parl...,2009-07-14,2009
1,"Mr President, on behalf of our group I would ...",S&D,MEP,Martin Schulz,Election of the President of the European Parl...,2009-07-14,2009
2,"Mr President, first of all congratulations fro...",ALDE,MEP,Guy Verhofstadt,Election of the President of the European Parl...,2009-07-14,2009
3,"Mr President, may I congratulate you on your p...",ECR,MEP,Timothy Kirkhope,Election of the President of the European Parl...,2009-07-14,2009
4,"Mr President, I am pleased that a neighbour f...",GUE/NGL,MEP,Lothar Bisky,Election of the President of the European Parl...,2009-07-14,2009
...,...,...,...,...,...,...,...
38765,"– Madam President, like the Roma community ac...",GUE/NGL,MEP,Mick Wallace,Segregation and discrimination of Roma childre...,2023-09-14,2023
38766,"– Madam President, I was happy to answer the ...",GUE/NGL,MEP,Clare Daly,Framework for ensuring a secure and sustainabl...,2023-09-14,2023
38767,"on behalf of the PPE Group. – Madam President,...",PPE,MEP,Rainer Wieland,"Parliamentarism, European citizenship and demo...",2023-09-14,2023
38768,"– Madam President, I too voted against this f...",GUE/NGL,MEP,Clare Daly,"Parliamentarism, European citizenship and demo...",2023-09-14,2023


In [3]:
# How many speeches each political group contributes (most frequent first).
party_column = dataset['speaker_party']
party_column.value_counts()

speaker_party
PPE           13733
S&D            8326
ALDE           3993
GUE/NGL        3068
Greens/EFA     2987
NI             2314
ID             2222
ECR            2127
Name: count, dtype: int64

In [4]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Device: {device}")

Device: cuda


In [5]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode every speech in one call. Each sequence is cut or padded to exactly
# 512 tokens, so the result can be returned as stacked PyTorch tensors.
speech_texts = dataset['text'].tolist()
new_encodings = tokenizer(
    speech_texts,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)

In [6]:
class EmotionDatasetInference(Dataset):
    """Label-free dataset over pre-tokenized inputs, used for inference only.

    `encodings` is a mapping from field name (it must contain 'input_ids')
    to a tensor whose first dimension indexes the samples.
    """

    def __init__(self, encodings):
        # Keep a reference to the tokenizer output; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the first dimension of 'input_ids'."""
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, index):
        """Return one sample as a dict holding row `index` of every field."""
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[index]
        return sample

In [7]:
# Wrap the tokenized speeches so they can be served in batches by a DataLoader.
data = EmotionDatasetInference(encodings=new_encodings)

In [8]:
# Sanity check: show the first ten tokens of the first three encoded speeches.
for sample_idx in range(3):
    sample = data[sample_idx]
    print(f"\nSample {sample_idx}")
    leading_ids = sample['input_ids'][:10]
    print(f"Tokens: {tokenizer.convert_ids_to_tokens(leading_ids)}")


Sample 0
Tokens: ['<s>', 'ĠMr', 'ĠPresident', ',', 'Ġit', 'Ġwas', 'Ġunexpected', ',', 'Ġbut', 'Ġit']

Sample 1
Tokens: ['<s>', 'ĠMr', 'ĠPresident', ',', 'Ġon', 'Ġbehalf', 'Ġof', 'Ġour', 'Ġgroup', 'ĠI']

Sample 2
Tokens: ['<s>', 'Mr', 'ĠPresident', ',', 'Ġfirst', 'Ġof', 'Ġall', 'Ġcongratulations', 'Ġfrom', 'Ġthe']


In [None]:
# Location of the saved classifier that is loaded with `from_pretrained` further down
# (presumably the fine-tuned RoBERTa checkpoint — confirm against the training notebook).
save_path = "../best_model_RoBERTa"

In [10]:
# Emotion label names, kept for reference: 27 emotions followed by "neutral".
emotions = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

In [11]:
# Display the number of emotion labels in the list (28 entries).
len(emotions)

28

In [12]:
# Load the saved sequence classifier from `save_path` and move it to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(save_path).to(device)
# Switch to evaluation mode (disables dropout); the call returns the model,
# so the cell also displays its architecture.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
batch_size = 64
# `shuffle` keeps its default (False), so predictions come out in the same
# order as the rows of `dataset` and can be attached as a column afterwards.
test_loader = DataLoader(data, batch_size=batch_size)


def _decode_multilabel(probs, id2label, threshold=0.5):
    """Convert per-label probabilities into label names for each row.

    Args:
        probs: 2-D tensor of shape [batch_size, num_labels] with sigmoid outputs.
        id2label: mapping from label index to label name.
        threshold: a label is selected when its probability is strictly greater.

    Returns:
        A list with one entry per row; each entry is the list of selected label
        names in index order. A row with nothing above the threshold gets its
        single most probable label, so no entry is ever empty.
    """
    preds = (probs > threshold).int()

    # Argmax fallback for rows where no label passed the threshold, applied to
    # all such rows at once rather than testing each row in a Python loop.
    no_label_rows = torch.nonzero(preds.sum(dim=1) == 0, as_tuple=True)[0]
    if no_label_rows.numel() > 0:
        top_idx = probs[no_label_rows].argmax(dim=1)
        preds[no_label_rows, top_idx] = 1

    # Move the whole batch to the host once. Comparing individual elements of a
    # device tensor in Python forces a device-to-host sync per element.
    return [
        [id2label[idx] for idx, flag in enumerate(row) if flag == 1]
        for row in preds.cpu().tolist()
    ]


all_predictions = []
id2label = model.config.id2label

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [batch_size, num_labels]

        # Multilabel classification: independent sigmoid per label, not a softmax.
        probs = torch.sigmoid(logits)
        all_predictions.extend(_decode_multilabel(probs, id2label))

# One list of label names per speech, aligned with the DataFrame rows.
dataset['predicted_emotions'] = all_predictions

In [None]:
# Persist the speeches together with their predicted emotion labels.
# NOTE(review): the row index is written as an extra unnamed column and the list-valued
# `predicted_emotions` column is stored as its string repr — confirm downstream readers expect this.
predictions_csv = "../data/dataset_politics_preds.csv"
dataset.to_csv(predictions_csv)