In [3]:
import pandas as pd

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}


In [5]:
def get_cleaned_df(split: str) -> pd.DataFrame:
  df = pd.read_parquet("hf://datasets/ailsntua/QEvasion/" + splits[split])
  df.drop(columns=['title', 'date', 'url', 'president'], axis=1, inplace=True)
  df.drop(columns=['annotator1', 'annotator2', 'annotator3', 'annotator_id', 'gpt3.5_summary', 'gpt3.5_prediction'], axis=1, inplace=True)
  conversion_dict = {
    # 'president': 'category',
    'clarity_label': 'category',
    'evasion_label': 'category',
    'question_order': 'int8',
    'index': 'int16',
    'interview_question': 'string',
    'interview_answer': 'string',
    'question': 'string',
  }

  df = df.astype(conversion_dict)
  return df

In [6]:
from sklearn.model_selection import train_test_split

full_df = get_cleaned_df('train')

train_df, test_df = train_test_split(
    full_df,
    test_size=0.10,      # 10% test, 90% train
    random_state=42,     # reproducible split
    shuffle=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

Train dataset shape: (3103, 10)
Test dataset shape: (345, 10)


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3103 entries, 3247 to 3174
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   question_order         3103 non-null   int8    
 1   interview_question     3103 non-null   string  
 2   interview_answer       3103 non-null   string  
 3   question               3103 non-null   string  
 4   inaudible              3103 non-null   bool    
 5   multiple_questions     3103 non-null   bool    
 6   affirmative_questions  3103 non-null   bool    
 7   index                  3103 non-null   int16   
 8   clarity_label          3103 non-null   category
 9   evasion_label          3103 non-null   category
dtypes: bool(3), category(2), int16(1), int8(1), string(3)
memory usage: 121.7 KB


In [9]:
train_df["evasion_label"].unique().tolist()

['Implicit',
 'Dodging',
 'Explicit',
 'Deflection',
 'General',
 'Clarification',
 'Claims ignorance',
 'Partial/half-answer',
 'Declining to answer']

In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 345 entries, 2900 to 879
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   question_order         345 non-null    int8    
 1   interview_question     345 non-null    string  
 2   interview_answer       345 non-null    string  
 3   question               345 non-null    string  
 4   inaudible              345 non-null    bool    
 5   multiple_questions     345 non-null    bool    
 6   affirmative_questions  345 non-null    bool    
 7   index                  345 non-null    int16   
 8   clarity_label          345 non-null    category
 9   evasion_label          345 non-null    category
dtypes: bool(3), category(2), int16(1), int8(1), string(3)
memory usage: 14.0 KB


In [11]:
test_df["evasion_label"].unique().tolist()

['Implicit',
 'General',
 'Explicit',
 'Deflection',
 'Dodging',
 'Declining to answer',
 'Claims ignorance',
 'Clarification',
 'Partial/half-answer']

In [12]:
train_df.sample(3)

Unnamed: 0,question_order,interview_question,interview_answer,question,inaudible,multiple_questions,affirmative_questions,index,clarity_label,evasion_label
3386,2,"Q. First, my respect to both of you, Mr. Bush,...",Precisely what we ought to do is help resolve ...,What do you think we should do to help resolve...,False,False,False,3386,Clear Reply,Explicit
2992,1,"Q. Mr. President, you made what I would descri...",Yes.,What is the U.S. going to do about the perpetr...,False,False,False,2992,Ambivalent,Dodging
2456,13,Q. So is August 2 a yellow light or a red light?,"I think people should think of--look, I'm the ...",August 2 - Yellow or Red Light?,False,False,False,2456,Ambivalent,General


In [13]:
train_df['text'] = train_df['interview_question'] + ' [SEP] ' + train_df['interview_answer']
test_df['text'] = test_df['interview_question'] + ' [SEP] ' + test_df['interview_answer']


train_df['text'].sample(2).tolist()

["Q. Mr. President, you mentioned the prospect that your successor would be dealing with the war. You'll be making your first trip to Vietnam in roughly a week. Some people are still—are looking at the war as another Vietnam war. Are they wrong to do so? And if so, why? [SEP] I think they are. I think they are. First of all, Iraq is—after the overthrow of the tyrant, voted on a Constitution that is intended to unite the whole country. And then they had elections under that Constitution, where nearly 12 million people voted for this unity Government. Secondly—which is different from Vietnam.Secondly, in terms of our troops, this is a volunteer army. Vietnam wasn't a volunteer army, as you know. And in this Volunteer Army, the troops understand the consequences of Iraq and the global war on terror. That's why reenlistment rates are up, and that's why enlistment is high.Thirdly, the support for our troops is strong here in the United States, and it wasn't during the Vietnam era. So I see 

In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders for clarity_label and evasion_label
clarity_encoder = LabelEncoder()
evasion_encoder = LabelEncoder()

# Fit and transform clarity_label for both dataframes
train_df['clarity_label_encoded'] = clarity_encoder.fit_transform(train_df['clarity_label'])
test_df['clarity_label_encoded'] = clarity_encoder.transform(test_df['clarity_label'])

# Fit and transform evasion_label for both dataframes
train_df['evasion_label_encoded'] = evasion_encoder.fit_transform(train_df['evasion_label'])
test_df['evasion_label_encoded'] = evasion_encoder.transform(test_df['evasion_label'])


train_df[['clarity_label', 'clarity_label_encoded', 'evasion_label', 'evasion_label_encoded']].sample(3)

Unnamed: 0,clarity_label,clarity_label_encoded,evasion_label,evasion_label_encoded
307,Clear Reply,2,Explicit,5
2032,Ambivalent,0,Dodging,4
2293,Ambivalent,0,General,6


In [15]:
from transformers import AutoTokenizer

# Initialize a tokenizer
model_name = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Tokenizer initialized: {tokenizer.name_or_path}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer initialized: roberta-large


In [16]:
train_tokenized = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
test_tokenized = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)


In [17]:
!pip install datasets



In [18]:
import torch
from datasets import Dataset

In [19]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, clarity_labels, evasion_labels):
        self.encodings = encodings
        self.clarity_labels = clarity_labels
        self.evasion_labels = evasion_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['clarity_labels'] = torch.tensor(self.clarity_labels[idx], dtype=torch.long)
        item['evasion_labels'] = torch.tensor(self.evasion_labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.clarity_labels)

train_dataset = CustomDataset(
    train_tokenized,
    train_df['clarity_label_encoded'].tolist(),
    train_df['evasion_label_encoded'].tolist()
)

test_dataset = CustomDataset(
    test_tokenized,
    test_df['clarity_label_encoded'].tolist(),
    test_df['evasion_label_encoded'].tolist()
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 3103
Test dataset size: 345


In [20]:
from transformers import AutoModel

# Load a pre-trained model
model = AutoModel.from_pretrained(model_name)

# Determine the number of unique labels for each task
num_clarity_labels = train_df['clarity_label_encoded'].nunique()
num_evasion_labels = train_df['evasion_label_encoded'].nunique()

print(f"Pre-trained model '{model_name}' loaded successfully.")
print(f"Number of unique clarity labels: {num_clarity_labels}")
print(f"Number of unique evasion labels: {num_evasion_labels}")

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-trained model 'roberta-large' loaded successfully.
Number of unique clarity labels: 3
Number of unique evasion labels: 9


In [21]:
import torch.nn as nn

class MultiTaskModel(nn.Module):
    def __init__(self, base_model, num_clarity_labels, num_evasion_labels):
        super().__init__()
        self.base_model = base_model
        hidden_size = self.base_model.config.hidden_size # Get hidden size from base model config

        self.clarity_classifier = nn.Linear(hidden_size, num_clarity_labels)
        self.evasion_classifier = nn.Linear(hidden_size, num_evasion_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Extract the hidden state of the first token ([CLS] token)
        cls_embedding = outputs.last_hidden_state[:, 0]

        clarity_logits = self.clarity_classifier(cls_embedding)
        evasion_logits = self.evasion_classifier(cls_embedding)

        return clarity_logits, evasion_logits


In [22]:
model = MultiTaskModel(model, num_clarity_labels, num_evasion_labels)

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Define loss functions for each task
clarity_loss_fn = nn.CrossEntropyLoss()
evasion_loss_fn = nn.CrossEntropyLoss()

# Create DataLoader objects
from torch.utils.data import DataLoader

batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Train DataLoader has {len(train_dataloader)} batches.")
print(f"Test DataLoader has {len(test_dataloader)} batches.")

Train DataLoader has 194 batches.
Test DataLoader has 22 batches.


In [23]:
# device = torch.device('cpu') # Temporarily force to CPU for debugging
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # Original line
model.to(device)

epochs = 3 # Define the number of epochs

print(f"Starting training on device: {device}")

for epoch in range(epochs):
    model.train() # Set model to training mode
    total_train_loss = 0
    for batch_idx, batch in enumerate(train_dataloader):
        # Move tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        clarity_labels = batch['clarity_labels'].to(device)
        evasion_labels = batch['evasion_labels'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        clarity_logits, evasion_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Calculate loss for each task
        clarity_loss = clarity_loss_fn(clarity_logits, clarity_labels)
        evasion_loss = evasion_loss_fn(evasion_logits, evasion_labels)

        # Combine losses
        total_loss = clarity_loss + evasion_loss

        # Backward pass and optimize
        total_loss.backward()
        optimizer.step()

        total_train_loss += total_loss.item()

        if (batch_idx + 1) % 50 == 0: # Print training loss periodically
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_dataloader)}, Loss: {total_loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} finished. Average training loss: {avg_train_loss:.4f}")

print("Training complete.")

Starting training on device: cuda
Epoch 1, Batch 50/194, Loss: 3.4187
Epoch 1, Batch 100/194, Loss: 3.1346
Epoch 1, Batch 150/194, Loss: 3.0475
Epoch 1 finished. Average training loss: 2.9486
Epoch 2, Batch 50/194, Loss: 2.7070
Epoch 2, Batch 100/194, Loss: 2.3705
Epoch 2, Batch 150/194, Loss: 2.9643
Epoch 2 finished. Average training loss: 2.8419
Epoch 3, Batch 50/194, Loss: 2.9516
Epoch 3, Batch 100/194, Loss: 2.8901
Epoch 3, Batch 150/194, Loss: 2.6371
Epoch 3 finished. Average training loss: 2.8249
Training complete.


In [24]:
import random

# Select a random index from the training dataset
random_idx = random.randint(0, len(train_df) - 1)

# Get the sample from the train_df
sample_row = train_df.iloc[random_idx]

# Extract the original text and labels
input_text = sample_row['text']
true_clarity_label_encoded = sample_row['clarity_label_encoded']
true_evasion_label_encoded = sample_row['evasion_label_encoded']

true_clarity_label = clarity_encoder.inverse_transform([true_clarity_label_encoded])[0]
true_evasion_label = evasion_encoder.inverse_transform([true_evasion_label_encoded])[0]

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)

# Move inputs to the device
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Set model to evaluation mode
model.eval()

# Get predictions
with torch.no_grad():
    clarity_logits, evasion_logits = model(input_ids=input_ids, attention_mask=attention_mask)

# Get predicted labels (indices)
predicted_clarity_label_encoded = torch.argmax(clarity_logits, dim=-1).item()
predicted_evasion_label_encoded = torch.argmax(evasion_logits, dim=-1).item()

# Decode predicted labels
predicted_clarity_label = clarity_encoder.inverse_transform([predicted_clarity_label_encoded])[0]
predicted_evasion_label = evasion_encoder.inverse_transform([predicted_evasion_label_encoded])[0]

print(f"Input Text: {input_text}")
print("\n--- True Labels ---")
print(f"Clarity Label: {true_clarity_label}")
print(f"Evasion Label: {true_evasion_label}")

print("\n--- Predicted Labels ---")
print(f"Predicted Clarity Label: {predicted_clarity_label}")
print(f"Predicted Evasion Label: {predicted_evasion_label}")


Input Text: Q. Eighty days before an election, sir? Is this the right time? [SEP] Well, wait a minute. You just threw—look, I just read last night that now New Jersey is going to try the universal mail-in voting. Well, they didn't know this. So now, all of a sudden, New Jersey is going to be hit with millions of ballots to be sent out. They didn't know anything about this.So how does a Post Office—how does a Postal Service that doesn't know about it, now all of a sudden, New Jersey is supposed to take out, and millions of ballots are going to be sent all over New Jersey?And if you look at some of the things they say, like in—take the State of Nevada, take that little scam that's going on over there with the clubhouse politician Governor. Take that, where the votes don't even have to be in until 7—they get counted 7 days after November 3. That means if—Nevada is a very important State. I think we have a great chance of winning that State. If the votes don't have to be in for 7—and they'

# Evaluation


In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [26]:
model.eval() # Set model to evaluation mode

# Initialize lists to store true labels and predictions
all_clarity_labels = []
all_clarity_preds = []
all_evasion_labels = []
all_evasion_preds = []

print("Starting evaluation on the test dataset...")

with torch.no_grad(): # Disable gradient calculation for evaluation
    for batch_idx, batch in enumerate(test_dataloader):
        # Move tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        clarity_labels = batch['clarity_labels'].to(device)
        evasion_labels = batch['evasion_labels'].to(device)

        # Forward pass
        clarity_logits, evasion_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get predicted labels
        clarity_preds = torch.argmax(clarity_logits, dim=-1)
        evasion_preds = torch.argmax(evasion_logits, dim=-1)

        # Append true labels and predictions
        all_clarity_labels.extend(clarity_labels.cpu().tolist())
        all_clarity_preds.extend(clarity_preds.cpu().tolist())
        all_evasion_labels.extend(evasion_labels.cpu().tolist())
        all_evasion_preds.extend(evasion_preds.cpu().tolist())

# Convert lists to numpy arrays for metric calculation
import numpy as np
all_clarity_labels = np.array(all_clarity_labels)
all_clarity_preds = np.array(all_clarity_preds)
all_evasion_labels = np.array(all_evasion_labels)
all_evasion_preds = np.array(all_evasion_preds)

# Calculate and print metrics for clarity_label
print("\n--- Clarity Label Evaluation ---")
clarity_accuracy = accuracy_score(all_clarity_labels, all_clarity_preds)
clarity_precision, clarity_recall, clarity_f1, _ = precision_recall_fscore_support(
    all_clarity_labels, all_clarity_preds, average='weighted', zero_division=0
)
print(f"Accuracy: {clarity_accuracy:.4f}")
print(f"Precision (weighted): {clarity_precision:.4f}")
print(f"Recall (weighted): {clarity_recall:.4f}")
print(f"F1-Score (weighted): {clarity_f1:.4f}")

# Calculate and print metrics for evasion_label
print("\n--- Evasion Label Evaluation ---")
evasion_accuracy = accuracy_score(all_evasion_labels, all_evasion_preds)
evasion_precision, evasion_recall, evasion_f1, _ = precision_recall_fscore_support(
    all_evasion_labels, all_evasion_preds, average='weighted', zero_division=0
)
print(f"Accuracy: {evasion_accuracy:.4f}")
print(f"Precision (weighted): {evasion_precision:.4f}")
print(f"Recall (weighted): {evasion_recall:.4f}")
print(f"F1-Score (weighted): {evasion_f1:.4f}")

print("\nEvaluation complete for both tasks.")

Starting evaluation on the test dataset...

--- Clarity Label Evaluation ---
Accuracy: 0.5855
Precision (weighted): 0.3428
Recall (weighted): 0.5855
F1-Score (weighted): 0.4324

--- Evasion Label Evaluation ---
Accuracy: 0.2957
Precision (weighted): 0.0874
Recall (weighted): 0.2957
F1-Score (weighted): 0.1349

Evaluation complete for both tasks.


In [29]:
# import torch, gc
# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()