In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned CSV into a DataFrame, then print its column / dtype /
# non-null summary as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448 entries, 0 to 3447
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   president              3448 non-null   object
 1   question_order         3448 non-null   int64 
 2   interview_question     3448 non-null   object
 3   interview_answer       3448 non-null   object
 4   question               3448 non-null   object
 5   inaudible              3448 non-null   bool  
 6   multiple_questions     3448 non-null   bool  
 7   affirmative_questions  3448 non-null   bool  
 8   index                  3448 non-null   int64 
 9   clarity_label          3448 non-null   object
 10  evasion_label          3448 non-null   object
dtypes: bool(3), int64(2), object(6)
memory usage: 225.7+ KB


In [3]:
# Distinct evasion labels, in order of first appearance in the data.
unique_labels = df['evasion_label'].unique().tolist()

# Integer id -> label name, and the inverse lookup derived from it.
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}
label2id

{'Explicit': 0,
 'General': 1,
 'Partial/half-answer': 2,
 'Dodging': 3,
 'Implicit': 4,
 'Deflection': 5,
 'Declining to answer': 6,
 'Claims ignorance': 7,
 'Clarification': 8}

In [4]:
# Integer class id for each row, looked up from its evasion label.
df['label'] = df['evasion_label'].map(label2id)

# Cast both text inputs to str before they are handed to the tokenizer.
for text_column in ('question', 'interview_answer'):
    df[text_column] = df[text_column].astype(str)

In [5]:
# Stratified 80/20 train/validation split on the integer label; the fixed
# random_state keeps the split identical across runs.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# preserve_index=False: after the split each frame keeps its shuffled original
# row labels, which from_pandas would otherwise store as an extra
# `__index_level_0__` column in the resulting datasets.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False)
})

In [6]:
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of (question, answer) pairs.

    Args:
        examples: Batch mapping that provides the "question" and
            "interview_answer" columns.

    Returns:
        The tokenizer output for the paired inputs, truncated and padded
        to a fixed length of 256 tokens.
    """
    questions = examples["question"]
    answers = examples["interview_answer"]
    encoded = tokenizer(
        questions,
        answers,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    return encoded


# Apply the tokenizer to every split, one batch at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 2758/2758 [00:00<00:00, 8247.83 examples/s]
Map: 100%|██████████| 690/690 [00:00<00:00, 8857.73 examples/s]


In [7]:
# Seed the Python / NumPy / torch RNGs before the model is built: weights that
# are not present in the checkpoint (e.g. the new classification head) are
# randomly initialised at load time, and the Trainer only applies its own seed
# later, so without this the starting weights differ from run to run.
# 42 matches the random_state used for the train/validation split.
set_seed(42)

# Sequence-classification model with one output per evasion label; the
# id <-> label mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0: a class that is never predicted has an undefined
    # precision/F1. The default mode also scores it as 0 but emits a warning on
    # every evaluation; passing 0 explicitly keeps the value and drops the noise.
    f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1_macro": f1}

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch so the
# best checkpoint can be restored when training finishes.
training_args = TrainingArguments(
    output_dir="./results_roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the restored checkpoint by the macro-F1 that compute_metrics
    # reports, instead of the default criterion (evaluation loss).
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final evaluation on the validation split using the restored best checkpoint.
metrics = trainer.evaluate()
print(metrics)