# V3 Training & Evaluation

In [None]:
!pip install datasets
!pip install transformers

import torch
import random
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import RobertaTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer, set_seed

import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv      
import gc
import re

import logging
import warnings

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downlo

In [None]:
def compute_metrics(model, testing_dataloader):
    """Compute the macro-averaged F1 score of ``model`` over a dataloader.

    Only F1 is computed; no accuracy is calculated or returned. Every batch
    is moved to the device the model's parameters live on, so the function
    does not rely on a module-level ``device`` variable being defined or
    matching the model's placement.

    Args:
        model: classifier whose forward pass accepts the batch as keyword
            arguments and returns an object exposing ``logits``.
        testing_dataloader: iterable of batches, each a dict of tensors that
            contains a ``"labels"`` entry.

    Returns:
        The macro-averaged F1 score reported by the ``f1`` metric.
    """
    metric_f1 = load_metric("f1")
    # Follow the model instead of a global so inputs and weights always agree.
    model_device = next(model.parameters()).device

    model.eval()
    # Inference only: no gradients are needed for the forward pass or argmax.
    with torch.no_grad():
        for batch in testing_dataloader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            metric_f1.add_batch(predictions=predictions, references=batch["labels"])

    return metric_f1.compute(average='macro')['f1']

In [None]:
# Run configuration shared by the cells below.
# NOTE(review): BATCH_SIZE is not referenced by the visible cells; training
# and evaluation there use a batch size of 32 -- confirm which is intended.
BATCH_SIZE = 64
# Number of stratified cross-validation folds.
N_FOLDS = 5
# Training epochs per fold.
N_EPOCHS = 3
# Weight decay and learning rate handed to TrainingArguments.
WEIGHT_DECAY=0.1
LR=5e-5
# Checkpoint used for both the tokenizer and the classification model.
MODEL_CHECKPOINT = 'roberta-base'
# Seed for set_seed() and for shuffling the cross-validation folds.
RANDOM_SEED = 42

In [None]:
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Silence log records at ERROR level and below to keep the cell output readable.
logging.disable(logging.ERROR)
# `np.warnings` was only an accidental alias of the stdlib `warnings` module and
# no longer exists in recent NumPy releases, so call the stdlib module directly.
# The warning class is looked up in `np.exceptions` when that namespace exists
# and on the top-level `np` module otherwise.
warnings.filterwarnings(
    'ignore',
    category=getattr(np, "exceptions", np).VisibleDeprecationWarning,
)
warnings.filterwarnings("ignore", category=UserWarning)

# Helper function for reproducible behavior to set the seed in random, numpy, torch
set_seed(RANDOM_SEED)

## Data preprocessing

In [None]:
# Pull the 'train' split of BABE and work on it as a pandas DataFrame.
data_babe = load_dataset("mediabiasgroup/BABE")['train'].to_pandas()
# Filter out sentences with no agreement. The explicit copy makes the result an
# independent frame, so adding a column below cannot trigger
# SettingWithCopyWarning or write to a temporary.
data_babe = data_babe[data_babe.label_bias != 'No agreement'].copy()
# Rename and make label numeric: 0 for 'Non-biased', 1 for everything else.
data_babe['label'] = (data_babe['label_bias'] != 'Non-biased').astype('int64')
data_babe.info()

# Last expression of the cell: preview the first ten rows.
data_babe.head(10)

Downloading builder script:   0%|          | 0.00/2.53k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading and preparing dataset babe/default to /root/.cache/huggingface/datasets/mediabiasgroup___babe/default/0.0.0/a62c13014c0a0666b78ffcf5a331e8044855fb362823df4ca729437b084b7d69...


Downloading data:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset babe downloaded and prepared to /root/.cache/huggingface/datasets/mediabiasgroup___babe/default/0.0.0/a62c13014c0a0666b78ffcf5a331e8044855fb362823df4ca729437b084b7d69. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3673 entries, 0 to 3673
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           3673 non-null   object
 1   news_link      3673 non-null   object
 2   outlet         3673 non-null   object
 3   topic          3673 non-null   object
 4   type           3673 non-null   object
 5   label_bias     3673 non-null   object
 6   label_opinion  3673 non-null   object
 7   biased_words   3673 non-null   object
 8   label          3673 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 287.0+ KB


Unnamed: 0,text,news_link,outlet,topic,type,label_bias,label_opinion,biased_words,label
0,"""Orange Is the New Black"" star Yael Stone is r...",https://www.foxnews.com/entertainment/australi...,Fox News,environment,right,Non-biased,Entirely factual,[],0
1,"""We have one beautiful law,"" Trump recently sa...",https://www.alternet.org/2020/06/law-and-order...,Alternet,gun control,left,Biased,Somewhat factual but also opinionated,"['bizarre', 'characteristically']",1
2,"...immigrants as criminals and eugenics, all o...",https://www.nbcnews.com/news/latino/after-step...,MSNBC,white-nationalism,left,Biased,Expresses writer’s opinion,"['criminals', 'fringe', 'extreme']",1
3,...we sounded the alarm in the early months of...,https://www.alternet.org/2019/07/fox-news-has-...,Alternet,white-nationalism,left,Biased,Somewhat factual but also opinionated,[],1
4,[Black Lives Matter] is essentially a non-fals...,http://feedproxy.google.com/~r/breitbart/~3/-v...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['cult'],1
5,[Democrats employ] their full arsenal to deleg...,https://thefederalist.com/2020/09/22/democrats...,Federalist,vaccine,,Biased,Expresses writer’s opinion,"['apparent', 'crime']",1
6,[Newsoms's] obsession with masks has created a...,https://thefederalist.com/2020/12/09/gavin-new...,Federalist,vaccine,,Biased,Expresses writer’s opinion,['obsession'],1
7,[Newsoms's] onslaught of propaganda ignores co...,https://thefederalist.com/2020/12/09/gavin-new...,Federalist,vaccine,,Biased,Expresses writer’s opinion,"['propaganda', 'vilifying', 'unimpeded']",1
8,[The police] now prefer to think of themselves...,http://feedproxy.google.com/~r/breitbart/~3/2R...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['wounds'],1
9,‘A new low’: Washington Post media critic blow...,https://www.alternet.org/2019/08/a-new-low-was...,Alternet,white-nationalism,left,Biased,Expresses writer’s opinion,"['blows', 'up', 'absurd', 'lies', 'nationalism...",1


In [None]:
# Load the V3 dataset from CSV. The path is absolute, so adjust it when the
# notebook runs somewhere the file is stored elsewhere.
data_v3 = pd.read_csv('/content/data_v3.csv')
# Print the column summary, then preview the first rows as the cell output.
data_v3.info()
data_v3.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      310 non-null    object
 1   label   310 non-null    int64 
 2   text    310 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.4+ KB


Unnamed: 0,id,label,text
0,cdc102f5-88b2-4f1c-ba7d-9970ac9669cd,0,"""You think I'm joking,"" he continued. """
1,cb534c0b-5bea-4ade-b5da-cd766f6d5ab7,0,"Speaking about the ""Divine Nine"" — the nine bl..."
2,1c03f90d-54c0-4a9b-9309-1818488d294f,0,"I know where the power is."""
3,be0165c0-fd53-438a-8fda-bb04df7d12e2,1,President Joe Biden made a bizarre race-relate...
4,cd0a2226-c30d-428f-b29d-1e6344b618ac,0,"""And, by the way, you know I'm not — I may be ..."


In [None]:
# join='inner' keeps only the columns both frames share.
# ignore_index=True renumbers the rows 0..n-1. The earlier `.reindex()` call had
# no arguments and therefore changed nothing, leaving index labels duplicated
# between the two sources.
data_full = pd.concat([data_v3, data_babe], join='inner', ignore_index=True)
data_full.info()
data_full.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3983 entries, 0 to 3673
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3983 non-null   int64 
 1   text    3983 non-null   object
dtypes: int64(1), object(1)
memory usage: 93.4+ KB


Unnamed: 0,label,text
0,0,"""You think I'm joking,"" he continued. """
1,0,"Speaking about the ""Divine Nine"" — the nine bl..."
2,0,"I know where the power is."""
3,1,President Joe Biden made a bizarre race-relate...
4,0,"""And, by the way, you know I'm not — I may be ..."


In [None]:
# Tokenizer matching MODEL_CHECKPOINT; train_and_evaluate uses it both to encode
# the text and to build the padding collator.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Training

In [None]:
def train_and_evaluate(data, run_name, batch_size=32):
    """Cross-validate a freshly initialised classifier on ``data``.

    The text is tokenised once (truncated to 128 tokens), then for each of
    the ``N_FOLDS`` stratified folds a new model is loaded from
    ``MODEL_CHECKPOINT``, fine-tuned on the training part and scored on the
    held-out part with ``compute_metrics``. Each fold's score is printed,
    followed by the mean and standard deviation over all folds.

    Args:
        data: DataFrame with a ``text`` column and a ``label`` column.
        run_name: label forwarded to ``TrainingArguments`` and used in the
            printed summary.
        batch_size: per-device training batch size and evaluation batch
            size. Defaults to 32, the value that used to be hard-coded.

    Returns:
        List with one score per fold, in fold order.
    """
    skfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

    tok = tokenizer(list(data['text']), truncation=True, padding=True, max_length=128)

    # The token lists are positional, so attach the labels positionally too.
    # Passing the Series itself would make pandas align on its index, which is
    # fragile when that index is non-contiguous or contains duplicates.
    data = pd.DataFrame({
        'input_ids': tok['input_ids'],
        'attention_mask': tok['attention_mask'],
        'label': data['label'].to_numpy(),
    })
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir='./',
        num_train_epochs=N_EPOCHS,
        per_device_train_batch_size=batch_size,
        logging_steps=20,
        disable_tqdm=False,
        save_total_limit=2,
        weight_decay=WEIGHT_DECAY,
        run_name=run_name,
        learning_rate=LR)

    scores = []

    for train_index, val_index in skfold.split(data['input_ids'], data['label']):
        token_train = Dataset.from_dict(data.iloc[train_index])
        token_valid = Dataset.from_dict(data.iloc[val_index])

        # Start every fold from the pretrained weights so folds stay independent.
        model = RobertaForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
        trainer = Trainer(model, training_args, train_dataset=token_train,
                          data_collator=data_collator, tokenizer=tokenizer)
        trainer.train()

        # evaluation
        eval_dataloader = DataLoader(token_valid, batch_size=batch_size, collate_fn=data_collator)
        scores.append(compute_metrics(model, eval_dataloader))
        print(scores[-1])

        # Drop this fold's model and trainer before the next fold allocates
        # its own, so memory use does not grow with the number of folds.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print(f"F1 score for {run_name}: {np.mean(scores)} +/- {np.std(scores)}")
    return scores

#### V3 only

In [None]:
# Cross-validate on the V3 data alone; as the cell's last expression, the
# returned list of per-fold scores is echoed below the training logs.
train_and_evaluate(data_v3, "V3 only")

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]



Step,Training Loss
20,0.5905


  metric_f1 = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

0.41509433962264153




Step,Training Loss
20,0.5747


0.41509433962264153




Step,Training Loss
20,0.6043


0.41509433962264153




Step,Training Loss
20,0.5254


0.7074410163339383




Step,Training Loss
20,0.5413


0.40952380952380957
F1 score for V3 only: 0.4724495689451345 +/- 0.11751552964314349


[0.41509433962264153,
 0.41509433962264153,
 0.41509433962264153,
 0.7074410163339383,
 0.40952380952380957]

#### BABE only

In [None]:
# Cross-validate on the BABE data alone; as the cell's last expression, the
# returned list of per-fold scores is echoed below the training logs.
train_and_evaluate(data_babe, "BABE Only")



Step,Training Loss
20,0.6642
40,0.5415
60,0.4952
80,0.5146
100,0.4682
120,0.3807
140,0.3476
160,0.2936
180,0.3543
200,0.2305


0.8102648102648102




Step,Training Loss
20,0.6293
40,0.5396
60,0.5285
80,0.517
100,0.5476
120,0.3931
140,0.3613
160,0.3563
180,0.3509
200,0.2808


0.8419307951292585




Step,Training Loss
20,0.6601
40,0.5649
60,0.457
80,0.4513
100,0.4428
120,0.3629
140,0.3755
160,0.3644
180,0.2767
200,0.1683


0.8257958433940183




Step,Training Loss
20,0.6225
40,0.5242
60,0.5152
80,0.5126
100,0.4468
120,0.3493
140,0.3621
160,0.3821
180,0.3891
200,0.2815


0.8037214484679666




Step,Training Loss
20,0.6742
40,0.5495
60,0.5141
80,0.4442
100,0.4225
120,0.3156
140,0.3262
160,0.3695
180,0.3616
200,0.2629


0.8157037720373391
F1 score for BABE Only: 0.8194833338586787 +/- 0.013352449887641716


[0.8102648102648102,
 0.8419307951292585,
 0.8257958433940183,
 0.8037214484679666,
 0.8157037720373391]

#### BABE + V3 combined

In [None]:
# Cross-validate on the combined BABE + V3 frame; as the cell's last expression,
# the returned list of per-fold scores is echoed below the training logs.
train_and_evaluate(data_full, "BABE+V3")



Step,Training Loss
20,0.6954
40,0.6521
60,0.5406
80,0.4902
100,0.4617
120,0.3955
140,0.3872
160,0.3646
180,0.3346
200,0.3372


0.8031874477623282




Step,Training Loss
20,0.6299
40,0.5544
60,0.5523
80,0.4793
100,0.4588
120,0.3292
140,0.3522
160,0.3578
180,0.3506
200,0.3787


0.8302854055428426




Step,Training Loss
20,0.6631
40,0.5975
60,0.5099
80,0.4965
100,0.5155
120,0.3861
140,0.3717
160,0.4148
180,0.3646
200,0.3538


0.8401652999575222




Step,Training Loss
20,0.6081
40,0.5584
60,0.489
80,0.4551
100,0.4834
120,0.384
140,0.3468
160,0.2945
180,0.3115
200,0.3893


0.8435534591194969




Step,Training Loss
20,0.6516
40,0.5671
60,0.5347
80,0.4831
100,0.4176
120,0.3346
140,0.3378
160,0.3461
180,0.3284
200,0.335


0.805140758873929
F1 score for BABE+V3: 0.8244664742512237 +/- 0.01715171317655662


[0.8031874477623282,
 0.8302854055428426,
 0.8401652999575222,
 0.8435534591194969,
 0.805140758873929]