In [None]:
# Show which GPU is attached to this runtime (driver/CUDA version, memory in use).
!nvidia-smi

Sat Jun  3 04:56:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the project folder used below lives under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the working directory before and after switching into the project folder on Drive.
# The relative "Best_model/..." and "Datasets/..." paths used later in this notebook
# are resolved against the directory set here.
!pwd
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/Media_Bias_Feature_for_the_Identification_of_Biased_Statements_in_News_Articles')
!pwd

/content
/content/drive/MyDrive/Colab Notebooks/Media_Bias_Feature_for_the_Identification_of_Biased_Statements_in_News_Articles


In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle

# Seed constant. NOTE(review): nothing in the visible part of this notebook passes it
# to a seeding call, and it is declared again in the later import cell.
RANDOM_SEED = 42

# Hybrid-based characterisation

In [None]:
!pip install transformers
!pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m128.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

In [None]:
import pandas as pd
import numpy as np
import gc

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Same value as the RANDOM_SEED declared in the first import cell of this notebook.
# NOTE(review): not referenced by any code visible in this notebook.
RANDOM_SEED = 42

In [None]:
# Name of the pretrained BERT weights; also read by BertBasedClassifier when it builds its encoder.
BERT_MODEL_NAME = 'bert-base-cased'
# Shared tokenizer instance; feature_characterise() picks it up as a notebook-level global.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Column label(s) that DTset selects from each DataFrame row before tokenizing.
TEXT = ['text']

In [None]:
class DTset(Dataset):
  """Map-style dataset that tokenizes the ``TEXT`` column(s) of a DataFrame, one row per item.

  Args:
    data: frame holding at least the column(s) listed in ``TEXT``.
    tokenizer: tokenizer whose ``encode_plus`` is called for every item.
    max_token_len: length each item is padded/truncated to by the tokenizer.
  """

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: BertTokenizer,
    max_token_len: int = 512
  ):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Return the raw text plus flattened ``input_ids`` / ``attention_mask`` for row ``index``."""
    row = self.data.iloc[index]

    # TEXT is a list of column labels, so this yields a list of strings;
    # it is handed to the tokenizer as pre-split input (is_split_into_words=True).
    text = row[TEXT].astype(str).values.tolist()

    encoded = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      is_split_into_words = True,
      return_attention_mask=True,
      return_tensors='pt'
    )

    # The tokenizer returns batched tensors; flatten drops the leading batch axis.
    return {
      "text": text,
      "input_ids": encoded["input_ids"].flatten(),
      "attention_mask": encoded["attention_mask"].flatten(),
    }

In [None]:
class BertBasedClassifier(pl.LightningModule):
  """BERT encoder followed by a single linear layer over the pooled output.

  With ``n_classes == 1`` the linear output goes through a sigmoid; otherwise
  it goes through a softmax over dim 1.
  """

  def __init__(self, n_classes: int):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.sigmoid = nn.Sigmoid()
    self.softmax = nn.Softmax(dim=1)
    self.n_classes = n_classes

  def forward(self, input_ids, attention_mask, classification=None):
    """Return activated scores for a batch; ``classification`` is accepted but not used."""
    encoded = self.bert(input_ids, attention_mask=attention_mask)
    scores = self.out(encoded.pooler_output)
    # Single-output heads use sigmoid, multi-class heads use softmax.
    activation = self.sigmoid if self.n_classes == 1 else self.softmax
    return activation(scores)

In [None]:
def feature_characterise(tagging_df, feature_size, ckpt_pth, max_token_count):
  """Score every row of ``tagging_df`` with one fine-tuned checkpoint.

  Args:
    tagging_df: DataFrame to score; DTset reads the column(s) listed in ``TEXT``.
    feature_size: number of outputs of the checkpoint's head (passed as ``n_classes``).
    ckpt_pth: path of the Lightning checkpoint to load.
    max_token_count: token length each text is padded/truncated to.

  Returns:
    A tuple ``(tags_pred_index, tags_pred_binary, tags_pred_int)``:
      * ``tags_pred_index``: numpy array with one row of model outputs per input row.
      * ``tags_pred_binary``: for ``feature_size > 1`` a list of 1-D arrays holding 1
        where the score equals the row maximum and 0 elsewhere; for a single
        output, an array of ``score > 0.5`` as 0/1.
      * ``tags_pred_int``: for ``feature_size > 1`` a list with the position of the
        (first) highest score per row; for a single output, the same 0/1 array
        as ``tags_pred_binary``.
  """
  classifier = BertBasedClassifier.load_from_checkpoint(ckpt_pth, n_classes = feature_size)
  classifier.eval()
  classifier.freeze()

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  classifier = classifier.to(device)

  tagging_dataset = DTset(
    tagging_df,
    tokenizer,
    max_token_len = max_token_count
  )

  predictions = []

  # Inference only: make sure no autograd state is recorded, and move each
  # result to the CPU straight away so nothing accumulates on the GPU.
  with torch.no_grad():
    for item in tqdm(tagging_dataset):
      prediction = classifier(
        item["input_ids"].unsqueeze(dim=0).to(device),
        item["attention_mask"].unsqueeze(dim=0).to(device)
      )
      predictions.append(prediction.flatten().cpu())

  # Drop the model before post-processing so its memory can be reclaimed.
  del classifier
  gc.collect()

  if predictions:
    tags_pred_index = torch.stack(predictions).numpy()
  else:
    # torch.stack rejects an empty list; return correctly shaped empty results instead.
    tags_pred_index = np.empty((0, feature_size), dtype=np.float32)

  if(feature_size > 1):
    # Mark every entry equal to its row maximum, and record the position of
    # the first maximum, without looping over rows in Python.
    row_max = tags_pred_index.max(axis=1, keepdims=True)
    tags_pred_binary = list(np.where(tags_pred_index == row_max, 1, 0))
    tags_pred_int = tags_pred_index.argmax(axis=1).tolist()
  else:
    tags_pred_binary = np.where(tags_pred_index > 0.5, 1, 0)
    tags_pred_int = np.where(tags_pred_index > 0.5, 1, 0)

  return tags_pred_index, tags_pred_binary, tags_pred_int

**Feature Characterization with Final Build**

Features: BABE SG2 Subjective Statements, News Headlines USA, News Articles USA, LIAR, FakeNewsTFG, NewsMTSC

In [None]:
def feature_characterise_pipeline(dt_train, dt_test):
  """Append the outputs of all six feature checkpoints to the train and test frames.

  Each checkpoint is run through ``feature_characterise`` on ``dt_train`` and then
  on ``dt_test``. Three kinds of output are collected per split: raw scores
  ("index"), 0/1 indicators ("binary") and a single integer label ("int").

  Args:
    dt_train: training DataFrame to characterise.
    dt_test: test DataFrame to characterise.

  Returns:
    Six DataFrames, each the input frame followed by the feature columns, in the
    order: index-train, index-test, binary-train, binary-test, int-train, int-test.
  """
  # (checkpoint path, number of outputs, max token count, per-output column names, int-label column name)
  feature_specs = [
    # feature: BABE SG2 subjectivity
    ("Best_model/Lexicon_of_Sentences_BABESG2_Subjective_Statements/Lexicon-of-Sentences-BABESG2-Subjective-Statements-Best-Checkpoint.ckpt",
     3, 150,
     ['objective_BABESG2', 'partly-subjective_BABESG2', 'subjective_BABESG2'],
     'subjectivity_BABESG2'),
    # feature: News Headlines USA hyper-partisan
    ("Best_model/Lexicon_of_Sentences_News_Headlines_USA/Lexicon-of-Sentences-News-Headlines-USA-Best-Checkpoint.ckpt",
     3, 150,
     ['nonpartisan_headline', 'slight-partisan_headline', 'hyperpartisan_headline'],
     'hyperpartisan_headline'),
    # feature: News Articles USA hyper-partisan
    ("Best_model/Lexicon_of_Sentences_News_Articles_USA/Lexicon-of-Sentences-News-Articles-USA-Best-Checkpoint.ckpt",
     3, 512,
     ['nonpartisan_article', 'slight-partisan_article', 'hyperpartisan_article'],
     'hyperpartisan_article'),
    # feature: LIAR fake news
    ("Best_model/Lexicon_of_Sentences_LIAR/Lexicon-of-Sentences-LIAR-Best-Checkpoint.ckpt",
     6, 150,
     ['true_LIAR', 'mostly-true_LIAR', 'half-true_LIAR', 'barely-true_LIAR', 'false_LIAR', 'pants-fire_LIAR'],
     'bluff_LIAR'),
    # feature: FakeNewsTFG fake news
    ("Best_model/Lexicon_of_Sentences_FakeNewsTFG/Lexicon-of-Sentences-FakeNewsTFG-Best-Checkpoint.ckpt",
     1, 150,
     ['bluff_FakeNewsTFG'],
     'bluff_FakeNewsTFG'),
    # feature: NewsMTSC news sentiment
    ("Best_model/Lexicon_of_Sentences_NewsMTSC/Lexicon-of-Sentences-NewsMTSC-Best-Checkpoint.ckpt",
     3, 150,
     ['negative_NewsMTSC', 'neutral_NewsMTSC', 'positive_NewsMTSC'],
     'sentiment_NewsMTSC'),
  ]

  splits = (dt_train, dt_test)

  # One list of frames per output kind and split (position 0 = train, 1 = test).
  # Each list starts with the input frame so its columns come first in the result.
  index_parts = [[dt_train], [dt_test]]
  binary_parts = [[dt_train], [dt_test]]
  int_parts = [[dt_train], [dt_test]]

  for ckpt_dir, feature_size, max_token_count, class_columns, int_column in feature_specs:
    for split_pos, split_df in enumerate(splits):
      index_arr, binary_arr, int_arr = feature_characterise(split_df, feature_size, ckpt_dir, max_token_count)

      # Build every feature frame on the input frame's own index: concat(axis=1)
      # aligns on index labels, so a fresh 0..n-1 index would misalign rows
      # whenever the input does not carry a default RangeIndex.
      index_parts[split_pos].append(pd.DataFrame(index_arr, columns=class_columns, index=split_df.index))
      binary_parts[split_pos].append(pd.DataFrame(binary_arr, columns=class_columns, index=split_df.index))
      int_parts[split_pos].append(pd.DataFrame(int_arr, columns=[int_column], index=split_df.index))

  return (
    pd.concat(index_parts[0], axis = 1),
    pd.concat(index_parts[1], axis = 1),
    pd.concat(binary_parts[0], axis = 1),
    pd.concat(binary_parts[1], axis = 1),
    pd.concat(int_parts[0], axis = 1),
    pd.concat(int_parts[1], axis = 1),
  )

In [None]:
# Load the BABE SG2 train/test splits; the relative paths resolve against the
# project folder selected earlier with os.chdir.
BABESG2_train_df = pd.read_csv("Datasets/Splits/Train/final_labels_SG2_label_bias_train.csv")
BABESG2_test_df = pd.read_csv("Datasets/Splits/Test/final_labels_SG2_label_bias_test.csv")

In [None]:
# Run all feature checkpoints over both splits; the six results are the
# index / binary / int outputs, each for train and then test.
(
  feature_characterised_index_output_BABESG2_train_df,
  feature_characterised_index_output_BABESG2_test_df,
  feature_characterised_binary_output_BABESG2_train_df,
  feature_characterised_binary_output_BABESG2_test_df,
  feature_characterised_int_output_BABESG2_train_df,
  feature_characterised_int_output_BABESG2_test_df,
) = feature_characterise_pipeline(BABESG2_train_df, BABESG2_test_df)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

In [None]:
# Preview the first five rows of the index-output train frame.
feature_characterised_index_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,slight-partisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0.898398,0.101066,0.000536,0.14119,0.657582,0.201228,0.00049,0.001449,...,0.0115,0.007012,0.075582,0.086666,0.813493,0.005746,0.003119,0.995946,0.00333,0.000724
1,Activist Dorian Wilde said anti-LGBT statement...,0,0.996902,0.002536,0.000562,0.000708,0.995503,0.003789,0.0117,0.820799,...,0.008342,0.024063,0.391299,0.256079,0.311679,0.008538,0.000533,0.010452,0.987198,0.00235
2,U.S. President Donald Trump retweeted a video ...,1,0.997604,0.001978,0.000418,0.017055,0.938557,0.044387,0.001611,0.000991,...,0.014166,0.028831,0.537781,0.275299,0.131245,0.012678,0.001078,0.99267,0.006891,0.000439
3,Everything in the progressive agenda is aimed ...,1,0.000609,0.004326,0.995066,0.000209,0.991865,0.007926,0.001661,0.005381,...,0.005221,0.013751,0.144046,0.274683,0.555828,0.006472,0.223309,0.537822,0.007349,0.454829
4,Even pro-life leaders have expressed their app...,0,0.994585,0.005081,0.000334,0.000169,0.999602,0.000228,0.021303,0.044221,...,0.193611,0.540472,0.236409,0.008222,0.016375,0.004911,0.03162,0.000669,0.001619,0.997712


In [None]:
# Preview the first five rows of the index-output test frame.
feature_characterised_index_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,slight-partisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,0.997487,0.001771,0.000742,0.99939,0.000528,8.3e-05,0.139669,0.761316,...,0.00812,0.007542,0.068742,0.271123,0.638053,0.006419,0.000275,0.996193,0.00332,0.000487
1,The legislation is the most sweeping college a...,0,0.983527,0.015457,0.001016,0.000227,0.999403,0.000371,0.077666,0.83649,...,0.125085,0.543168,0.311038,0.007111,0.011672,0.001925,0.011142,0.025151,0.023878,0.950971
2,Left-wing Christians declare that the real way...,1,0.001486,0.105182,0.893333,0.001895,0.149217,0.848887,0.011861,0.023174,...,0.007114,0.014652,0.147767,0.336381,0.48641,0.007676,0.003512,0.390852,0.003499,0.605649
3,Perhaps the pushback from the Pentagon will in...,1,0.001157,0.092227,0.906616,0.005944,0.192958,0.801098,0.018832,0.087844,...,0.015459,0.049777,0.601219,0.187316,0.139155,0.007074,0.246112,0.99721,0.001474,0.001316
4,Trump's latest conspiracy theory seems to echo...,0,0.002653,0.261513,0.735834,0.004262,0.025038,0.9707,0.000775,0.001013,...,0.008646,0.015451,0.236943,0.330947,0.399992,0.008021,0.005408,0.997835,0.0015,0.000665


In [None]:
# Preview the first five rows of the binary-output train frame.
feature_characterised_binary_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,slight-partisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,Activist Dorian Wilde said anti-LGBT statement...,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
2,U.S. President Donald Trump retweeted a video ...,1,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,Everything in the progressive agenda is aimed ...,1,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,Even pro-life leaders have expressed their app...,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Preview the first five rows of the binary-output test frame.
feature_characterised_binary_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,slight-partisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,1,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
1,The legislation is the most sweeping college a...,0,1,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
2,Left-wing Christians declare that the real way...,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,Perhaps the pushback from the Pentagon will in...,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,Trump's latest conspiracy theory seems to echo...,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0


In [None]:
# Preview the first five rows of the int-output train frame.
feature_characterised_int_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0,1,2,4,0,0
1,Activist Dorian Wilde said anti-LGBT statement...,0,0,1,1,2,0,1
2,U.S. President Donald Trump retweeted a video ...,1,0,1,2,2,0,0
3,Everything in the progressive agenda is aimed ...,1,2,1,2,4,0,0
4,Even pro-life leaders have expressed their app...,0,0,1,2,1,0,2


In [None]:
# Preview the first five rows of the int-output test frame.
feature_characterised_int_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,0,0,1,4,0,0
1,The legislation is the most sweeping college a...,0,0,1,1,1,0,2
2,Left-wing Christians declare that the real way...,1,2,2,2,4,0,2
3,Perhaps the pushback from the Pentagon will in...,1,2,2,2,2,0,0
4,Trump's latest conspiracy theory seems to echo...,0,2,2,2,4,0,0


In [None]:
# Save the index-output train/test frames as CSV; index=False leaves the
# DataFrame row index out of the files.
feature_characterised_index_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_index_output_train.csv',
    index=False,
)
feature_characterised_index_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_index_output_test.csv',
    index=False,
)

In [None]:
# Save the binary-output train/test frames as CSV; index=False leaves the
# DataFrame row index out of the files.
feature_characterised_binary_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_binary_output_train.csv',
    index=False,
)
feature_characterised_binary_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_binary_output_test.csv',
    index=False,
)

In [None]:
# Save the int-output train/test frames as CSV; index=False leaves the
# DataFrame row index out of the files.
feature_characterised_int_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_int_output_train.csv',
    index=False,
)
feature_characterised_int_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_int_output_test.csv',
    index=False,
)

Feature: BABE SG2 Biased Statements

In [None]:
# Load the saved hybrid-characterised train tables, one per output variant
# (index, binary, int), in that order.
(
    fdt_hb_BABESG2_index_output_train,
    fdt_hb_BABESG2_binary_output_train,
    fdt_hb_BABESG2_int_output_train,
) = map(
    pd.read_csv,
    (
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_index_output_train.csv',
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_binary_output_train.csv',
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_int_output_train.csv',
    ),
)

In [None]:
# Strip the label column so only `text` plus the hybrid feature columns remain
# on the right-hand side of the later merge on `text`.
#
# `bias_BABESG2` is dropped as well, and errors='ignore' is used, because:
#   * the save cells at the end of this section overwrite these same CSVs with
#     an added `bias_BABESG2` column, so on a subsequent run the loaded table
#     already contains it and the merge would emit `bias_BABESG2_x`/`_y`;
#   * this cell reassigns its own inputs, so without errors='ignore' running
#     it a second time raises KeyError on the already-removed `bias` column.
# On a fresh run (CSV has `bias`, no `bias_BABESG2`) the result is unchanged.
fdt_hb_BABESG2_index_output_train = fdt_hb_BABESG2_index_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_binary_output_train = fdt_hb_BABESG2_binary_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_int_output_train = fdt_hb_BABESG2_int_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)

In [None]:
# Load the saved hybrid-characterised test tables, one per output variant
# (index, binary, int), in that order.
(
    fdt_hb_BABESG2_index_output_test,
    fdt_hb_BABESG2_binary_output_test,
    fdt_hb_BABESG2_int_output_test,
) = map(
    pd.read_csv,
    (
        'Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_index_output_test.csv',
        'Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_binary_output_test.csv',
        'Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_int_output_test.csv',
    ),
)

In [None]:
# Strip the label column so only `text` plus the hybrid feature columns remain
# on the right-hand side of the later merge on `text`.
#
# `bias_BABESG2` is dropped as well, and errors='ignore' is used, because:
#   * the save cells at the end of this section overwrite these same CSVs with
#     an added `bias_BABESG2` column, so on a subsequent run the loaded table
#     already contains it and the merge would emit `bias_BABESG2_x`/`_y`;
#   * this cell reassigns its own inputs, so without errors='ignore' running
#     it a second time raises KeyError on the already-removed `bias` column.
# On a fresh run (CSV has `bias`, no `bias_BABESG2`) the result is unchanged.
fdt_hb_BABESG2_index_output_test = fdt_hb_BABESG2_index_output_test.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_binary_output_test = fdt_hb_BABESG2_binary_output_test.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_int_output_test = fdt_hb_BABESG2_int_output_test.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)

In [None]:
# --- 1. Inputs -------------------------------------------------------------
# Raw BABE SG2 train/test splits that receive the new `bias_BABESG2` feature.
BABESG2_train_dt = pd.read_csv("Datasets/Splits/Train/final_labels_SG2_label_bias_train.csv")
BABESG2_test_dt = pd.read_csv("Datasets/Splits/Test/final_labels_SG2_label_bias_test.csv")

# Checkpoint passed to feature_characterise for both splits.
ckpt_dir = "Best_model/Lexicon_of_Sentences_BABESG2_Biased_Statements/Lexicon-of-Sentences-BABESG2-Biased-Statements-Best-Checkpoint.ckpt"

# --- 2. Model outputs ------------------------------------------------------
# feature_characterise yields three results per split, unpacked here as the
# index, binary and int outputs (in that order).
# NOTE(review): the meaning of the positional arguments `1` and `150` is
# defined by feature_characterise, which lives outside this cell.
(
    BABESG2_bias_tag_index_output_train_arr,
    BABESG2_bias_tag_binary_output_train_arr,
    BABESG2_bias_tag_int_output_train_arr,
) = feature_characterise(BABESG2_train_dt, 1, ckpt_dir, 150)
(
    BABESG2_bias_tag_index_output_test_arr,
    BABESG2_bias_tag_binary_output_test_arr,
    BABESG2_bias_tag_int_output_test_arr,
) = feature_characterise(BABESG2_test_dt, 1, ckpt_dir, 150)

# --- 3. One-column frames named `bias_BABESG2` ------------------------------
BABESG2_bias_tag_index_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_index_output_train_arr, columns=['bias_BABESG2'])
BABESG2_bias_tag_binary_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_binary_output_train_arr, columns=['bias_BABESG2'])
BABESG2_bias_tag_int_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_int_output_train_arr, columns=['bias_BABESG2'])

BABESG2_bias_tag_index_output_test_df = pd.DataFrame(data=BABESG2_bias_tag_index_output_test_arr, columns=['bias_BABESG2'])
BABESG2_bias_tag_binary_output_test_df = pd.DataFrame(data=BABESG2_bias_tag_binary_output_test_arr, columns=['bias_BABESG2'])
BABESG2_bias_tag_int_output_test_df = pd.DataFrame(data=BABESG2_bias_tag_int_output_test_arr, columns=['bias_BABESG2'])

# --- 4. Append the new column to the raw split (aligned on row index) ------
# The `ouput` spelling in these names is kept as-is in case later cells
# reference them.
feature_characterised_index_ouput_train_df = pd.concat(
    objs=[BABESG2_train_dt, BABESG2_bias_tag_index_output_train_df],
    axis='columns',
)
feature_characterised_binary_ouput_train_df = pd.concat(
    objs=[BABESG2_train_dt, BABESG2_bias_tag_binary_output_train_df],
    axis='columns',
)
feature_characterised_int_ouput_train_df = pd.concat(
    objs=[BABESG2_train_dt, BABESG2_bias_tag_int_output_train_df],
    axis='columns',
)

feature_characterised_index_ouput_test_df = pd.concat(
    objs=[BABESG2_test_dt, BABESG2_bias_tag_index_output_test_df],
    axis='columns',
)
feature_characterised_binary_ouput_test_df = pd.concat(
    objs=[BABESG2_test_dt, BABESG2_bias_tag_binary_output_test_df],
    axis='columns',
)
feature_characterised_int_ouput_test_df = pd.concat(
    objs=[BABESG2_test_dt, BABESG2_bias_tag_int_output_test_df],
    axis='columns',
)

# --- 5. Attach the previously computed hybrid features by sentence text ----
# NOTE(review): a left merge on `text` repeats left rows whenever `text` is
# not unique in the right-hand table -- confirm the sentences are unique.
feature_characterised_index_output_BABESG2_train_df = pd.merge(
    left=feature_characterised_index_ouput_train_df,
    right=fdt_hb_BABESG2_index_output_train,
    how='left',
    on='text',
)
feature_characterised_binary_output_BABESG2_train_df = pd.merge(
    left=feature_characterised_binary_ouput_train_df,
    right=fdt_hb_BABESG2_binary_output_train,
    how='left',
    on='text',
)
feature_characterised_int_output_BABESG2_train_df = pd.merge(
    left=feature_characterised_int_ouput_train_df,
    right=fdt_hb_BABESG2_int_output_train,
    how='left',
    on='text',
)

feature_characterised_index_output_BABESG2_test_df = pd.merge(
    left=feature_characterised_index_ouput_test_df,
    right=fdt_hb_BABESG2_index_output_test,
    how='left',
    on='text',
)
feature_characterised_binary_output_BABESG2_test_df = pd.merge(
    left=feature_characterised_binary_ouput_test_df,
    right=fdt_hb_BABESG2_binary_output_test,
    how='left',
    on='text',
)
feature_characterised_int_output_BABESG2_test_df = pd.merge(
    left=feature_characterised_int_ouput_test_df,
    right=fdt_hb_BABESG2_int_output_test,
    how='left',
    on='text',
)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2571 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1102 [00:00<?, ?it/s]

In [None]:
# Preview the first five rows of the index-output train frame, now including `bias_BABESG2`.
feature_characterised_index_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0.935469,0.898398,0.101066,0.000536,0.14119,0.657582,0.201228,0.00049,...,0.0115,0.007012,0.075582,0.086666,0.813493,0.005746,0.003119,0.995946,0.00333,0.000724
1,Activist Dorian Wilde said anti-LGBT statement...,0,0.116028,0.996902,0.002536,0.000562,0.000708,0.995503,0.003789,0.0117,...,0.008342,0.024063,0.391299,0.256079,0.311679,0.008538,0.000533,0.010452,0.987198,0.00235
2,U.S. President Donald Trump retweeted a video ...,1,0.411189,0.997604,0.001978,0.000418,0.017055,0.938557,0.044387,0.001611,...,0.014166,0.028831,0.537781,0.275299,0.131245,0.012678,0.001078,0.99267,0.006891,0.000439
3,Everything in the progressive agenda is aimed ...,1,0.956674,0.000609,0.004326,0.995066,0.000209,0.991865,0.007926,0.001661,...,0.005221,0.013751,0.144046,0.274683,0.555828,0.006472,0.223309,0.537822,0.007349,0.454829
4,Even pro-life leaders have expressed their app...,0,0.271919,0.994585,0.005081,0.000334,0.000169,0.999602,0.000228,0.021303,...,0.193611,0.540472,0.236409,0.008222,0.016375,0.004911,0.03162,0.000669,0.001619,0.997712


In [None]:
# Preview the first five rows of the index-output test frame, now including `bias_BABESG2`.
feature_characterised_index_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,0.049884,0.997487,0.001771,0.000742,0.99939,0.000528,8.3e-05,0.139669,...,0.00812,0.007542,0.068742,0.271123,0.638053,0.006419,0.000275,0.996193,0.00332,0.000487
1,The legislation is the most sweeping college a...,0,0.094605,0.983527,0.015457,0.001016,0.000227,0.999402,0.000371,0.077666,...,0.125085,0.543168,0.311038,0.007111,0.011672,0.001925,0.011142,0.025151,0.023878,0.950971
2,Left-wing Christians declare that the real way...,1,0.964535,0.001486,0.105182,0.893333,0.001895,0.149217,0.848887,0.011861,...,0.007114,0.014652,0.147767,0.336381,0.48641,0.007676,0.003512,0.390852,0.003499,0.605649
3,Perhaps the pushback from the Pentagon will in...,1,0.43621,0.001157,0.092227,0.906616,0.005944,0.192958,0.801098,0.018832,...,0.015459,0.049777,0.601219,0.187316,0.139155,0.007074,0.246112,0.99721,0.001474,0.001316
4,Trump's latest conspiracy theory seems to echo...,0,0.974726,0.002653,0.261513,0.735834,0.004262,0.025038,0.9707,0.000775,...,0.008646,0.015451,0.236943,0.330947,0.399992,0.008021,0.005408,0.997835,0.0015,0.000665


In [None]:
# Preview the first five rows of the binary-output train frame, now including `bias_BABESG2`.
feature_characterised_binary_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
1,Activist Dorian Wilde said anti-LGBT statement...,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
2,U.S. President Donald Trump retweeted a video ...,1,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
3,Everything in the progressive agenda is aimed ...,1,1,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,Even pro-life leaders have expressed their app...,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Preview the first five rows of the binary-output test frame, now including `bias_BABESG2`.
feature_characterised_binary_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,The legislation is the most sweeping college a...,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,Left-wing Christians declare that the real way...,1,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,Perhaps the pushback from the Pentagon will in...,1,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
4,Trump's latest conspiracy theory seems to echo...,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [None]:
# Preview the first five rows of the int-output train frame, now including `bias_BABESG2`.
feature_characterised_int_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,1,0,1,2,4,0,0
1,Activist Dorian Wilde said anti-LGBT statement...,0,0,0,1,1,2,0,1
2,U.S. President Donald Trump retweeted a video ...,1,0,0,1,2,2,0,0
3,Everything in the progressive agenda is aimed ...,1,1,2,1,2,4,0,0
4,Even pro-life leaders have expressed their app...,0,0,0,1,2,1,0,2


In [None]:
# Preview the first five rows of the int-output test frame, now including `bias_BABESG2`.
feature_characterised_int_output_BABESG2_test_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Protests erupted in Louisville and several oth...,0,0,0,0,1,4,0,0
1,The legislation is the most sweeping college a...,0,0,0,1,1,1,0,2
2,Left-wing Christians declare that the real way...,1,1,2,2,2,4,0,2
3,Perhaps the pushback from the Pentagon will in...,1,0,2,2,2,2,0,0
4,Trump's latest conspiracy theory seems to echo...,0,1,2,2,2,4,0,0


In [None]:
# Save the index-output train/test frames as CSV (row index omitted).
# NOTE: these are the same paths the hybrid feature tables were read from at
# the start of this section, so the files on disk are overwritten and gain
# the `bias_BABESG2` column.
feature_characterised_index_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_index_output_train.csv',
    index=False,
)
feature_characterised_index_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_index_output_test.csv',
    index=False,
)

In [None]:
# Save the binary-output train/test frames as CSV (row index omitted).
# NOTE: these are the same paths the hybrid feature tables were read from at
# the start of this section, so the files on disk are overwritten and gain
# the `bias_BABESG2` column.
feature_characterised_binary_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_binary_output_train.csv',
    index=False,
)
feature_characterised_binary_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_binary_output_test.csv',
    index=False,
)

In [None]:
# Save the int-output train/test frames as CSV (row index omitted).
# NOTE: these are the same paths the hybrid feature tables were read from at
# the start of this section, so the files on disk are overwritten and gain
# the `bias_BABESG2` column.
feature_characterised_int_output_BABESG2_train_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_int_output_train.csv',
    index=False,
)
feature_characterised_int_output_BABESG2_test_df.to_csv(
    path_or_buf='Ground_truth_datasets_characterised/Splits/Test/BABESG2_hybrid_characterised_int_output_test.csv',
    index=False,
)

**Feature Characterization with K-Fold Build**

Feature: BABE SG2 Biased Statements

In [None]:
# Reload the saved hybrid-characterised train tables (index, binary, int, in
# that order) for use as the right-hand side of the K-fold merges.
(
    fdt_hb_BABESG2_index_output_train,
    fdt_hb_BABESG2_binary_output_train,
    fdt_hb_BABESG2_int_output_train,
) = map(
    pd.read_csv,
    (
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_index_output_train.csv',
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_binary_output_train.csv',
        'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_int_output_train.csv',
    ),
)

In [None]:
# Remove the label (`bias`) and the full-train `bias_BABESG2` column so each
# fold can attach its own `bias_BABESG2` without a column-name clash in the
# merge on `text`.
# errors='ignore' keeps the cell idempotent: it reassigns its own inputs, so
# a second run would otherwise raise KeyError on the already-dropped columns.
fdt_hb_BABESG2_index_output_train = fdt_hb_BABESG2_index_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_binary_output_train = fdt_hb_BABESG2_binary_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)
fdt_hb_BABESG2_int_output_train = fdt_hb_BABESG2_int_output_train.drop(
    columns=['bias', 'bias_BABESG2'], errors='ignore'
)

In [None]:
# For folds 1..5: score the fold's train/validation split with that fold's
# checkpoint, attach the hybrid features by sentence text, and write the
# per-fold CSVs.
for k in range(1, 6):
    # --- 1. Inputs for this fold -------------------------------------------
    BABESG2_train_dt = pd.read_csv(f"Datasets/Splits/Train/final_labels_SG2_label_bias_train_KFold{k}.csv")
    BABESG2_val_dt = pd.read_csv(f"Datasets/Splits/Validation/final_labels_SG2_label_bias_val_KFold{k}.csv")

    ckpt_dir = f"Best_model/Lexicon_of_Sentences_BABESG2_Biased_Statements/Lexicon-of-Sentences-BABESG2-Biased-Statements-KFold{k}-Best-Checkpoint.ckpt"

    # --- 2. Model outputs (index, binary, int -- in that order) ------------
    # NOTE(review): the meaning of the positional arguments `1` and `150` is
    # defined by feature_characterise, which lives outside this cell.
    (
        BABESG2_bias_tag_index_output_train_arr,
        BABESG2_bias_tag_binary_output_train_arr,
        BABESG2_bias_tag_int_output_train_arr,
    ) = feature_characterise(BABESG2_train_dt, 1, ckpt_dir, 150)
    (
        BABESG2_bias_tag_index_output_val_arr,
        BABESG2_bias_tag_binary_output_val_arr,
        BABESG2_bias_tag_int_output_val_arr,
    ) = feature_characterise(BABESG2_val_dt, 1, ckpt_dir, 150)

    # --- 3. One-column frames named `bias_BABESG2` --------------------------
    BABESG2_bias_tag_index_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_index_output_train_arr, columns=['bias_BABESG2'])
    BABESG2_bias_tag_binary_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_binary_output_train_arr, columns=['bias_BABESG2'])
    BABESG2_bias_tag_int_output_train_df = pd.DataFrame(data=BABESG2_bias_tag_int_output_train_arr, columns=['bias_BABESG2'])

    BABESG2_bias_tag_index_output_val_df = pd.DataFrame(data=BABESG2_bias_tag_index_output_val_arr, columns=['bias_BABESG2'])
    BABESG2_bias_tag_binary_output_val_df = pd.DataFrame(data=BABESG2_bias_tag_binary_output_val_arr, columns=['bias_BABESG2'])
    BABESG2_bias_tag_int_output_val_df = pd.DataFrame(data=BABESG2_bias_tag_int_output_val_arr, columns=['bias_BABESG2'])

    # --- 4. Append the new column to the raw fold (aligned on row index) ---
    # The `ouput` spelling in these names is kept as-is in case later cells
    # reference them.
    feature_characterised_index_ouput_train_df = pd.concat(
        objs=[BABESG2_train_dt, BABESG2_bias_tag_index_output_train_df],
        axis='columns',
    )
    feature_characterised_binary_ouput_train_df = pd.concat(
        objs=[BABESG2_train_dt, BABESG2_bias_tag_binary_output_train_df],
        axis='columns',
    )
    feature_characterised_int_ouput_train_df = pd.concat(
        objs=[BABESG2_train_dt, BABESG2_bias_tag_int_output_train_df],
        axis='columns',
    )

    feature_characterised_index_ouput_val_df = pd.concat(
        objs=[BABESG2_val_dt, BABESG2_bias_tag_index_output_val_df],
        axis='columns',
    )
    feature_characterised_binary_ouput_val_df = pd.concat(
        objs=[BABESG2_val_dt, BABESG2_bias_tag_binary_output_val_df],
        axis='columns',
    )
    feature_characterised_int_ouput_val_df = pd.concat(
        objs=[BABESG2_val_dt, BABESG2_bias_tag_int_output_val_df],
        axis='columns',
    )

    # --- 5. Attach hybrid features by sentence text -------------------------
    # Both train and validation folds are merged against the *train-split*
    # hybrid tables (presumably because every validation fold is carved out
    # of the train split -- confirm).
    # NOTE(review): a left merge on `text` repeats left rows whenever `text`
    # is not unique in the right-hand table -- confirm sentences are unique.
    feature_characterised_index_output_BABESG2_train_df = pd.merge(
        left=feature_characterised_index_ouput_train_df,
        right=fdt_hb_BABESG2_index_output_train,
        how='left',
        on='text',
    )
    feature_characterised_index_output_BABESG2_val_df = pd.merge(
        left=feature_characterised_index_ouput_val_df,
        right=fdt_hb_BABESG2_index_output_train,
        how='left',
        on='text',
    )

    feature_characterised_binary_output_BABESG2_train_df = pd.merge(
        left=feature_characterised_binary_ouput_train_df,
        right=fdt_hb_BABESG2_binary_output_train,
        how='left',
        on='text',
    )
    feature_characterised_binary_output_BABESG2_val_df = pd.merge(
        left=feature_characterised_binary_ouput_val_df,
        right=fdt_hb_BABESG2_binary_output_train,
        how='left',
        on='text',
    )

    feature_characterised_int_output_BABESG2_train_df = pd.merge(
        left=feature_characterised_int_ouput_train_df,
        right=fdt_hb_BABESG2_int_output_train,
        how='left',
        on='text',
    )
    feature_characterised_int_output_BABESG2_val_df = pd.merge(
        left=feature_characterised_int_ouput_val_df,
        right=fdt_hb_BABESG2_int_output_train,
        how='left',
        on='text',
    )

    # --- 6. Persist the per-fold results (row index omitted) ----------------
    feature_characterised_index_output_BABESG2_train_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_index_output_train_KFold{k}.csv',
        index=False,
    )
    feature_characterised_index_output_BABESG2_val_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Validation/BABESG2_hybrid_characterised_index_output_val_KFold{k}.csv',
        index=False,
    )

    feature_characterised_binary_output_BABESG2_train_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_binary_output_train_KFold{k}.csv',
        index=False,
    )
    feature_characterised_binary_output_BABESG2_val_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Validation/BABESG2_hybrid_characterised_binary_output_val_KFold{k}.csv',
        index=False,
    )

    feature_characterised_int_output_BABESG2_train_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Train/BABESG2_hybrid_characterised_int_output_train_KFold{k}.csv',
        index=False,
    )
    feature_characterised_int_output_BABESG2_val_df.to_csv(
        path_or_buf=f'Ground_truth_datasets_characterised/Splits/Validation/BABESG2_hybrid_characterised_int_output_val_KFold{k}.csv',
        index=False,
    )


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2056 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/515 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2057 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/514 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2057 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/514 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2057 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/514 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2057 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/514 [00:00<?, ?it/s]

In [None]:
# Preview the first five rows of the "index output" training frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_index_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Activist Dorian Wilde said anti-LGBT statement...,0,0.062496,0.996902,0.002536,0.000562,0.000708,0.995503,0.003789,0.0117,...,0.008342,0.024063,0.391299,0.256079,0.311679,0.008538,0.000533,0.010452,0.987198,0.00235
1,U.S. President Donald Trump retweeted a video ...,1,0.616126,0.997604,0.001978,0.000418,0.017055,0.938557,0.044387,0.001611,...,0.014166,0.028831,0.537781,0.275299,0.131245,0.012678,0.001078,0.99267,0.006891,0.000439
2,Everything in the progressive agenda is aimed ...,1,0.924491,0.000609,0.004326,0.995066,0.000209,0.991865,0.007926,0.001661,...,0.005221,0.013751,0.144046,0.274683,0.555828,0.006472,0.223309,0.537822,0.007349,0.454829
3,Even pro-life leaders have expressed their app...,0,0.152709,0.994585,0.005081,0.000334,0.000169,0.999602,0.000228,0.021303,...,0.193611,0.540472,0.236409,0.008222,0.016375,0.004911,0.03162,0.000669,0.001619,0.997712
4,Backlash against marginalized communities does...,0,0.392562,0.989249,0.009894,0.000857,0.003504,0.852908,0.143588,0.001164,...,0.068446,0.143811,0.662793,0.032322,0.090104,0.002523,0.173392,0.99593,0.002794,0.001277


In [None]:
# Preview the first five rows of the "index output" validation frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_index_output_BABESG2_val_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0.335689,0.898398,0.101066,0.000536,0.14119,0.657582,0.201228,0.00049,...,0.0115,0.007012,0.075582,0.086666,0.813493,0.005746,0.003119,0.995946,0.00333,0.000724
1,Trump proceeded to ignore multiple other attem...,1,0.599408,0.097506,0.899504,0.00299,0.038922,0.054464,0.906613,0.000395,...,0.010011,0.025061,0.503072,0.36439,0.084098,0.013369,0.000574,0.998218,0.001177,0.000605
2,"As a candidate, Trump recognized that China wa...",1,0.936547,0.000582,0.004573,0.994845,0.001075,0.017359,0.981566,0.000484,...,0.010887,0.013331,0.151161,0.573233,0.228471,0.022917,0.000266,0.998138,0.000987,0.000875
3,The true elegance of ranked choice voting is t...,0,0.26329,0.001187,0.008646,0.990167,0.001724,0.84873,0.149546,0.071031,...,0.04484,0.210532,0.696947,0.016638,0.027695,0.003348,0.2413,0.000757,0.000309,0.998934
4,The House Democrats’ coronavirus recovery bill...,0,0.538307,0.440177,0.558005,0.001818,0.0012,0.998621,0.000179,0.004252,...,0.006231,0.018304,0.441259,0.390897,0.134444,0.008866,0.000301,0.984273,0.013346,0.002381


In [None]:
# Preview the first five rows of the "binary output" training frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_binary_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Activist Dorian Wilde said anti-LGBT statement...,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,U.S. President Donald Trump retweeted a video ...,1,1,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,Everything in the progressive agenda is aimed ...,1,1,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,Even pro-life leaders have expressed their app...,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
4,Backlash against marginalized communities does...,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0


In [None]:
# Preview the first five rows of the "binary output" validation frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_binary_output_BABESG2_val_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,objective_BABESG2,partly-subjective_BABESG2,subjective_BABESG2,nonpartisan_headline,slight-partisan_headline,hyperpartisan_headline,nonpartisan_article,...,true_LIAR,mostly-true_LIAR,half-true_LIAR,barely-true_LIAR,false_LIAR,pants-fire_LIAR,bluff_FakeNewsTFG,negative_NewsMTSC,neutral_NewsMTSC,positive_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
1,Trump proceeded to ignore multiple other attem...,1,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
2,"As a candidate, Trump recognized that China wa...",1,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,The true elegance of ranked choice voting is t...,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,The House Democrats’ coronavirus recovery bill...,0,1,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0


In [None]:
# Preview the first five rows of the "int output" training frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_int_output_BABESG2_train_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Activist Dorian Wilde said anti-LGBT statement...,0,0,0,1,1,2,0,1
1,U.S. President Donald Trump retweeted a video ...,1,1,0,1,2,2,0,0
2,Everything in the progressive agenda is aimed ...,1,1,2,1,2,4,0,0
3,Even pro-life leaders have expressed their app...,0,0,0,1,2,1,0,2
4,Backlash against marginalized communities does...,0,0,0,1,2,2,0,0


In [None]:
# Preview the first five rows of the "int output" validation frame
# (n=5 is the pandas default for head(); spelled out here for clarity).
feature_characterised_int_output_BABESG2_val_df.head(n=5)

Unnamed: 0,text,bias,bias_BABESG2,subjectivity_BABESG2,hyperpartisan_headline,hyperpartisan_article,bluff_LIAR,bluff_FakeNewsTFG,sentiment_NewsMTSC
0,Letters from GOP members of Congress and natio...,1,0,0,1,2,4,0,0
1,Trump proceeded to ignore multiple other attem...,1,1,1,2,2,2,0,0
2,"As a candidate, Trump recognized that China wa...",1,1,2,2,2,3,0,0
3,The true elegance of ranked choice voting is t...,0,0,2,1,2,2,0,2
4,The House Democrats’ coronavirus recovery bill...,0,1,1,1,2,2,0,0
