Download packages

In [None]:
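# Packages to install once before running the notebook.
# (`re` and `warnings` are part of the Python standard library; `sklearn.*` is provided by scikit-learn.)
#!pip install pandas numpy scikit-learn emoji datasets transformers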


In [None]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score


from sklearn.model_selection import train_test_split

import emoji
import re
import datasets
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import warnings

Get BERT model and tokenizer

In [None]:
# Hugging Face Hub id of the pretrained Dutch checkpoint; defined once so the tokenizer and the
# model can never point at different checkpoints.
MODEL_NAME = "DTAI-KULeuven/robbert-2022-dutch-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels=2 makes explicit what the rest of the notebook relies on: a binary classifier
# (metrics use average='binary' and the predictions are compared against 'LABEL_1').
# NOTE(review): the model is loaded before any random seed is set in this notebook; if the
# classification head is freshly initialised here, runs may not be exactly reproducible —
# consider seeding before this cell or passing `model_init` to the Trainer. TODO confirm.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Prepare data

In [31]:
# Load the annotated Tweets from CSV (replace the placeholder with your own path)
data = pd.read_csv("insert_own_path")
# The outcome column has to be named `labels` for the classification set-up below,
# so copy the annotation column `label_mis` under that name
data['labels'] = data['label_mis']
# Replace emoji by their (English) textual description using the emoji package
def no_emoji(text):
    """Return `text` after passing it through `emoji.demojize`."""
    return emoji.demojize(text)


data['text'] = data['text'].map(no_emoji)
# Remove URLs, line breaks, the HTML entities &gt; &lt; &amp;, and bracketed numbers from text.
# Bracketed numbers are removed because there were meaningless numbers between brackets in the Tweets.
def clean_text(text):
    """Strip noise from a single Tweet.

    Removed (replaced by the empty string):
      * URLs starting with http://, https:// or www.
      * carriage returns and newlines
      * the HTML entities ``&gt``, ``&lt`` and ``&amp``, with or without the closing ``;``
      * square brackets containing only digits (or nothing), e.g. ``[12]`` or ``[]``

    Args:
        text: the Tweet text as a string.

    Returns:
        The cleaned string.
    """
    # The entity alternatives use an optional ';' (not '.?'), so the character following an
    # entity is never swallowed, and '&lt' is matched regardless of a preceding space.
    return re.sub(
        r'https?://\S+|www\.\S+|\r|\n|&gt;?|&lt;?|&amp;?|\[\d*\]',
        '',
        text,
    )
# Run the text cleaning over every Tweet
data['text'] = data['text'].map(clean_text)

# Lift pandas' display limits so frames are shown with all rows and untruncated text
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Two views of the same rows:
#   data_inf - text and labels plus id and year, used later to evaluate metrics per year
#   data     - text and labels only, used to train the model
data_inf = data.loc[:, ['text', 'labels', 'id', 'year']]
data = data.loc[:, ['text', 'labels']]



Set values of variables important for training

In [32]:
# Global random seed, fixed for reproducibility
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)

# Maximum number of tokens per text; keep in sync with the tokenization step
max_length = 512

# Directory handed to the training arguments (replace the placeholder with your own path)
training_directory = "insert_own_path"

Split data into training and test set

In [33]:
# Split once, on the frame that still carries id and year, and derive the training frames from
# that single split. This guarantees that df_test and df_test_inf contain exactly the same rows
# in the same order, instead of relying on two separate calls with identical arguments.
# NOTE(review): the split is not stratified; if the labels are imbalanced, consider
# `stratify=data_inf['labels']` (this would change which rows end up in each set).
df_train_inf, df_test_inf = train_test_split(data_inf, random_state=SEED_GLOBAL, test_size=0.25)

# Text and labels only: these frames are used to train and evaluate the model
df_train = df_train_inf[['text', 'labels']]
df_test = df_test_inf[['text', 'labels']]

Tokenize data (from https://github.com/MoritzLaurer/summer-school-transformers-2023/blob/main/3_tune_bert.ipynb)

In [None]:
# Convert the pandas dataframes to a Hugging Face DatasetDict to facilitate pre-processing.
# preserve_index=False: the shuffled pandas index would otherwise be carried along as an
# extra `__index_level_0__` column that nothing downstream uses.
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(df_train, preserve_index=False),
    "test": datasets.Dataset.from_pandas(df_test, preserve_index=False),
})

# tokenize
def tokenize(examples):
    """Tokenize a batch of examples.

    Args:
        examples: batch mapping that provides the texts under the key "text".

    Returns:
        The tokenizer output for the batch; texts are truncated and padded to `max_length`.
    """
    # Use the notebook-level `max_length` setting instead of repeating the literal 512,
    # so the sequence length is configured in one place.
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=max_length)


dataset = dataset.map(tokenize, batched=True)

Set training arguments (from https://github.com/MoritzLaurer/summer-school-transformers-2023/blob/main/3_tune_bert.ipynb)

In [35]:
train_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir=str(training_directory),
    logging_dir=str(training_directory),
    report_to="all",
    # optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.06,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    seed=SEED_GLOBAL,
    # evaluate and save once per epoch; at the end reload the checkpoint with the best f1_macro
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)


Set evaluation metrics (from https://github.com/MoritzLaurer/summer-school-transformers-2023/blob/main/3_tune_bert.ipynb)

In [36]:
# Metric function passed to the Trainer
# documentation on all metrics: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

def compute_metrics_standard(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max over axis 1 is used as predicted class).

    Returns:
        dict mapping metric names to values: plain and balanced accuracy,
        macro- and micro-averaged precision/recall/F1, and precision/recall/F1
        of the positive class (keys ending in `_misinformation`).
    """
    with warnings.catch_warnings():
        # All warnings raised while computing the metrics are silenced.
        warnings.filterwarnings("ignore")

        y_true = eval_pred.label_ids
        y_pred = np.argmax(eval_pred.predictions, axis=1)

        def prf(average):
            # precision, recall and F1 for the given averaging mode (support is dropped)
            return precision_recall_fscore_support(y_true, y_pred, average=average)[:3]

        precision_mis, recall_mis, f1_mis = prf('binary')
        precision_macro, recall_macro, f1_macro = prf('macro')
        precision_micro, recall_micro, f1_micro = prf('micro')

        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_macro': f1_macro,
            'accuracy_balanced': balanced_accuracy_score(y_true, y_pred),
            'f1_micro': f1_micro,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            'precision_misinformation': precision_mis,
            'recall_misinformation': recall_mis,
            'f1_misinformation': f1_mis,
        }

Set BERT model

In [None]:
# Bundle model, training arguments, data and metric function into a Trainer.
# NOTE(review): the "test" split is also the evaluation set, and the training arguments reload
# the best checkpoint by f1_macro on that set — the reported test scores may be optimistic.
trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics_standard,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

Train BERT model 

In [None]:
# Run fine-tuning with the model, arguments and datasets configured on `trainer`;
# the value returned by train() is shown as the cell output.
trainer.train()

Save model

In [39]:
# Save the fine-tuned model (replace the placeholder with your own path).
model_output_dir = "insert_own_path"
trainer.save_model(output_dir=model_output_dir)
# The Trainer was created without a tokenizer, so save the tokenizer explicitly as well;
# the directory can then be loaded on its own, without re-downloading the tokenizer.
tokenizer.save_pretrained(model_output_dir)

Load model

In [40]:
# Directory to load the fine-tuned model from (replace the placeholder; use the directory
# the model was saved to).
model_path = "insert_own_path"
# Rebinds `model` to the weights read from disk; the pipeline below uses this object.
model = AutoModelForSequenceClassification.from_pretrained(model_path)

Seeing what model predicts as misinformation (from https://github.com/MoritzLaurer/summer-school-transformers-2023/blob/main/3_tune_bert.ipynb)

In [None]:
# Text-classification pipeline around the fine-tuned model and its tokenizer
# documentation: https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TextClassificationPipeline
pipe_classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    framework="pt",
)

Inference: inspect what BERT model classifies as misinformation

In [None]:
# Use the test split that still carries id and year so the results can be split by year later
df_inference = df_test_inf[["text", "labels", "id", "year"]].copy(deep=True)
text_lst = df_inference["text"].tolist()

# Inference. Truncate exactly as during training: without truncation, a text that tokenizes
# to more than the model's maximum input length may make the forward pass fail.
pipe_output = pipe_classifier(
    text_lst,
    batch_size=64,
    truncation=True,
    max_length=max_length,
)
# Preview a few raw predictions only; the full result is shown in the dataframe below
print(pipe_output[:5])

df_output = pd.DataFrame(pipe_output)

# Add the inference data to the original dataframe. `.tolist()` is used so values are assigned
# by position, independent of the (shuffled) index of df_inference.
df_inference["label_text_pred"] = df_output["label"].tolist()
# `score` is the pipeline's score for the predicted label, rounded to two decimals
df_inference["label_text_pred_probability"] = df_output["score"].round(2).tolist()
# Show predictions next to the original rows
df_inference

Calculate metrics per year

Split dataset by year

In [43]:
# Turn the pipeline's string label into a 0/1 prediction: 'LABEL_1' -> 1, anything else -> 0
df_inference['label_pred'] = df_inference['label_text_pred'].eq('LABEL_1').astype('int64')

# Split the test data: rows with year 2020 versus all remaining rows
joint_data_2020 = df_inference.loc[df_inference['year'].eq(2020)]
joint_data_2021 = df_inference.loc[df_inference['year'].ne(2020)]

Calculate recall and precision for overall dataset and datasets split on year

In [None]:
# Overall metrics on the complete test set (to check that these match the training evaluation).
# `precision` holds the whole macro-averaged (precision, recall, f-score, support) tuple.
precision = precision_recall_fscore_support(
    y_true=df_inference['labels'],
    y_pred=df_inference['label_pred'],
    average='macro',
)
# Recall of the positive class. It is read from precision_recall_fscore_support (element 1 of
# the returned tuple) because `recall_score` is not imported in this notebook and calling it
# raised a NameError.
recall = precision_recall_fscore_support(
    y_true=df_inference['labels'],
    y_pred=df_inference['label_pred'],
    average='binary',
)[1]
print(f'Precision_recall_fscore_macro: {precision}')
print(f'Recall_misinfo: {recall}')

In [None]:
# Metrics for the 2020 subset: positive-class (binary) scores, macro-averaged scores, accuracy
precision_recall_f = precision_recall_fscore_support(
    joint_data_2020['labels'],
    joint_data_2020['label_pred'],
    average='binary',
)
precision_recall_f_macro = precision_recall_fscore_support(
    joint_data_2020['labels'],
    joint_data_2020['label_pred'],
    average='macro',
)
accuracy = accuracy_score(joint_data_2020['labels'], joint_data_2020['label_pred'])
print(f'Precision_recall_fscore_misinfo: {precision_recall_f}')
print(f'Precision_recall_fscore_macro: {precision_recall_f_macro}')
print(f'Accuracy: {accuracy}')

In [None]:
# Metrics for the subset of rows whose year is not 2020: positive-class (binary) scores,
# macro-averaged scores, accuracy
precision_recall_f = precision_recall_fscore_support(
    joint_data_2021['labels'],
    joint_data_2021['label_pred'],
    average='binary',
)
precision_recall_f_macro = precision_recall_fscore_support(
    joint_data_2021['labels'],
    joint_data_2021['label_pred'],
    average='macro',
)
accuracy = accuracy_score(joint_data_2021['labels'], joint_data_2021['label_pred'])
print(f'Precision_recall_fscore_misinfo: {precision_recall_f}')
print(f'Precision_recall_fscore_macro: {precision_recall_f_macro}')
print(f'Accuracy: {accuracy}')

Calculate metrics overall and per year for accurate information Tweets

In [None]:
# Scores with label 0 treated as the positive class, on the complete test set.
# `precision` holds the whole (precision, recall, f-score, support) tuple.
precision = precision_recall_fscore_support(
    df_inference['labels'],
    df_inference['label_pred'],
    average='binary',
    pos_label=0,
)
print(f'Precision, recall, fscore: {precision}')

In [None]:
# Calculate precision and recall (2020)
precision_recall_f = precision_recall_fscore_support(y_true=joint_data_2020['labels'], y_pred=joint_data_2020['label_pred'], average='binary', pos_label =0)
print(f'Precision_recall_fscore: {precision_recall_f}')

In [None]:
# Calculate precision and recall (after 2020)
precision_recall_f = precision_recall_fscore_support(y_true=joint_data_2021['labels'], y_pred=joint_data_2021['label_pred'], average='binary', pos_label =0)
print(f'Precision_recall_fscore: {precision_recall_f}')