### This script implements BERT-base training only on human-generated dataset. LIAR 2 middle category is excluded from training. Additionally, we test on the "LLM Fake News Dataset" ###

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import mixed_precision
from sklearn.metrics import confusion_matrix, classification_report 
from transformers import BertTokenizer
import pandas as pd
import zipfile
import os 
from sklearn.model_selection import train_test_split



  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
with zipfile.ZipFile('combined_train_df_2.csv.zip','r') as zip:
    with zip.open('combined_train_df_2.csv') as f:
        train = pd.read_csv(f,encoding='ISO-8859-1')

with zipfile.ZipFile('test_df_2.csv.zip','r') as zip:
    with zip.open('test_df_2.csv') as f:
        test = pd.read_csv(f,encoding='ISO-8859-1')

In [None]:
# Removing third category (Half-True)
train = train[train['label']!=3]

test = test[test['label']!=3]

In [7]:

# Some Descriptives

display(train['dataset'].value_counts(normalize=True))
display(train['binary_label'].value_counts(normalize=True))



display(len(test))
display(len(train))

dataset
Fakeddit                          0.626830
Kaggle 1 - Fake News              0.264520
Kaggle 2 - News Project           0.072445
Kaggle 3 - Fake News Detection    0.022061
LIAR 2                            0.014144
Name: proportion, dtype: float64

binary_label
1    0.535523
0    0.464477
Name: proportion, dtype: float64

19350

1122458

In [8]:
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Enable memory growth for the first (and only) GPU
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"Memory growth enabled for {gpus[0]}")
    except RuntimeError as e:
        print(e)  # This happens if GPUs are initialized before setting memory growth
else:
    print("No GPU found. Running on CPU.")

Memory growth enabled for PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [9]:
BATCH_SIZE = 16
SEED = 42


# Test-Validation Split
train, val = train_test_split(train,test_size=0.3,random_state=42)#stratify=temp_train['dataset'])



# Training data
X_train = train['text'].values  
y_train = train['binary_label'].values 

# Validation data
X_val = val['text'].values
y_val = val['binary_label'].values

#Test data
X_test = test['text'].values
y_test = test['binary_label'].values


In [10]:
X_train = [str(x) for x in X_train]
X_val = [str(x) for x in X_val]

In [None]:
# Distribution of datasets in training validation and test set
# Test set is equally sampled as we wanted. LIAR 2 a bit less due to the removing of category 3

display(val['dataset'].value_counts(normalize=True))

display(train['dataset'].value_counts(normalize=True))

display(test['dataset'].value_counts(normalize=True))

dataset
Fakeddit                          0.626927
Kaggle 1 - Fake News              0.264514
Kaggle 2 - News Project           0.072439
Kaggle 3 - Fake News Detection    0.022148
LIAR 2                            0.013972
Name: proportion, dtype: float64

dataset
Fakeddit                          0.626788
Kaggle 1 - Fake News              0.264523
Kaggle 2 - News Project           0.072447
Kaggle 3 - Fake News Detection    0.022024
LIAR 2                            0.014218
Name: proportion, dtype: float64

dataset
Kaggle 2 - News Project           0.206718
Kaggle 3 - Fake News Detection    0.206718
Fakeddit                          0.206718
Kaggle 1 - Fake News              0.206718
LIAR 2                            0.173127
Name: proportion, dtype: float64

In [None]:
# Tokenization

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(X_train, truncation=True, padding = 'max_length',max_length=60, return_tensors="tf")

val_encodings = tokenizer(
    X_val,
    truncation=True,
    padding = 'max_length',
    max_length = 60,
    return_tensors="tf"
)



In [13]:
# Prepare dataset
inputs = {
    'input_word_ids': train_encodings['input_ids'],
    'input_mask': train_encodings['attention_mask'],
    'input_type_ids': train_encodings['token_type_ids']
}
labels = tf.cast(y_train, tf.float32)


val_inputs = {
    'input_word_ids': val_encodings['input_ids'],
    'input_mask': val_encodings['attention_mask'],
    'input_type_ids': val_encodings['token_type_ids']
}
val_labels = tf.cast(y_val, tf.float32)


# Now build dataset properly
train_ds = tf.data.Dataset.from_tensor_slices((inputs,  labels)).shuffle(buffer_size=len(X_train),seed=SEED).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels))\
         .batch(BATCH_SIZE)\
         .prefetch(tf.data.AUTOTUNE)


In [14]:
# Build Model
mixed_precision.set_global_policy('mixed_float16')

# BERT encoder 
bert_model = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
    trainable=True
)


# Inputs
input_ids = tf.keras.Input(shape=(60,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.Input(shape=(60,), dtype=tf.int32, name="input_mask")
type_ids = tf.keras.Input(shape=(60,), dtype=tf.int32, name="input_type_ids")

bert_inputs = {
    'input_word_ids': input_ids,
    'input_mask': input_mask,
    'input_type_ids': type_ids
}

bert_outputs = bert_model(bert_inputs)
cls_token = bert_outputs['pooled_output']

x = tf.keras.layers.Dropout(0.1)(cls_token)
x = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(x)

model = tf.keras.Model(inputs=[input_ids, input_mask, type_ids], outputs=x)
model.summary()


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4050 Laptop GPU, compute capability 8.9
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 60)]         0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 60)]         0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 60)]         0           []                               
                                        

In [15]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = tf.keras.optimizers.Adam(2e-5), metrics = ['accuracy'])

In [16]:
history = model.fit(train_ds, validation_data=val_ds, epochs=1)





In [None]:
# Testing

X_test = [str(x) for x in X_test]

test_encodings = tokenizer(X_test, truncation=True, padding='max_length', max_length=60, return_tensors="tf")

# Prepare dataset
inputs_test = {
    'input_word_ids': test_encodings['input_ids'],
    'input_mask': test_encodings['attention_mask'],
    'input_type_ids': test_encodings['token_type_ids']
}

# Predictions
predictions = model.predict(dict(inputs_test))

threshold = 0.5
preds = (predictions>threshold).astype(int)

print(classification_report(preds,y_test, target_names = ['Fake','Real']))

              precision    recall  f1-score   support

        Fake       0.88      0.83      0.85      9932
        Real       0.83      0.88      0.86      9418

    accuracy                           0.86     19350
   macro avg       0.86      0.86      0.86     19350
weighted avg       0.86      0.86      0.86     19350



In [None]:
# Accuracy by dataset

preds_new = pd.DataFrame(preds,index=test.index)
concat = pd.concat([test,preds_new],axis=1)

concat.columns.values[-1] = 'preds'

concat['preds'].value_counts()

accuracy_df = (concat['preds'] == concat['binary_label']).groupby(concat['dataset']).mean()

accuracy_df


dataset
Fakeddit                          0.858750
Kaggle 1 - Fake News              0.965250
Kaggle 2 - News Project           0.839750
Kaggle 3 - Fake News Detection    0.881500
LIAR 2                            0.706866
dtype: float64

## Testing on LLM-generated fake news ##

In [2]:
# Reading LLM data
with zipfile.ZipFile('llm_train_df.csv.zip','r') as zip:
    with zip.open('llm_train_df.csv') as f:
        llm_data = pd.read_csv(f,encoding='ISO-8859-1')

In [4]:
X_test = llm_data['text'].values
y_test = llm_data['binary_label'].values 

X_test = [str(x) for x in X_test]

In [5]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

test_encodings = tokenizer(X_test, truncation=True, padding = 'max_length',max_length=60, return_tensors="tf")

inputs = {  'input_word_ids': test_encodings['input_ids'],
            'input_mask': test_encodings['attention_mask'],
            'input_type_ids': test_encodings['token_type_ids']
}





In [6]:
preds = model.predict(inputs)

threshold = 0.5
preds = (preds>threshold).astype(int)

print(classification_report(preds,y_test, target_names = ['Fake','Real']))

              precision    recall  f1-score   support

        Fake       0.27      0.71      0.39    100279
        Real       0.74      0.30      0.43    281968

    accuracy                           0.41    382247
   macro avg       0.51      0.51      0.41    382247
weighted avg       0.62      0.41      0.42    382247



In [8]:
# Per model accuracy

preds_new = pd.DataFrame(preds,index=llm_data.index)
concat = pd.concat([llm_data,preds_new],axis=1)

concat.columns.values[-1] = 'preds'

concat['preds'].value_counts()

accuracy_df = (concat['preds'] == concat['binary_label']).groupby(concat['model']).mean()

accuracy_df


model
GPT3.5        0.299480
Llama2 13b    0.247443
Llama2 7b     0.248331
Mistral 7b    0.262976
dtype: float64