In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import create_optimizer




In [3]:
gossip_fake = pd.read_csv('C:\\Users\\global village\\OneDrive\Desktop\\dataset\\gossipcop_fake.csv')
gossip_real = pd.read_csv('C:\\Users\\global village\\OneDrive\Desktop\\dataset\\gossipcop_real.csv')
politifact_fake = pd.read_csv('C:\\Users\\global village\\OneDrive\Desktop\\dataset\\politifact_fake.csv')
politifact_real = pd.read_csv('C:\\Users\\global village\\OneDrive\Desktop\\dataset\\politifact_real.csv')


In [4]:

gossip_fake['label'] = 0
politifact_fake['label'] = 0
gossip_real['label'] = 1
politifact_real['label'] = 1

df = pd.concat([gossip_fake, gossip_real, politifact_fake, politifact_real])
df = df[['title', 'label']]
df.dropna(inplace=True)
df.rename(columns={'title': 'text'}, inplace=True)
df.reset_index(drop=True, inplace=True)

In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.1, random_state=42)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [8]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=len(train_dataset)*3)

model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`





TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model.fit(train_dataset.shuffle(1000).batch(16),
          validation_data=val_dataset.batch(16),
          epochs=2)

Epoch 1/2


Epoch 2/2


<tf_keras.src.callbacks.History at 0x15b1a6c1a10>

In [10]:
model.save_pretrained("./bert_fakenews_model")
tokenizer.save_pretrained("./bert_fakenews_model")


('./bert_fakenews_model\\tokenizer_config.json',
 './bert_fakenews_model\\special_tokens_map.json',
 './bert_fakenews_model\\vocab.txt',
 './bert_fakenews_model\\added_tokens.json')

In [11]:
sample_text = "Breaking: President announces new climate policy for 2030."

inputs = tokenizer(sample_text, return_tensors="tf", truncation=True, padding=True, max_length=128)

outputs = model(inputs)
logits = outputs.logits

# (0 = Fake, 1 = Real)
predicted_label = tf.argmax(logits, axis=1).numpy()[0]
label_name = "Real" if predicted_label == 1 else "Fake"
print(f"Predicted Label: {label_name}")


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Predicted Label: Fake


In [12]:
texts = [
    "Donald Trump buys new football team in Texas.",
    "NASA successfully lands rover on Mars.",
    "Scientists discover water on the Sun's surface.",
    "Biden declares national emergency over economic crash."
]

encodings = tokenizer(texts, return_tensors='tf', padding=True, truncation=True, max_length=128)

outputs = model(encodings)
logits = outputs.logits
predictions = tf.argmax(logits, axis=1).numpy()

for text, pred in zip(texts, predictions):
    label = "Real" if pred == 1 else "Fake"
    print(f"[{label}] {text}")


[Fake] Donald Trump buys new football team in Texas.
[Real] NASA successfully lands rover on Mars.
[Fake] Scientists discover water on the Sun's surface.
[Real] Biden declares national emergency over economic crash.


In [13]:
from sklearn.metrics import classification_report

true_labels = []
pred_labels = []

for batch in val_dataset.batch(16):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['label']
    
    outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})
    preds = tf.argmax(outputs.logits, axis=1)
    
    true_labels.extend(labels.numpy())
    pred_labels.extend(preds.numpy())


print(classification_report(true_labels, pred_labels, target_names=["Fake", "Real"]))


TypeError: tuple indices must be integers or slices, not str

In [14]:
from sklearn.metrics import classification_report

true_labels = []
pred_labels = []

for batch in val_dataset.batch(16):
    inputs, labels = batch  
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask}, training=False)
    preds = tf.argmax(outputs.logits, axis=1)
    
    true_labels.extend(labels.numpy())
    pred_labels.extend(preds.numpy())

print(classification_report(true_labels, pred_labels, target_names=["Fake", "Real"]))


              precision    recall  f1-score   support

        Fake       0.75      0.68      0.72       570
        Real       0.90      0.93      0.91      1750

    accuracy                           0.87      2320
   macro avg       0.83      0.80      0.81      2320
weighted avg       0.86      0.87      0.86      2320

