In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import accelerate

In [73]:
# Load the dataset
news_dataset = pd.read_csv(r'C:\Users\monil\Desktop\Graduate Project\resources\datasets\train.csv\train.csv')

In [75]:
# Replace null values with empty string
news_dataset = news_dataset.fillna('')


In [77]:
# %% Downsample Dataset to 8,000 While Keeping Proportions
# Get separate DataFrames for each label
fake_news = news_dataset[news_dataset["label"] == 1]
real_news = news_dataset[news_dataset["label"] == 0]

# Get equal proportions (around 4,000 each)
fake_sample = fake_news.sample(n=4000, random_state=42)
real_sample = real_news.sample(n=4000, random_state=42)

# Combine and shuffle
balanced_dataset = pd.concat([fake_sample, real_sample]).sample(frac=1, random_state=42)


In [79]:
# Extract features and labels
X = balanced_dataset["text"].values
Y = balanced_dataset["label"].values

In [81]:
# Split the dataset into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [29]:
# Load pre-trained BERT model and tokenizer from local directory
tocken_name = r"C:\Users\monil\Desktop\Graduate Project\resources\BERT Model\bert-base-uncased"  # Path to the downloaded files
tokenizer = BertTokenizer.from_pretrained(tocken_name)

model_checkpoint = r"C:\Users\monil\Desktop\Graduate Project\Fake-News-Detection\results\checkpoint-1200"
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [31]:
# Tokenize the dataset
def tokenize_data(texts, labels):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")



In [None]:
train_encodings = tokenize_data(X_train.tolist(), Y_train.tolist())
test_encodings = tokenize_data(X_test.tolist(), Y_test.tolist())



In [17]:
# %% Move Encodings to GPU
train_encodings = {key: val.to("cuda") for key, val in train_encodings.items()}
test_encodings = {key: val.to("cuda") for key, val in test_encodings.items()}

In [35]:
# %% Create PyTorch Dataset
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {key: val.cpu() for key, val in encodings.items()}  # Ensure CPU tensors
        self.labels = torch.tensor(labels, dtype=torch.long)  # Ensure long tensor

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, Y_train.tolist())
test_dataset = NewsDataset(test_encodings, Y_test.tolist())


# # Save the tokenized dataset
torch.save(train_dataset, "./train_dataset.pt")
torch.save(test_dataset, "./test_dataset.pt")

## After restarting the kernal load the datasets which were previously saved
# train_dataset = torch.load("./train_dataset.pt",weights_only=False)
# test_dataset = torch.load("./test_dataset.pt",weights_only=False)

# Now you can use train_dataset and test_dataset directly

In [33]:
# Move the model to the GPU
model.to("cuda")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [37]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # Ensure save and eval strategy match
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Added this to prevent memory issues
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,  # Enable Mixed Precision Training for Faster Training
)

In [39]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [39]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.070194
2,0.121800,0.066765
3,0.020200,0.070457


TrainOutput(global_step=1200, training_loss=0.06055707385142644, metrics={'train_runtime': 19845.2492, 'train_samples_per_second': 0.967, 'train_steps_per_second': 0.06, 'total_flos': 5051732262912000.0, 'train_loss': 0.06055707385142644, 'epoch': 3.0})

In [42]:
# Save the fine-tuned model and tokenizer
model.save_pretrained(r"C:\Users\monil\Desktop\Graduate Project\resources\BERT Model\bert-base-uncased\fine_tuned_bert_model")
tokenizer.save_pretrained(r"C:\Users\monil\Desktop\Graduate Project\resources\BERT Model\bert-base-uncased\fine_tuned_bert_tokenizer")

('C:\\Users\\monil\\Desktop\\Graduate Project\\resources\\BERT Model\\bert-base-uncased\\fine_tuned_bert_tokenizer\\tokenizer_config.json',
 'C:\\Users\\monil\\Desktop\\Graduate Project\\resources\\BERT Model\\bert-base-uncased\\fine_tuned_bert_tokenizer\\special_tokens_map.json',
 'C:\\Users\\monil\\Desktop\\Graduate Project\\resources\\BERT Model\\bert-base-uncased\\fine_tuned_bert_tokenizer\\vocab.txt',
 'C:\\Users\\monil\\Desktop\\Graduate Project\\resources\\BERT Model\\bert-base-uncased\\fine_tuned_bert_tokenizer\\added_tokens.json')

In [41]:
# Evaluate the model
results = trainer.evaluate()
print(results.keys())
print("Test Accuracy:", results.get("accuracy", "Key not found"))
print("Evaluation Results:", results)


dict_keys(['eval_loss', 'eval_model_preparation_time', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second'])
Test Accuracy: Key not found
Evaluation Results: {'eval_loss': 0.07045724242925644, 'eval_model_preparation_time': 0.0077, 'eval_runtime': 440.9766, 'eval_samples_per_second': 3.628, 'eval_steps_per_second': 0.227}


In [45]:
# Move model to eval mode
model.eval()

# Prepare data loader
from torch.utils.data import DataLoader

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Store predictions and labels
all_preds = []
all_labels = []

# Evaluate model
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)  # Get predicted class

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")  # Print accuracy


Test Accuracy: 0.9869


In [59]:
def predict_fake_news(text):
    model.eval()  # Set model to evaluation mode

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU if available

    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits.cpu().numpy()  # Get raw logits
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()  # Convert to probabilities

    print(f"Logits: {logits}")
    print(f"Probabilities: {probs}")

    # Get predicted class
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    labels = ["Fake News", "Real News"]  # Adjust if your dataset is labeled differently
    return labels[predicted_class], probs

In [61]:
test_texts = [
    "Scientists confirm the moon is actually made of cheese!",
    "New study shows drinking water makes people immortal.",
    "Breaking: A man found a time machine in his basement!"
]

for text in test_texts:
    print(f"Text: {text} -> Prediction: {predict_fake_news(text)}")

Logits: [[-3.984375  4.34375 ]]
Probabilities: [[2.415663e-04 9.997584e-01]]
Text: Scientists confirm the moon is actually made of cheese! -> Prediction: ('Real News', array([[2.415663e-04, 9.997584e-01]], dtype=float32))
Logits: [[-3.6210938  4.0546875]]
Probabilities: [[4.6371284e-04 9.9953628e-01]]
Text: New study shows drinking water makes people immortal. -> Prediction: ('Real News', array([[4.6371284e-04, 9.9953628e-01]], dtype=float32))
Logits: [[-3.9003906  4.3632812]]
Probabilities: [[2.5764457e-04 9.9974233e-01]]
Text: Breaking: A man found a time machine in his basement! -> Prediction: ('Real News', array([[2.5764457e-04, 9.9974233e-01]], dtype=float32))


In [63]:
labels = np.array(Y_train.tolist())  # Convert labels to numpy array
fake_count = (labels == 0).sum()
real_count = (labels == 1).sum()

print(f"Fake News: {fake_count}, Real News: {real_count}")

Fake News: 3200, Real News: 3200


In [69]:
texts = [
    "qwertyuiop asdfghjkl zxcvbnm 1234567890",
    "This is completely made-up and has no basis in reality.",
    "Aliens have landed in New York and are taking selfies with humans!",
    "A magical unicorn was found in the Amazon rainforest!",
    "Government officials admit to time travel experiments."
]

for text in texts:
    prediction, probs = predict_fake_news(text)
    print(f"Text: {text} -> Prediction: {prediction}, Probabilities: {probs}")

Logits: [[-4.03125  4.375  ]]
Probabilities: [[2.2341634e-04 9.9977654e-01]]
Text: qwertyuiop asdfghjkl zxcvbnm 1234567890 -> Prediction: Real News, Probabilities: [[2.2341634e-04 9.9977654e-01]]
Logits: [[-3.2285156  3.4472656]]
Probabilities: [[0.0012595 0.9987405]]
Text: This is completely made-up and has no basis in reality. -> Prediction: Real News, Probabilities: [[0.0012595 0.9987405]]
Logits: [[-3.9042969  4.1640625]]
Probabilities: [[3.1319875e-04 9.9968684e-01]]
Text: Aliens have landed in New York and are taking selfies with humans! -> Prediction: Real News, Probabilities: [[3.1319875e-04 9.9968684e-01]]
Logits: [[-3.6503906  3.9804688]]
Probabilities: [[4.8500832e-04 9.9951494e-01]]
Text: A magical unicorn was found in the Amazon rainforest! -> Prediction: Real News, Probabilities: [[4.8500832e-04 9.9951494e-01]]
Logits: [[-3.65625   4.015625]]
Probabilities: [[4.6552691e-04 9.9953449e-01]]
Text: Government officials admit to time travel experiments. -> Prediction: Real New