In [1]:
!pip install transformers datasets scikit-learn

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np



In [2]:
import pandas as pd

# Load data
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

# Add labels
true["label"] = 1
fake["label"] = 0

# Add a new 'text' column combining title and text
true["text"] = true["title"] + " " + true["text"]
fake["text"] = fake["title"] + " " + fake["text"]

# Combine and shuffle
df = pd.concat([true, fake])[["text", "label"]].sample(frac=1).reset_index(drop=True)


In [3]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Split into train/test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Add split column
train_df["split"] = "train"
test_df["split"] = "test"

# Combine and convert
combined_df = pd.concat([train_df, test_df])
dataset = Dataset.from_pandas(combined_df)

# Split datasets
train_ds = dataset.filter(lambda x: x['split'] == 'train').remove_columns(['split'])
test_ds = dataset.filter(lambda x: x['split'] == 'test').remove_columns(['split'])


Filter:   0%|          | 0/44898 [00:00<?, ? examples/s]

Filter:   0%|          | 0/44898 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

# Apply to both train and test datasets
tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [7]:
from transformers import TrainingArguments, Trainer
import torch

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,                          # ⚡ Fast training
    per_device_train_batch_size=16,              # 🔥 Moderate batch size
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch", # Corrected parameter name
    save_strategy="epoch",
    fp16=torch.cuda.is_available()              # Use mixed precision if GPU
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

# 🚀 Train
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.0,0.002095


TrainOutput(global_step=2245, training_loss=0.014112737046617948, metrics={'train_runtime': 517.0953, 'train_samples_per_second': 69.461, 'train_steps_per_second': 4.342, 'total_flos': 4757964024926208.0, 'train_loss': 0.014112737046617948, 'epoch': 1.0})

In [8]:
model.save_pretrained("model")
tokenizer.save_pretrained("model")


('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [10]:
from sklearn.metrics import classification_report, accuracy_score

# 🔁 Get model predictions on test set
predictions = trainer.predict(tokenized_test)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# ✅ Accuracy
acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Accuracy: {acc:.4f}")

# 📊 Detailed Report
report = classification_report(y_true, y_pred, target_names=["Fake", "Real"])
print("\n📊 Classification Report:")
print(report)

# ✅ Count of Correct Predictions
correct = (y_true == y_pred).sum()
total = len(y_true)
print(f"\n✅ {correct} out of {total} predictions are correct.")


✅ Accuracy: 0.9998

📊 Classification Report:
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00      4696
        Real       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980


✅ 8978 out of 8980 predictions are correct.


In [11]:
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
clf = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

test_sentences = [
    "World Health Organization declares end to global COVID-19 emergency.",

   "NASA successfully launches new Mars rover to explore signs of ancient life.",

   "India becomes the world’s fastest-growing major economy in 2024.",

   "Supreme Court rules in favor of environmental regulations to cut emissions.",

    "Electric vehicle sales reach record high across Europe.",
    "Drinking bleach can cure COVID-19, experts say.",

"Time traveler from 2075 reveals secret mission to warn humanity.",

"Elvis Presley found alive on remote island, claims fan.",

"Aliens spotted shopping at Walmart in California.",

"Scientists confirm moon is made of cheese in new NASA study.",

]

for text in test_sentences:
    pred = clf(text)[0]
    label = pred["label"]
    score = pred["score"]
    print(f"\n📰 \"{text}\"\n→ Predicted: {label} with confidence {score:.2f}")


Device set to use cuda:0



📰 "World Health Organization declares end to global COVID-19 emergency."
→ Predicted: LABEL_1 with confidence 0.80

📰 "NASA successfully launches new Mars rover to explore signs of ancient life."
→ Predicted: LABEL_0 with confidence 0.91

📰 "India becomes the world’s fastest-growing major economy in 2024."
→ Predicted: LABEL_1 with confidence 1.00

📰 "Supreme Court rules in favor of environmental regulations to cut emissions."
→ Predicted: LABEL_0 with confidence 0.98

📰 "Electric vehicle sales reach record high across Europe."
→ Predicted: LABEL_1 with confidence 0.95

📰 "Drinking bleach can cure COVID-19, experts say."
→ Predicted: LABEL_0 with confidence 0.94

📰 "Time traveler from 2075 reveals secret mission to warn humanity."
→ Predicted: LABEL_0 with confidence 1.00

📰 "Elvis Presley found alive on remote island, claims fan."
→ Predicted: LABEL_0 with confidence 1.00

📰 "Aliens spotted shopping at Walmart in California."
→ Predicted: LABEL_0 with confidence 1.00

📰 "Scientists c