In [None]:
# Open the Colab upload dialog and keep its return value in `uploaded`.
# The run recorded below saved Fake.csv and True.csv, which the data-loading
# cell reads by those exact names. `uploaded` itself is not used again in the
# cells visible here.
from google.colab import files
uploaded = files.upload()

Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [None]:
!pip install transformers datasets torch scikit-learn




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One seed for both the shuffle and the split. Previously only the split was
# seeded, so the unseeded shuffle still made the train/test partition differ
# on every run.
SEED = 42

# Load datasets
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Add labels: 0 = fake, 1 = real
fake['label'] = 0
real['label'] = 1

# Combine, drop rows with a missing text or label, and shuffle reproducibly
df = pd.concat([fake[['text', 'label']], real[['text', 'label']]])
df = df.dropna()
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Train-test split (80/20), stratified so both splits keep the same
# fake/real ratio as the full dataset
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=SEED, stratify=all_labels
)


In [None]:
from transformers import BertTokenizer

# Tokenizer loaded from the same checkpoint name the model cell uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with identical settings: truncation and padding
# enabled, with a 512-token cap.
encode_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)


In [None]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask',
            each indexable by example position.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        The dict holds 'input_ids', 'attention_mask' and 'labels', in that order.
        """
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset, test_dataset = (
    NewsDataset(train_encodings, train_labels),
    NewsDataset(test_encodings, test_labels),
)


In [None]:
import os

# Turn off Weights & Biases logging via the environment. The training log
# further down reports this variable as deprecated in favour of `report_to`.
os.environ.update(WANDB_DISABLED="true")


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import os


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object handed over by the Trainer, exposing `predictions`
            (the model logits) and `label_ids` (the reference labels).

    Returns:
        Dict with a single 'accuracy' entry: the fraction of examples whose
        arg-max logit equals the reference label.
    """
    predicted = eval_pred.predictions.argmax(-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Load pre-trained BERT model with classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    # Disable external experiment trackers (WandB etc.) through the supported
    # argument instead of the deprecated WANDB_DISABLED environment variable.
    report_to='none',
)

# Trainer setup
# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint, so the reported test numbers are optimistic — consider
# carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.0238,0.00491
2,0.0264,0.004408


Epoch,Training Loss,Validation Loss
1,0.0238,0.00491
2,0.0264,0.004408
3,0.0097,0.0046


TrainOutput(global_step=13470, training_loss=0.019975421795069774, metrics={'train_runtime': 10654.442, 'train_samples_per_second': 10.114, 'train_steps_per_second': 1.264, 'total_flos': 2.835126865926144e+16, 'train_loss': 0.019975421795069774, 'epoch': 3.0})

In [None]:
# Ask the Trainer to resume from a saved checkpoint.
# In the run recorded below this was a no-op: training had already reached
# global_step=13470 (epoch 3.0), so the call returned after ~0.4 s with
# train_loss 0.0 and nothing further was trained.
trainer.train(resume_from_checkpoint=True)


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=13470, training_loss=0.0, metrics={'train_runtime': 0.3959, 'train_samples_per_second': 272170.031, 'train_steps_per_second': 34023.148, 'total_flos': 2.835126865926144e+16, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
# Evaluate on the Trainer's eval_dataset (the test split passed when the
# Trainer was constructed) and print the resulting metrics dict.
# The recorded eval_loss matches the epoch-2 validation loss in the training
# log — presumably because load_best_model_at_end=True restored that checkpoint.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.004408468957990408, 'eval_runtime': 234.9396, 'eval_samples_per_second': 38.223, 'eval_steps_per_second': 4.78, 'epoch': 3.0}


In [None]:
# Run the model over the test split; the predicted class is the arg-max logit.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Compare with the actual labels so the cell reports a number, not just the
# raw prediction array.
accuracy = (preds == predictions.label_ids).mean()

print(preds)
print(f"Test accuracy: {accuracy:.4f}")


[0 0 1 ... 1 1 1]


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained.
save_dir = "./fake_news_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./fake_news_model/tokenizer_config.json',
 './fake_news_model/special_tokens_map.json',
 './fake_news_model/vocab.txt',
 './fake_news_model/added_tokens.json')

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Rebind `tokenizer` and `model` to copies loaded back from the saved
# directory, replacing the in-memory objects from training.
saved_dir = './fake_news_model'
tokenizer = BertTokenizer.from_pretrained(saved_dir)
model = BertForSequenceClassification.from_pretrained(saved_dir)


In [None]:
from transformers import pipeline

# Wrap the reloaded model and tokenizer in a text-classification pipeline.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Truncate at 512 tokens exactly as during training; without this, an
# article-length input exceeds the model's maximum sequence length and fails.
# Label ids follow the data-loading cell: LABEL_0 = fake, LABEL_1 = real.
result = classifier(
    "Breaking: NASA discovers water on Mars!",
    truncation=True,
    max_length=512,
)
print(result)


Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9991183876991272}]


In [None]:
# Save the model and tokenizer a second time, now under "bert-fake-news-model" —
# the directory name that the zip command in the next cell archives.
model.save_pretrained("bert-fake-news-model")
tokenizer.save_pretrained("bert-fake-news-model")


('bert-fake-news-model/tokenizer_config.json',
 'bert-fake-news-model/special_tokens_map.json',
 'bert-fake-news-model/vocab.txt',
 'bert-fake-news-model/added_tokens.json')

In [None]:
# Recursively zip the saved model directory, then hand the archive to the
# browser for download through Colab's `files` helper.
!zip -r bert-fake-news-model.zip bert-fake-news-model
from google.colab import files
files.download("bert-fake-news-model.zip")


  adding: bert-fake-news-model/ (stored 0%)
  adding: bert-fake-news-model/config.json (deflated 48%)
  adding: bert-fake-news-model/special_tokens_map.json (deflated 80%)
  adding: bert-fake-news-model/vocab.txt (deflated 53%)
  adding: bert-fake-news-model/tokenizer_config.json (deflated 75%)
  adding: bert-fake-news-model/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>