In [1]:
from google.colab import files
import pandas as pd
import io

# Upload the file
uploaded = files.upload()

# Load it from memory — safest method
for filename in uploaded:
    print(f"Reading: {filename}")
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
    print("Columns loaded:", df.columns)


Saving WELFake_Dataset.csv to WELFake_Dataset.csv
Reading: WELFake_Dataset.csv
Columns loaded: Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')


In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification,get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW

# Config
model_name = "jy46604790/Fake-News-Bert-Detect"
batch_size = 64
max_length = 256
num_epochs = 3
learning_rate = 2e-5

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Dataset class
class WELFakeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


df["content"] = df["title"].fillna('') + " " + df["text"].fillna('').str[:300]
df = df.dropna(subset=["label"])

# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["content"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

# Create datasets and loaders
train_dataset = WELFakeDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = WELFakeDataset(val_texts, val_labels, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")

# Save fine-tuned model
model.save_pretrained("fine_tuned_bert_welfake")
tokenizer.save_pretrained("fine_tuned_bert_welfake")

print("Fine-tuning complete and model saved!")


Training on device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Epoch 1/3


100%|██████████| 902/902 [38:21<00:00,  2.55s/it]


Average training loss: 0.0842
Validation Accuracy: 0.9965
Validation F1 Score: 0.9966
Validation Precision: 0.9984
Validation Recall: 0.9949

Epoch 2/3


100%|██████████| 902/902 [38:26<00:00,  2.56s/it]


Average training loss: 0.0067
Validation Accuracy: 0.9944
Validation F1 Score: 0.9945
Validation Precision: 0.9993
Validation Recall: 0.9898

Epoch 3/3


100%|██████████| 902/902 [38:26<00:00,  2.56s/it]


Average training loss: 0.0024
Validation Accuracy: 0.9979
Validation F1 Score: 0.9980
Validation Precision: 0.9987
Validation Recall: 0.9973
Fine-tuning complete and model saved!


In [4]:
from google.colab import files
!zip -r fine_tuned_bert_welfake.zip fine_tuned_bert_welfake
files.download("fine_tuned_bert_welfake.zip")


  adding: fine_tuned_bert_welfake/ (stored 0%)
  adding: fine_tuned_bert_welfake/special_tokens_map.json (deflated 85%)
  adding: fine_tuned_bert_welfake/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_bert_welfake/model.safetensors (deflated 8%)
  adding: fine_tuned_bert_welfake/merges.txt (deflated 53%)
  adding: fine_tuned_bert_welfake/config.json (deflated 51%)
  adding: fine_tuned_bert_welfake/vocab.json (deflated 59%)
  adding: fine_tuned_bert_welfake/tokenizer.json (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>