### Environment Setup

In [8]:
# pip install transformers torch scikit-learn pandas streamlit

Import Required Libraries

In [3]:
# Data Handling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# PyTorch and Hugging Face
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# For Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load datasets
true_df = pd.read_csv("Data\True.csv")
fake_df = pd.read_csv("Data\Fake.csv")

# Add labels: 1 for true, 0 for fake
true_df["label"] = 1
fake_df["label"] = 0

  true_df = pd.read_csv("Data\True.csv")
  fake_df = pd.read_csv("Data\Fake.csv")


In [11]:
true_df["subject"].unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [6]:
# Sample randomly
true_sample = true_df.sample(n=2000, random_state=42)
fake_sample = fake_df.sample(n=2000, random_state=42)

# Combine and shuffle
df = pd.concat([true_sample, fake_sample], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Save as CSV
df.to_csv("small_news_dataset.csv", index=False)
df

Unnamed: 0,title,text,subject,date,label
0,May's government pushes Brexit bill to avoid '...,LONDON (Reuters) - Brexit minister David Davis...,worldnews,"September 6, 2017",1
1,Trump’s EPA OKs Pesticide That Causes Brain D...,Farmworkers were pulled from fields on Friday ...,News,"May 15, 2017",0
2,Man arrested at Trump rally said he wanted to ...,(Reuters) - A man arrested over the weekend tr...,politicsNews,"June 20, 2016",1
3,Jared Kushner NEVER Registered To Vote As A “F...,"Meanwhile, as President Trump continues to mee...",left-news,"Sep 29, 2017",0
4,MARTHA STEWART Makes Lewd Gesture Towards Trum...,"Martha, Martha, Martha You re 75-years old! Ti...",left-news,"May 8, 2017",0
...,...,...,...,...,...
3995,Arkansas attorney general says open to working...,(Reuters) - Arkansas Attorney General Leslie R...,politicsNews,"November 17, 2016",1
3996,UK's May to meet Bill Clinton to discuss North...,LONDON (Reuters) - British Prime Minister Ther...,worldnews,"October 18, 2017",1
3997,Supreme Court dismisses Hawaii's challenge to ...,WASHINGTON (Reuters) - The U.S. Supreme Court ...,politicsNews,"October 24, 2017",1
3998,Watch Mitt Romney Totally Humiliate Himself I...,Mitt Romney got a lot of praise and a lot of h...,News,"November 29, 2016",0


In [24]:
!pip install gdown
import gdown

url = "https://drive.google.com/drive/u/2/folders/1odGBNOdeNMbidYc_aLSyjO6ZweeNzkaL"
output = "small_news_dataset.csv"
gdown.download(url, output, quiet=False)

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


Downloading...
From: https://drive.google.com/drive/u/2/folders/1odGBNOdeNMbidYc_aLSyjO6ZweeNzkaL
To: c:\Users\91950\Desktop\Build a Transformer-Based Fake News Detection Chatbot\small_news_dataset.csv
1.12MB [00:00, 4.27MB/s]


'small_news_dataset.csv'

#### Use text as the primary input.
#### Optionally concatenate title + text to give model both headline and body.
#### Avoid subject and date for now (they introduce bias and require extra encoding logic).
#### Why not subject/date?- Transformers handle context in language, not structured metadata like dates. Including subject might make the model lazy (e.g., if all fake news is under one subject, it just memorizes that).

In [9]:
# Combine title and text into one column
df["content"] = df["title"].astype(str) + " " + df["text"].astype(str)

# Keep only combined text and label
df = df[["content", "label"]]

In [10]:
df

Unnamed: 0,content,label
0,May's government pushes Brexit bill to avoid '...,1
1,Trump’s EPA OKs Pesticide That Causes Brain D...,0
2,Man arrested at Trump rally said he wanted to ...,1
3,Jared Kushner NEVER Registered To Vote As A “F...,0
4,MARTHA STEWART Makes Lewd Gesture Towards Trum...,0
...,...,...
3995,Arkansas attorney general says open to working...,1
3996,UK's May to meet Bill Clinton to discuss North...,1
3997,Supreme Court dismisses Hawaii's challenge to ...,1
3998,Watch Mitt Romney Totally Humiliate Himself I...,0


###  Preprocessing function

In [11]:
import re

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove non-alphanumeric characters (keep spaces)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to your combined text
df.loc[:, "content"] = df["content"].apply(lambda x: clean_text(str(x)))

In [12]:
df.head()

Unnamed: 0,content,label
0,Mays government pushes Brexit bill to avoid ch...,1
1,Trumps EPA OKs Pesticide That Causes Brain Dam...,0
2,Man arrested at Trump rally said he wanted to ...,1
3,Jared Kushner NEVER Registered To Vote As A Fe...,0
4,MARTHA STEWART Makes Lewd Gesture Towards Trum...,0


###  Split into Train, Validation, Test

In [13]:
from sklearn.model_selection import train_test_split

# First split into train and temp (val+test)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["content"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# Split temp into validation and test
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

print("Train size:", len(train_texts))
print("Val size:", len(val_texts))
print("Test size:", len(test_texts))


Train size: 3200
Val size: 400
Test size: 400


### Model Selection and Integration

In [14]:
from transformers import DistilBertTokenizer

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [15]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [16]:
# Tokenize function
def tokenize_data(texts):
    return tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=256,  # shorter for faster training
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)
test_encodings = tokenize_data(test_texts)

In [17]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

###  Fine-tune DistilBERT

#### 1. Load the Model

In [18]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2  # binary classification (fake or true)
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 2. Define Metrics for Evaluation

In [19]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

#### 3. Set Training Arguments

In [20]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',              # Where model checkpoints will go
    num_train_epochs=2,                  # Feel free to increase if time allows
    per_device_train_batch_size=8,       
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,                       # Explicitly tell it to train
    do_eval=True                         # Explicitly tell it to evaluate
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

####  4. Train the Model with HuggingFace Trainer

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


#### 5. Start Training

In [None]:
trainer.train()



Step,Training Loss
10,0.7106
20,0.6746
30,0.6105
40,0.3917
50,0.1316
60,0.0224
70,0.0628
80,0.0041
90,0.0022
100,0.0016


TrainOutput(global_step=800, training_loss=0.047589722672710194, metrics={'train_runtime': 8137.6264, 'train_samples_per_second': 0.786, 'train_steps_per_second': 0.098, 'total_flos': 423895675699200.0, 'train_loss': 0.047589722672710194, 'epoch': 2.0})

In [None]:
import transformers
print(transformers.__version__)


4.55.0


#### 6. Evaluate on Test Set (after training)

In [None]:
trainer.evaluate(test_dataset)



{'eval_loss': 0.035448167473077774,
 'eval_accuracy': 0.995,
 'eval_f1': 0.995,
 'eval_precision': 0.995,
 'eval_recall': 0.995,
 'eval_runtime': 99.8491,
 'eval_samples_per_second': 4.006,
 'eval_steps_per_second': 0.501,
 'epoch': 2.0}

#### 7: Save the Trained Model & Tokenizer

In [None]:
# Create a folder to save model and tokenizer
model_path = "saved_model"

# Save both model and tokenizer
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('saved_model\\tokenizer_config.json',
 'saved_model\\special_tokens_map.json',
 'saved_model\\vocab.txt',
 'saved_model\\added_tokens.json')

###  Chatbot Function

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F

# Load model + tokenizer
def load_model(model_path="saved_model"):
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    model.eval()  # Set to evaluation mode
    return tokenizer, model

# Predict function
def classify_news(text, tokenizer, model):
    # Preprocess the input text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        confidence, predicted_class = torch.max(probs, dim=1)

    label_map = {0: "Fake", 1: "True"}
    predicted_label = label_map[predicted_class.item()]
    confidence_percent = confidence.item() * 100

    # Conversational response
    response = f"This news is likely {predicted_label.upper()} with a confidence of {confidence_percent:.2f}%."

    return response, predicted_label, confidence_percent


In [None]:
if __name__ == "__main__":
    tokenizer, model = load_model()
    user_input = input("Paste your news snippet: ")
    response, label, conf = classify_news(user_input, tokenizer, model)
    print("\n", response)

Paste your news snippet:  trump imposed 60% tarrifs on india



 This news is likely TRUE with a confidence of 99.55%.
