In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize

# Read the fake-news articles from Fake.csv into a DataFrame (adjust the path as needed)
fake_df = pd.read_csv(filepath_or_buffer='Fake.csv')

# Preview the leading rows to sanity-check the load
print(fake_df.head(n=5))

# Clean the text data
def clean_text(text):
    """Normalize one piece of text to lowercase ASCII letters and whitespace.

    Args:
        text: Raw text. Missing values (``None`` or a float NaN, which is how
            pandas represents an empty CSV cell by default) are accepted; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The lowercased text with every character other than ``a``-``z``
        and whitespace removed. Missing input yields ``''``.
    """
    # NaN is the only float that compares unequal to itself, so this detects
    # missing cells without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Derive a 'cleaned_text' column by running clean_text over every 'text' entry
fake_df['cleaned_text'] = fake_df['text'].map(clean_text)

# Show the first cleaned rows for a quick visual check
print(fake_df.loc[:, ['cleaned_text']].head())


In [None]:
from transformers import BertTokenizer

# Instantiate the tokenizer published with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

# Tokenizing the text
def tokenize_data(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The tokenizer is called with padding and truncation enabled, a maximum
    length of 512, and ``return_tensors='pt'``; whatever it returns is passed
    straight back to the caller.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

# Pull the cleaned articles out as a plain list and encode them in one call
train_texts = fake_df.loc[:, 'cleaned_text'].tolist()
train_encodings = tokenize_data(texts=train_texts)

# Inspect the resulting encodings
print(train_encodings)


In [None]:
from transformers import BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch

# Define Dataset Class
class FakeNewsDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to a
            container indexable by example position, such as a tensor or a
            list of lists.
        labels: Sequence with one label per example; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor.

        Existing tensors are copied with ``clone().detach()`` because passing
        a tensor to ``torch.tensor`` emits a copy-construct ``UserWarning`` on
        every access; everything else goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

# Build the training dataset and a shuffled mini-batch loader over it
labels = fake_df['label'].tolist()  # Assumes the targets live in a 'label' column
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=labels)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

# Pre-trained BERT configured for two output labels (fake or not)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Optimizer over all model parameters
from torch.optim import AdamW
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Training loop (simplified)
# Use the GPU when one is available, otherwise fall back to the CPU. The model
# and every batch must be on the same device, so the model is moved here too;
# Module.to() moves parameters in place, so the existing optimizer stays valid.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
for epoch in range(3):  # Train for 3 epochs
    epoch_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()  # Clear gradients
        inputs = {key: value.to(device) for key, value in batch.items()}  # Move the batch to the model's device
        outputs = model(**inputs)  # Forward pass
        loss = outputs.loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than only the final batch's
    # loss; max(..., 1) keeps an empty loader from dividing by zero.
    print(f'Epoch {epoch+1} completed, loss: {epoch_loss / max(num_batches, 1)}')


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import Trainer, TrainingArguments

# Load and tokenize the test set
test_df = pd.read_csv('path_to_your_test.csv')  # Adjust the path for your test data
# 'cleaned_text' is only created in memory on fake_df, so a CSV loaded from
# disk will normally lack it. Derive it from 'text' with the same cleaning
# used for training; missing cells become '' so clean_text never sees NaN.
if 'cleaned_text' not in test_df.columns:
    test_df['cleaned_text'] = test_df['text'].fillna('').apply(clean_text)
# An empty cleaned string saved to CSV is read back as NaN; map it back to ''.
test_texts = test_df['cleaned_text'].fillna('').tolist()
test_labels = test_df['label'].tolist()

test_encodings = tokenize_data(test_texts)
test_dataset = FakeNewsDataset(test_encodings, test_labels)

# Initialize the Trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                          # the model to be trained
    args=training_args,                   # training arguments
    train_dataset=train_dataset,          # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
)

# Evaluate the model. The result is captured and printed because a bare
# expression in the middle of a cell is not displayed.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Get predictions and calculate metrics manually if required
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=-1)  # highest-scoring class per example

precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load DistilBERT tokenizer and model.
# NOTE: this rebinds the module-level `tokenizer` and `model`, so
# tokenize_data() and the training loop use DistilBERT from this cell onward.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the text using DistilBERT tokenizer
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

# Rebuild everything that was derived from the previous encodings and model.
# Otherwise the training loop would still iterate over the dataset built from
# the earlier encodings and step an optimizer that holds the earlier model's
# parameters, leaving the DistilBERT weights untouched.
train_dataset = FakeNewsDataset(train_encodings, labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model (similar to the BERT training process)
# Re-run the training loop above: `model`, `train_loader` and `optimizer` now all refer to DistilBERT.
