# Imports 

In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Define device for torch
use_cuda = True
print("CUDA is available:", torch.cuda.is_available())
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

CUDA is available: True


# Load Dataset
WELFake from HuggingFace

In [4]:
from datasets import load_dataset

dataset = load_dataset("davanstrien/WELFake")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 72134
    })
})


### Data Preprocessing

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [26]:
from collections import Counter

data = dataset["train"]
train_temp_split = data.train_test_split(test_size=0.20, stratify_by_column="label",seed=42)
final_dataset={
    "train":train_temp_split["train"],
    "val":train_temp_split["test"],
}
display(final_dataset)

print("Train label dist:", Counter(final_dataset['train']['label']))
print("Validation label dist:", Counter(final_dataset['val']['label']))

# Convert to pandas DataFrames
train_df = final_dataset['train'].to_pandas()
val_df = final_dataset['val'].to_pandas()

# Display the first few rows to confirm conversion
print("\nTrain DataFrame shape:", train_df.shape)
print("Validation DataFrame shape:", val_df.shape)
print("\nSample of training data:")
display(train_df.head(3))


{'train': Dataset({
     features: ['title', 'text', 'label'],
     num_rows: 57707
 }),
 'val': Dataset({
     features: ['title', 'text', 'label'],
     num_rows: 14427
 })}

Train label dist: Counter({1: 29685, 0: 28022})
Validation label dist: Counter({1: 7421, 0: 7006})

Train DataFrame shape: (57707, 3)
Validation DataFrame shape: (14427, 3)

Sample of training data:


Unnamed: 0,title,text,label
0,"Susan Collins Bucked Party, Voted To Protect ...",Even though Senator Susan Collins voted with t...,1
1,Peru prosecutor probing alleged abuse seeks to...,LIMA (Reuters) - A public prosecutor in Peru i...,0
2,US Officials See No Link Between Trump and Russia,Clinton Campaign Demands FBI Affirm Trump's Ru...,1


### Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

In [25]:
# Load FakeNewsNet dataset from local
politifact_fake = pd.read_csv('../fakeNewsNet/politifact_fake.csv')
politifact_real = pd.read_csv('../fakeNewsNet/politifact_real.csv')
gossipcop_fake = pd.read_csv('../fakeNewsNet/gossipcop_fake.csv')
gossipcop_real = pd.read_csv('../fakeNewsNet/gossipcop_real.csv')


politifact_fake['label'] = 1
politifact_real['label'] = 0    
gossipcop_fake['label'] = 1
gossipcop_real['label'] = 0
    
# title = text, if fake set = label = 1
# concat into one dataframe
fakenewsnet_df = pd.concat([
    politifact_fake,
    politifact_real,
    gossipcop_fake,
    gossipcop_real
])

fakenewsnet_df = fakenewsnet_df[['title','label']].dropna().rename(columns={'title':'text'})

# Verify
print("FakeNewsNet DataFrame shape:", fakenewsnet_df.shape)
print("Columns:", fakenewsnet_df.columns)
print("Label distribution:", fakenewsnet_df['label'].value_counts())

FakeNewsNet DataFrame shape: (23196, 2)
Columns: Index(['text', 'label'], dtype='object')
Label distribution: label
0    17441
1     5755
Name: count, dtype: int64


### Model Implementation

In [None]:
from sklearn.linear_model import LogisticRegression


# TF-IDF + Logistic Regression

X_train = vectorizer.fit_transform(train_df['text'])
y_train = train_df['label']
X_val = vectorizer.transform(val_df['text'])
y_val = val_df['label']
X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']

lr = LogisticRegression(C=1.0, random_state=42, max_iter=1000)
lr.fit(X_train, y_train)

AttributeError: 'NoneType' object has no attribute 'lower'

### Evaluation

In [None]:
from sklearn.metrics import classification_report

print("\nValidation Set Performance:")
print(classification_report(y_val, lr.predict(X_val)))
print("\nTest Set Performance:")
print(classification_report(y_test, lr.predict(X_test)))