In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re
from urllib.parse import urlparse
import joblib

In [25]:
# Read the fake-news CSV, tag every row with class 0 (fake), and keep only
# the headline, the body and the label (headline renamed to 'title_text').
fake_df = pd.read_csv('Fake.csv').assign(label=0)
fake_df = fake_df[['title', 'text', 'label']].rename(columns={'title': 'title_text'})

# Placeholder columns: a synthetic URL suffixed with the row index and a
# fixed image path shared by every fake article.
fake_df['url'] = 'http://fake-site.com/article' + fake_df.index.astype(str)
fake_df['image_path'] = 'images/fake_placeholder.jpg'

# Report missing values per column, then discard rows that lack a headline
# or a body.
print("Fake News NaN Counts:")
print(fake_df.isna().sum())
fake_df = fake_df.dropna(subset=['title_text', 'text'])

# Sanity checks: class counts, column layout and the first few rows.
print("Fake News Label Distribution:")
print(fake_df['label'].value_counts())
print("Fake News Columns:", list(fake_df.columns))
print("Fake News Preview:")
print(fake_df.head())

Fake News NaN Counts:
title_text    0
text          0
label         0
url           0
image_path    0
dtype: int64
Fake News Label Distribution:
label
0    23481
Name: count, dtype: int64
Fake News Columns: ['title_text', 'text', 'label', 'url', 'image_path']
Fake News Preview:
                                          title_text  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  label  \
0  Donald Trump just couldn t wish all Americans ...      0   
1  House Intelligence Committee Chairman Devin Nu...      0   
2  On Friday, it was revealed that former Milwauk...      0   
3  On Christmas day, Donald Trump announced that ...      0   
4  Pope Francis used his annual Christmas Day mes...      0   

     

In [26]:
# Real articles, source 1: True.csv, tagged with class 1 (real), reduced to
# headline / body / label, plus placeholder URL and image path.
true_df = pd.read_csv('True.csv').assign(label=1)
true_df = true_df[['title', 'text', 'label']].rename(columns={'title': 'title_text'})
true_df['url'] = 'https://reuters.com/article' + true_df.index.astype(str)
true_df['image_path'] = 'images/real_placeholder.jpg'

# Real articles, source 2: the first len(fake_df) records of the
# line-delimited News Category JSON, mapped onto the same column names.
news_df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
news_df = (
    news_df[['headline', 'short_description', 'link']]
    .head(len(fake_df))
    .rename(columns={'headline': 'title_text', 'short_description': 'text', 'link': 'url'})
    .assign(label=1, image_path='images/real_placeholder.jpg')
)

# Stack both real sources into one frame with a fresh 0..n-1 index.
real_df = pd.concat([true_df, news_df], ignore_index=True)

# Report missing values per column, then discard rows that lack a headline
# or a body.
print("Real News NaN Counts:")
print(real_df.isna().sum())
real_df = real_df.dropna(subset=['title_text', 'text'])

# Sanity checks: class counts, column layout and the first few rows.
print("Real News Label Distribution:")
print(real_df['label'].value_counts())
print("Real News Columns:", list(real_df.columns))
print("Real News Preview:")
print(real_df.head())

Real News NaN Counts:
title_text    0
text          0
label         0
url           0
image_path    0
dtype: int64
Real News Label Distribution:
label
1    44898
Name: count, dtype: int64
Real News Columns: ['title_text', 'text', 'label', 'url', 'image_path']
Real News Preview:
                                          title_text  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text  label  \
0  WASHINGTON (Reuters) - The head of a conservat...      1   
1  WASHINGTON (Reuters) - Transgender people will...      1   
2  WASHINGTON (Reuters) - The special counsel inv...      1   
3  WASHINGTON (Reuters) - Trump campaign adviser ...      1   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...      1   

     

In [27]:
# Stack fake and real articles, shuffle the rows reproducibly (seed 42) and
# renumber the index.
df = (
    pd.concat([fake_df, real_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report missing values, drop incomplete rows and store the label as int.
print("Combined Dataset NaN Counts:")
print(df.isna().sum())
df = df.dropna(subset=['label', 'title_text', 'text']).astype({'label': int})

# Both classes must be present before anything is written to disk.
print("Combined Dataset Label Distribution:")
print(df['label'].value_counts())
assert df['label'].nunique() == 2, "Only one class found in combined dataset!"

# Persist the merged dataset and show a short summary.
df.to_csv('combined_news_dataset.csv', index=False)
print("Combined Dataset Saved as 'combined_news_dataset.csv'!")
print("Combined Dataset Preview:")
print(df.head())
print("Dataset Size:", df.shape)

Combined Dataset NaN Counts:
title_text    0
text          0
label         0
url           0
image_path    0
dtype: int64
Combined Dataset Label Distribution:
label
1    44898
0    23481
Name: count, dtype: int64
Combined Dataset Saved as 'combined_news_dataset.csv'!
Combined Dataset Preview:
                                          title_text  \
0  FIVE REASONS You Should Vote For Donald Trump ...   
1  Trump to meet House, Senate tax cut negotiator...   
2  UN To Scale Up Humanitarian Operations In Ukra...   
3  Rudy Giuliani Reverses Trump Team's Position, ...   
4  Doug Jones Says Congress Should 'Move On' From...   

                                                text  label  \
1  WASHINGTON (Reuters) - President Donald Trump ...      1   
2  The UN will allocate $20 million to “help with...      1   
3  Obstruction of justice charges are reportedly ...      1   
4  Earlier this month, Alabama's senator-elect ca...      1   

                                                 url 

In [28]:
# Text normalisation helper
def clean_text(text):
    """Lower-case the input and keep only ASCII letters and whitespace.

    The argument is passed through ``str()`` first, so non-string values are
    cleaned via their string form (``None`` becomes ``"none"``). Digits,
    punctuation and non-ASCII characters are removed without inserting a
    separator; whitespace is left untouched.
    """
    lowered = str(text).lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# One document per article: headline, a single space, then the body; the
# cleaned version is what gets vectorised.
df = df.assign(full_text=lambda frame: frame['title_text'] + ' ' + frame['text'])
df = df.assign(clean_text=df['full_text'].map(clean_text))

# TF-IDF over the cleaned documents, capped at 500 terms and converted to a
# dense array.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before the
# train/test split performed later in the notebook.
tfidf = TfidfVectorizer(max_features=500)
text_features = tfidf.fit_transform(df['clean_text']).toarray()
print("Text Features Shape:", text_features.shape)

Text Features Shape: (68379, 500)


In [29]:
# Placeholder image features (one row per article, two columns).
#
# These values must NOT be derived from df['label']: a feature computed from
# the target leaks the answer into X, which makes held-out scores meaningless
# and cannot be reproduced at prediction time, when the label is unknown.
# No per-article image descriptors exist yet, so an all-zero block keeps the
# feature layout (n_rows, 2) without carrying any class information.
# TODO: replace with descriptors computed from real per-article images once
# they are available (the current image_path values are per-class
# placeholders, so they must not be used as a signal either).
image_features = np.zeros((len(df), 2), dtype=int)
print("Image Features Shape:", image_features.shape)

# Extract URL features
def extract_url_features(url):
    """Return ``[domain_length, is_https]`` for a URL.

    Parameters
    ----------
    url : any
        The URL; it is converted with ``str()`` before parsing, so missing
        values (e.g. NaN) are parsed as their string form and yield
        ``[0, 0]``.

    Returns
    -------
    list of int
        ``domain_length`` is the length of the network-location part
        (``netloc``) of the parsed URL. ``is_https`` is 1 only when the URL's
        *scheme* is https, and 0 otherwise.
    """
    parsed = urlparse(str(url))
    # Test the parsed scheme rather than searching the whole string: a
    # substring check would also fire on 'https' appearing in the path or
    # query of a plain-http URL. urlparse lower-cases the scheme, so
    # 'HTTPS://...' is recognised as well.
    is_https = 1 if parsed.scheme == 'https' else 0
    return [len(parsed.netloc), is_https]

# Apply the URL feature extractor to every row and stack the results into a
# 2-D array (one row per article).
url_features = np.array(list(map(extract_url_features, df['url'])))
print("URL Features Shape:", url_features.shape)

Image Features Shape: (68379, 2)
URL Features Shape: (68379, 2)


In [30]:
# Assemble the design matrix by placing the text, image and URL feature
# blocks side by side; the target is the integer label column.
# NOTE(review): df['url'] holds per-source placeholder URLs for the fake and
# True.csv rows, so url_features can act as a proxy for the label - confirm
# this is intended before trusting the evaluation.
X = np.concatenate([text_features, image_features, url_features], axis=1)
y = df['label'].to_numpy()

# Target sanity checks: exactly two classes and no missing values.
classes_in_y = np.unique(y)
nan_mask = np.isnan(y)
print("Classes in y:", classes_in_y)
print("NaN in y:", nan_mask.sum())
assert len(classes_in_y) == 2, "Only one class in y!"
assert not nan_mask.any(), "y contains NaN!"

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)
print("Training Labels Distribution:", np.bincount(y_train))
print("Testing Labels Distribution:", np.bincount(y_test))

Classes in y: [0 1]
NaN in y: 0
Training Features Shape: (47865, 504)
Testing Features Shape: (20514, 504)
Training Labels Distribution: [16516 31349]
Testing Labels Distribution: [ 6965 13549]


In [31]:
# Fit a class-weighted logistic-regression classifier on the training split
# (up to 1000 solver iterations).
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Persist the fitted classifier first, then the TF-IDF vectorizer it relies on.
for artifact, filename in (
    (model, 'my_fake_news_model.pkl'),
    (tfidf, 'my_tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)
print("Your model and vectorizer saved as 'my_fake_news_model.pkl' and 'my_tfidf_vectorizer.pkl'!")

Your model and vectorizer saved as 'my_fake_news_model.pkl' and 'my_tfidf_vectorizer.pkl'!


In [32]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Build the per-class metrics as a dict and print them as a table with one
# row per class / average.
report = classification_report(y_test, y_pred, output_dict=True)
report_table = pd.DataFrame(report).transpose()
print("Classification Report for Your Model:")
print(report_table)

Classification Report for Your Model:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0   6965.0
1                   1.0     1.0       1.0  13549.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0  20514.0
weighted avg        1.0     1.0       1.0  20514.0
