In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the news dataset from CSV and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs on other machines.
NEWS_CSV = r"Z:\Edu-versity\news.csv"
df = pd.read_csv(NEWS_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Separate features and labels

In [4]:
# Model input: the 'text' column of the loaded frame.
x = df.loc[:, 'text']
x.head()

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [5]:
# Prediction target: the 'label' column (values shown below are FAKE / REAL).
y = df.loc[:, 'label']
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
# Split the dataset
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Text Vectorization

In [8]:
# TF-IDF vectorizer: removes English stop words and ignores terms whose
# document frequency exceeds 70% of the documents (max_df=0.7).
extraction = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

In [9]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer on the test texts so the test split does not influence the features.
tfidf_train = extraction.fit_transform(raw_documents=x_train)
tfidf_test = extraction.transform(raw_documents=x_test)

In [10]:
# Build and train the model
# PassiveAggressiveClassifier shuffles the training data after each epoch, so
# without a fixed random_state every run produces slightly different weights
# and therefore different accuracy / confusion-matrix numbers. Seed it (same
# seed as the train/test split) so Restart & Run All reproduces the results.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(tfidf_train, y_train)

In [11]:
# Predict and evaluate
# Classify the held-out TF-IDF matrix, then summarise the predictions with a
# confusion matrix (rows = actual, columns = predicted, classes in sorted
# label order) and the overall accuracy.
y_pred = model.predict(tfidf_test)
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [12]:
# Report accuracy as a percentage rounded to two decimals, then the raw
# confusion matrix on the line after its heading.
print(f"Accuracy: {round(score*100,2)}%")
print("Confusion Matrix:", conf_mat, sep="\n")

Accuracy: 93.84%
Confusion Matrix:
[[593  35]
 [ 43 596]]


In [13]:
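# Reading the confusion matrix printed above (rows = actual, columns = predicted).
# Classes are in sorted order, so FAKE is the "Negative" class and REAL is the "Positive" class.
#593 → True Negatives (TN)  Model predicted Negative (FAKE) and it was actually Negative. (Correct prediction.)
#35 → False Positives (FP)  Model predicted Positive (REAL), but it was actually Negative (FAKE).
#43 → False Negatives (FN)  Model predicted Negative (FAKE), but it was actually Positive (REAL).
#596 → True Positives (TP)  Model predicted Positive (REAL) and it was actually Positive. (Correct prediction.)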

In [14]:
# Spot-check the fake-news classifier on a few hand-written headlines.
# NOTE(review): the training rows shown earlier look like full article bodies,
# while these samples are single sentences — predictions here may be unreliable.

sample_news = [
    "The government has announced a new healthcare policy to help rural areas.",
    "Aliens have landed in New York and taken over the Empire State Building!",
    "The stock market hit an all-time high today, with major gains in tech stocks.",
    "Scientists discover a new species of flying fish that can breathe on land."
]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
sample_tfidf = extraction.transform(sample_news)

# One predicted label per sample, in the same order as sample_news
predictions = model.predict(sample_tfidf)

# Show each headline with its predicted label, separated by an 80-character rule
for news, label in zip(sample_news, predictions):
    print(
        f"News: {news}",
        f"Prediction: {label}",
        "-" * 80,
        sep="\n",
    )

News: The government has announced a new healthcare policy to help rural areas.
Prediction: FAKE
--------------------------------------------------------------------------------
News: Aliens have landed in New York and taken over the Empire State Building!
Prediction: REAL
--------------------------------------------------------------------------------
News: The stock market hit an all-time high today, with major gains in tech stocks.
Prediction: FAKE
--------------------------------------------------------------------------------
News: Scientists discover a new species of flying fish that can breathe on land.
Prediction: FAKE
--------------------------------------------------------------------------------
