In [3]:
import zipfile

# Unpack every member of the dataset archive into the "fake_news_data" folder
# (the path is relative, so it is resolved against the current working directory).
with zipfile.ZipFile("dataset_fnd.zip", mode='r') as archive:
    archive.extractall(path="fake_news_data")

In [6]:
import pandas as pd

# Load the datasets.
# The archive is extracted into the relative folder "fake_news_data", so the CSVs
# are read from that same relative location rather than from a hard-coded absolute
# "/content/..." path, which only works when the working directory is /content.
DATA_DIR = "fake_news_data/News _dataset"  # folder name (including the space) as used by the original paths
true_news = pd.read_csv(f"{DATA_DIR}/True.csv")
fake_news = pd.read_csv(f"{DATA_DIR}/Fake.csv")

In [7]:
# Labeling: 1 = Real, 0 = Fake
true_news['label'] = 1
fake_news['label'] = 0

# Combine & shuffle.
# The shuffle is seeded: without random_state the row order changes on every run,
# so the later train/test split (and every reported metric) would not be
# reproducible even though the split itself uses a fixed seed.
df = (
    pd.concat([true_news, fake_news])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the columns the model uses: the article text and its label
df = df[['text', 'label']]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state seeds the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Fit the TF-IDF vocabulary on the training text only, then apply that same
# fitted vectorizer to the test text so the held-out rows do not influence it.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier (default settings) on the TF-IDF
# training features; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out rows and report the fraction predicted correctly.
y_pred = model.predict(X_test_tfidf)
test_set_accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", test_set_accuracy)

Model Accuracy: 0.987750556792873


In [10]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set (0 = Fake, 1 = Real).
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4744
           1       0.98      0.99      0.99      4236

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



To check whether the model is overfitting, we compare its training and test accuracy.
If the two are similar, the model is not overfitting.

In [12]:
# Score the fitted model on the data it was trained on and on the held-out data;
# a large gap between the two would point to overfitting.
train_accuracy = model.score(X_train_tfidf, y_train)
test_accuracy = model.score(X_test_tfidf, y_test)

# Print both figures with the same "<split> Accuracy:" label format.
for split_name, split_accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{split_name} Accuracy:", split_accuracy)

Training Accuracy: 0.9904226293223454
Test Accuracy: 0.987750556792873


The difference between training and test accuracy is only ~0.27 percentage points, which is very small.
So the model generalizes well to the held-out test set and is not just memorizing the training data.