In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Load the dataset (relative path: resolved against the notebook's working directory)
data = pd.read_csv("cleaned_dataset.csv")

# Download required NLTK resources (if not already downloaded).
# - "stopwords" backs stopwords.words("english").
# - "punkt" is the tokenizer model word_tokenize loads on older NLTK releases.
# - "punkt_tab" is what newer NLTK releases look up instead; without it
#   word_tokenize raises LookupError there. Fetching both keeps the notebook
#   runnable from a fresh kernel on either version.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

True

In [6]:
# Set of English stopwords
stop_words = set(stopwords.words("english"))


def extract_keywords(raw_text, excluded_words):
    """Return the lowercase keyword tokens found in ``raw_text``.

    The text is split with ``word_tokenize``. A token is kept only when it is
    purely alphabetic (``str.isalpha`` on the token as tokenized) and its
    lowercase form is not in ``excluded_words``.

    Parameters:
        raw_text: string to tokenize.
        excluded_words: container of lowercase words to drop (e.g. stopwords).

    Returns:
        List of lowercase tokens, in order of appearance, duplicates included.
    """
    keywords = []
    for token in word_tokenize(raw_text):
        if not token.isalpha():
            continue
        lowered = token.lower()  # lowercase once; reused for lookup and result
        if lowered not in excluded_words:
            keywords.append(lowered)
    return keywords


# Initialize counters for titles and text
title_counter = Counter()
text_counter = Counter()

# Count keywords for rows labelled "Fake". A row contributes only when BOTH its
# title and its text are strings; rows where either is a non-string (such as
# the NaN pandas uses for missing cells) are skipped entirely.
# NOTE(review): "title" appears among the top title keywords in the recorded
# output; possibly an artifact of placeholder titles (e.g. "no title", where
# "no" is a stopword) -- confirm against the dataset.
for title, text, label in zip(data["title"], data["text"], data["label"]):
    if label != "Fake" or not (isinstance(title, str) and isinstance(text, str)):
        continue
    title_counter.update(extract_keywords(title, stop_words))
    text_counter.update(extract_keywords(text, stop_words))


In [8]:
# Take the five most frequent keywords from each counter as (word, count)
# pairs, most frequent first.
top_keywords_title, top_keywords_text = (
    counter.most_common(5) for counter in (title_counter, text_counter)
)

In [9]:
# Report the title keywords: a heading followed by one "<word>: <count> times"
# line per (word, count) pair.
title_report = ["Top 5 Keywords Associated with Fake News Titles:"]
title_report.extend(f"{word}: {freq} times" for word, freq in top_keywords_title)
print("\n".join(title_report))

Top 5 Keywords Associated with Fake News Titles:
trump: 135 times
hillary: 129 times
clinton: 121 times
title: 91 times
us: 59 times


In [10]:
# Report the body-text keywords: a heading followed by one
# "<word>: <count> times" line per (word, count) pair.
text_report = ["Top 5 Keywords Associated with Fake News Texts:"]
text_report.extend(f"{word}: {freq} times" for word, freq in top_keywords_text)
print("\n".join(text_report))

Top 5 Keywords Associated with Fake News Texts:
clinton: 1990 times
trump: 1975 times
one: 1419 times
us: 1385 times
said: 1359 times
