In [4]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Read the cleaned dataset from a CSV file in the working directory
DATASET_PATH = "cleaned_dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [5]:
# Report how many rows have no value in the 'text' column
print("Missing values in 'text':", data['text'].isnull().sum())

# Keep only the rows labelled "Fake" and discard those whose 'text' is NaN,
# so the vectorizer below only ever receives real text values
fake_news_data = data.loc[data['label'] == "Fake"].dropna(subset=['text'])

# Word-count vectorizer that ignores scikit-learn's built-in English stop words
vectorizer = CountVectorizer(stop_words='english')

Missing values in 'text': 46


In [6]:
# Fit the vectorizer on the fake-news text and build the sparse document-term matrix
X = vectorizer.fit_transform(fake_news_data["text"])

# Total count of each term across all fake-news documents.
# Sum on the sparse matrix directly: X.toarray() would materialise a dense
# (documents x vocabulary) array just to add up its columns.
word_frequencies = np.asarray(X.sum(axis=0)).ravel()
feature_names = vectorizer.get_feature_names_out()

# The most frequent terms become the keyword set used by the predictor.
# Order is irrelevant because the result is stored as a set.
TOP_N_KEYWORDS = 10
keywords = set(feature_names[word_frequencies.argsort()[-TOP_N_KEYWORDS:]])

# Count occurrences of sites: all articles per site, and fake articles per site
site_counts = data["site_url"].value_counts()
fake_site_counts = data.loc[data["label"] == "Fake", "site_url"].value_counts()

# Fraction (0..1, despite the name) of each site's articles labelled "Fake".
# The division aligns on site_url; sites with no fake articles come out NaN
# and are set to 0.
fake_news_percentage = (fake_site_counts / site_counts).fillna(0)

In [7]:
# Prediction function
def fakenewsprediction(title, news_source, keyword_set=None,
                       source_fake_rates=None, threshold=0.5):
    """Classify a headline as "Fake News" or "Real News" with a simple rule.

    A headline is flagged as fake only when BOTH conditions hold:
    it contains at least one keyword as a whole word, and the fraction of
    fake articles recorded for its source is strictly above ``threshold``.

    Parameters
    ----------
    title : str
        Headline to classify. A non-string value (e.g. None or NaN) is
        treated as containing no keywords.
    news_source : str
        Site identifier looked up in ``source_fake_rates``; unknown sources
        are given a fake rate of 0.0.
    keyword_set : iterable of str, optional
        Lower-case keywords to look for. Defaults to the notebook-level
        ``keywords``.
    source_fake_rates : mapping with ``.get``, optional
        Maps a source to its fraction of fake articles (0..1). Defaults to
        the notebook-level ``fake_news_percentage``.
    threshold : float, optional
        Fake-rate cut-off; the source must exceed it. Defaults to 0.5.

    Returns
    -------
    str
        "Fake News" or "Real News".
    """
    if keyword_set is None:
        keyword_set = keywords
    if source_fake_rates is None:
        source_fake_rates = fake_news_percentage

    # Missing titles cannot contain a keyword
    if not isinstance(title, str):
        return "Real News"

    # Match whole tokens rather than substrings, so that a keyword such as
    # "new" does not fire on "news". The pattern is CountVectorizer's default
    # token_pattern, which keeps title tokens comparable to the keywords.
    title_tokens = set(re.findall(r"(?u)\b\w\w+\b", title.lower()))
    title_contains_keyword = not title_tokens.isdisjoint(keyword_set)
    source_fake_percentage = source_fake_rates.get(news_source, 0.0)

    if title_contains_keyword and source_fake_percentage > threshold:
        return "Fake News"
    return "Real News"

# Demo: classify one sample headline together with the site it came from
text_input = "Breaking: election week is over"
source_input = "der-postillon.com"

# Arguments are passed by keyword so the headline and source cannot be swapped
prediction = fakenewsprediction(title=text_input, news_source=source_input)
print(f"Prediction: {prediction}")


Prediction: Fake News
