In [None]:

#libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Layer, LayerNormalization, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib
#used for data cleaning
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

#Loading the fake news dataset
import kagglehub
#dataset_download returns the path to the dataset files, which is printed below
path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")
print("Path to dataset files:", path)

import os
#Print the full path of every file found under the Kaggle input directory
#NOTE(review): this walks the hardcoded '/kaggle/input' rather than `path` from above - confirm both point at the same location
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#Creating the dataframes for true and fake
#dataframeT is read from True.csv and dataframeF from Fake.csv
#NOTE(review): both paths are hardcoded to the Kaggle input mount instead of being built from `path` - confirm before running outside Kaggle
#NOTE(review): the folder name 'News _dataset' contains a space; presumably that matches the dataset's real layout - verify
dataframeT = pd.read_csv('/kaggle/input/fake-news-detection-datasets/News _dataset/True.csv')
dataframeF = pd.read_csv('/kaggle/input/fake-news-detection-datasets/News _dataset/Fake.csv')

In [None]:
#Display the dataframe loaded from True.csv as the cell output
dataframeT

In [None]:
#Display the dataframe loaded from Fake.csv as the cell output
dataframeF

In [None]:
#Combining the datasets into one dataframe
#Label convention: 1 = rows from the true dataframe, 0 = rows from the fake dataframe
#(the 'label' column is added to dataframeT and dataframeF themselves, not to copies)
dataframeT['label'] = 1
dataframeF['label'] = 0
#ignore_index=True renumbers the combined rows instead of keeping each frame's own index
dataframe = pd.concat([dataframeT, dataframeF], ignore_index=True)

In [None]:
#Fetch the NLTK 'stopwords' corpus, then keep the English stop words in a set
#(clean_text below reads this module-level `stop_words` when filtering tokens)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#Patterns are compiled once because clean_text is applied to every row of the dataframe
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_EDGE_PUNCTUATION_RE = re.compile(r'^[^\w\s]+|[^\w\s]+$')
#Curly apostrophes are mapped to the straight one so "don’t" can match the stop word "don't"
_APOSTROPHE_MAP = str.maketrans({'\u2018': "'", '\u2019': "'"})


def clean_text(text, stopword_set=None):
    """Normalize one document for vectorization.

    Each whitespace-separated token has its punctuation and digits removed and
    is lowercased; tokens that are stop words, or that end up empty, are dropped.
    The surviving tokens are joined with single spaces.

    Stop words are checked twice: once on the token with only its surrounding
    punctuation removed, and once on the fully stripped token. The first check
    is needed because stripping punctuation turns contractions such as "don't"
    into "dont", which is not in the stop word list and would otherwise be kept.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing cell, or None) is treated as empty.
        stopword_set: Optional collection of lowercase stop words. Defaults to
            the module-level ``stop_words``.

    Returns:
        The cleaned text as a ``str``; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        #re.sub would raise TypeError on NaN/None, so missing values become empty text
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for token in text.split():
        #Contraction check: compare before the inner apostrophe is stripped away
        bare = _EDGE_PUNCTUATION_RE.sub('', token.lower().translate(_APOSTROPHE_MAP))
        if bare in stopword_set:
            continue
        word = _PUNCTUATION_RE.sub('', token)  #removes punctuation
        word = _DIGITS_RE.sub('', word)  #removes numbers
        word = word.lower()  #turns everything to lowercase
        #Tokens made only of punctuation/digits are empty at this point and are skipped
        if word and word not in stopword_set:
            kept.append(word)
    return ' '.join(kept)

dataframe['text'] = dataframe['text'].apply(clean_text) #runs clean_text on every entry and overwrites the 'text' column with the result

In [None]:
#Features are the cleaned text, targets are the labels; 20% of the rows are held out for testing
X, y = dataframe['text'], dataframe['label']
#A fixed random_state keeps the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=142857,
)

In [None]:
#Turn the text into TF-IDF vectors, limited to 5000 features
vectorizer = TfidfVectorizer(max_features=5000)
#The vectorizer is fitted on the training split only; the test split is just transformed
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
#Save the fitted vectorizer to disk
joblib.dump(vectorizer, "/kaggle/working/version4_vectorizer.pkl")

In [None]:
#This project uses the multinomial Naive Bayes model
#fit() returns the estimator itself, so construction and training are chained
model = MultinomialNB().fit(X_train_vectorized, y_train)
#Save the trained model to disk
joblib.dump(model, "/kaggle/working/version4_NB.pkl")

In [None]:
#Evaluate the trained model on the held-out test split
predictions = model.predict(X_test_vectorized) #stores all of the predictions into a variable
print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}") #compares with actual results, shown to two decimals
#Print the classification report comparing the true labels with the predictions
print(classification_report(y_test, predictions))