In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras import Sequential
from keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook asks for (stop-word list, punkt, WordNet).
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers for text cleaning: the WordNet lemmatizer and a set of
# English stop words for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load datasets
# NOTE(review): hard-coded paths with a leading backslash -- presumably
# placeholders; confirm the real data location before running elsewhere.
FAKE_CSV_PATH = r"\Fake.csv"
TRUE_CSV_PATH = r"\True.csv"

# Fixed seed for the shuffle below: without it the row order changes on every
# run, so the (seeded) train/test split downstream would still differ per run.
SHUFFLE_SEED = 42

df_fake = pd.read_csv(FAKE_CSV_PATH)
df_true = pd.read_csv(TRUE_CSV_PATH)

# Label datasets: 0 marks rows from the true-news file, 1 rows from the fake-news file
df_true['status'] = 0
df_fake['status'] = 1

# Merge both sources, retain only the title and label, drop rows with a
# missing value, then shuffle reproducibly. The index is reset last so it is
# contiguous (dropping rows after a reset would leave gaps).
df = (
    pd.concat([df_true, df_fake], ignore_index=True)[['title', 'status']]
    .dropna()
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Preprocessing function
def preprocess_text(text):
    """Normalise a piece of text into a space-joined string of lemmatized words.

    Steps, in order: delete every character that is not an ASCII letter or
    whitespace, lowercase the result, split on whitespace, discard tokens
    found in ``stop_words``, and lemmatize the survivors with ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return ' '.join(kept_lemmas)

# Clean every title, then count the whitespace-separated tokens left in each.
df['cleaned_title'] = df['title'].map(preprocess_text)
df['title_length'] = df['cleaned_title'].str.split().str.len()

# Token count of the longest cleaned title.
max_length = max(df['title_length'])

# Word embedding
def word_embedding(text, vocab_size=5000, max_length=40):
    """Encode text as a fixed-length vector of hashed word indices.

    Each word of ``text`` is mapped to an integer index by hashing into a
    space of size ``vocab_size`` (collisions are possible), and the resulting
    sequence is padded at the front to ``max_length`` entries.

    Args:
        text: Already-cleaned text whose words are separated by spaces.
        vocab_size: Size of the hashing space for word indices.
        max_length: Length of the returned vector.

    Returns:
        A 1-D array of ``max_length`` integer indices.
    """
    # Imported here so the function carries its own dependency.
    from tensorflow.keras.preprocessing.text import hashing_trick

    # one_hot() hashes with Python's built-in hash(), which is salted per
    # interpreter process, so the same word gets a different index after a
    # kernel restart. md5 is stable across runs, which keeps indices (and any
    # saved model trained on them) valid between sessions.
    encoded = hashing_trick(text, vocab_size, hash_function='md5')
    padded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    return padded[0]

# Encode every cleaned title as a fixed-length vector of word indices.
vocab_size, max_length = 5000, 40
df['embedded_title'] = df['cleaned_title'].apply(
    word_embedding, vocab_size=vocab_size, max_length=max_length
)

# Feature matrix (one row per title) and the matching label vector.
X = np.array(list(df['embedded_title']))
y = df['status'].to_numpy()

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Build the classifier layer by layer: embedding -> LSTM -> sigmoid output unit.
embedded_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedded_features, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy with Adam; accuracy is tracked during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jones\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jones\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jones\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:

# Fit for 10 epochs in batches of 64; X_test/y_test are used as the
# validation data reported after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

# Evaluate model
def evaluate_model(threshold=0.4):
    """Print a confusion matrix and classification report for the test split.

    Reads the module-level ``model``, ``X_test`` and ``y_test``.

    Args:
        threshold: Predicted values strictly greater than this are labelled 1,
            all others 0. Defaults to 0.4, the cut-off that used to be
            hard-coded here, so existing calls behave as before.

    Returns:
        None. The metrics are printed.
    """
    print("\nModel Evaluation")
    probabilities = model.predict(X_test)
    # Binarise the model output and flatten it to one label per test row.
    y_pred = (probabilities > threshold).astype(int).flatten()
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


evaluate_model()


Epoch 1/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 24ms/step - accuracy: 0.8525 - loss: 0.3423 - val_accuracy: 0.9302 - val_loss: 0.1669
Epoch 2/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9539 - loss: 0.1230 - val_accuracy: 0.9410 - val_loss: 0.1519
Epoch 3/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.9635 - loss: 0.0951 - val_accuracy: 0.9419 - val_loss: 0.1586
Epoch 4/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.9733 - loss: 0.0756 - val_accuracy: 0.9393 - val_loss: 0.1710
Epoch 5/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9768 - loss: 0.0644 - val_accuracy: 0.9385 - val_loss: 0.1853
Epoch 6/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9805 - loss: 0.0556 - val_accuracy: 0.9382 - val_loss: 0.1952
Epoch 7/10
[1m5

In [4]:

# Function for real-time prediction
def predict_news(text, threshold=0.4):
    """Classify a single piece of text as 'Fake News' or 'Real News'.

    The text goes through the same ``preprocess_text`` and ``word_embedding``
    steps as the training data (using the module-level ``vocab_size`` and
    ``max_length``) before being scored by the module-level ``model``.

    Args:
        text: Raw text to classify.
        threshold: Model output strictly greater than this yields 'Fake News'.
            Defaults to 0.4, the cut-off that used to be hard-coded here, so
            existing calls behave as before.

    Returns:
        The string 'Fake News' or 'Real News'.
    """
    cleaned_text = preprocess_text(text)
    embedded_text = word_embedding(cleaned_text, vocab_size, max_length)
    # Wrap the vector in an outer array so the model receives a batch of one.
    prediction = model.predict(np.array([embedded_text]))
    fake_score = prediction[0][0]
    return 'Fake News' if fake_score > threshold else 'Real News'

# Run the classifier on one sample headline and print its verdict.
example_news = "Former CIA Director Slams Trump Over UN Bullying, Openly Suggests He’s Acting Like A Dictator (TWEET)"
example_verdict = predict_news(example_news)
print("Prediction:", example_verdict)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Prediction: Fake News
