# Preprocessing

In [None]:
import pandas as pd
import numpy as np

# Load the two source files from the working directory.
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

# Binary target: 1 for rows read from True.csv, 0 for rows read from Fake.csv.
true['label'] = 1
fake['label'] = 0

# Stack both classes, drop the columns that are not used downstream, shuffle
# the rows and renumber them 0..n-1.
# The shuffle is seeded so the row order (and therefore the seeded train/test
# split performed later) is identical on every Restart & Run All.
# reset_index(drop=True) discards the old per-file row numbers directly
# instead of materialising them as an 'index' column and dropping it again.
news = (
    pd.concat([fake, true], axis=0)
    .drop(columns=['title', 'subject', 'date'])
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)


# Feature Extraction

In [None]:
import re

def wordopt(text):
    """Normalize one article's raw text before it is vectorized.

    Steps, in order: lowercase the text, strip every ``@`` together with the
    non-space characters that follow it, strip ``http(s)://`` and ``www.``
    URLs, strip runs of digits, and turn each newline into a space.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned ``str``. Whitespace left behind by removed tokens is not
        collapsed.
    """
    text = text.lower()
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # remove digits
    text = re.sub(r'\d+', '', text)

    # Replace newlines with a space rather than deleting them: deleting would
    # glue the last word of one line to the first word of the next line.
    text = re.sub(r'\n', ' ', text)

    return text

# Clean every article and write the result back onto the frame, then pull out
# the model input (x) and the target (y).
cleaned_text = news['text'].apply(wordopt)
news['text'] = cleaned_text

x, y = news['text'], news['label']



# Model Training (Logistic Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the articles for evaluation; stratifying on y keeps the
# class ratio the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

# Fit the TF-IDF vocabulary and weights on the training text only, then reuse
# the fitted vectorizer on the held-out text.
# NOTE: from here on x_train / x_test hold the vectorized matrices, not the
# raw text they held after the split.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = LogisticRegression().fit(x_train, y_train)

# Logistic Regression Report

In [30]:
# classification_report is imported here because the metrics import in the
# training cell only brings in accuracy_score; without this line the cell
# raises NameError on a fresh kernel.
from sklearn.metrics import classification_report

# Predict on the held-out TF-IDF features and report overall accuracy followed
# by per-class precision / recall / F1.
prediction = model.predict(x_test)
print(accuracy_score(y_test, prediction))

print(classification_report(y_test, prediction))

0.9868596881959911
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



# LSTM-Based Model

In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder

# Hyperparameters. VOCAB_SIZE is passed to both the Tokenizer and the
# Embedding layer; the two must agree, so it is defined once.
VOCAB_SIZE = 5000
EMBEDDING_DIM = 128
max_length = 300  # Adjust based on dataset analysis

# Tokenization
# The logistic-regression cell rebinds x_train / x_test to TF-IDF matrices.
# The tokenizer needs raw strings, so recover them from `news` using the row
# labels carried by y_train / y_test.
if not isinstance(x_train, pd.Series):
    x_train = news.loc[y_train.index, 'text']  # Extract original text
    x_test = news.loc[y_test.index, 'text']

# Build the word index from the training text only; words that are not kept
# in the index are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Bring every sequence to exactly max_length tokens; shorter sequences get
# their padding appended at the end ("post").
x_train_pad = pad_sequences(x_train_seq, maxlen=max_length, padding="post")
x_test_pad = pad_sequences(x_test_seq, maxlen=max_length, padding="post")

# Build LSTM model
# The Embedding layer is created without `input_length`: that argument is
# deprecated in Keras 3 and the sequence length is taken from the data at fit
# time.
model_lstm = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid")
])

model_lstm.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# NOTE(review): the held-out test split doubles as validation data here, so
# the val_* numbers and the final accuracy below come from the same rows.
model_lstm.fit(
    x_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(x_test_pad, y_test),
)

# Evaluate Model
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions.
y_pred_lstm = (model_lstm.predict(x_test_pad) > 0.5).astype("int32")
print(accuracy_score(y_test, y_pred_lstm))


Epoch 1/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 462ms/step - accuracy: 0.7921 - loss: 0.4074 - val_accuracy: 0.9242 - val_loss: 0.2138
Epoch 2/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 486ms/step - accuracy: 0.9325 - loss: 0.1923 - val_accuracy: 0.9463 - val_loss: 0.1653
Epoch 3/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 490ms/step - accuracy: 0.9384 - loss: 0.1695 - val_accuracy: 0.9568 - val_loss: 0.1337
Epoch 4/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 490ms/step - accuracy: 0.9605 - loss: 0.1232 - val_accuracy: 0.9621 - val_loss: 0.1199
Epoch 5/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 493ms/step - accuracy: 0.9635 - loss: 0.1118 - val_accuracy: 0.9867 - val_loss: 0.0515




[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 80ms/step
0.9867483296213808


In [36]:
# Persist the trained LSTM model to "lstm_fake_news_model.keras"; the path is
# relative, so the file is written to the kernel's current working directory.
model_lstm.save("lstm_fake_news_model.keras")
