In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import files
import pickle  # Import pickle to save the tokenizer

In [None]:
# Load the dataset
# Two CSV files are read from the Colab working directory:
# 'Fake.csv' becomes fake_df and 'True.csv' becomes real_df.
fake_df, real_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Fake.csv', '/content/True.csv')
)


In [None]:
# Combine datasets
# Tag every row with its class before merging: 0 = fake, 1 = real.
fake_df['label'] = 0  # Fake label
real_df['label'] = 1  # Real label
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each source frame keeps its own row labels, so the same label shows up once
# per frame and label-based lookups (df.loc[i]) or index-aligned operations
# silently mix rows from both classes.
df = pd.concat([fake_df, real_df], ignore_index=True)

In [None]:
# Data Cleaning
# Patterns used by clean_text, compiled once instead of being looked up on
# every call. They are applied in the order listed in _CLEANING_STEPS.
_BRACKETED_RE = re.compile(r'\[.*?\]')          # text enclosed in [square brackets]
_URL_RE = re.compile(r'https?://\S+|www\.\S+')  # http(s):// and www. links
_HTML_TAG_RE = re.compile(r'<.*?>+')            # <tag>-like fragments
# The backslash is spelled '\\' so the literal no longer contains the invalid
# escape '\]' (a SyntaxWarning on recent Python versions); the resulting set
# of characters is unchanged.
_PUNCTUATION_RE = re.compile(
    r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~""")
)
_NEWLINE_RE = re.compile(r'\n')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')       # any word containing a digit

_CLEANING_STEPS = (
    _BRACKETED_RE,
    _URL_RE,
    _HTML_TAG_RE,
    _PUNCTUATION_RE,
    _NEWLINE_RE,
    _DIGIT_TOKEN_RE,
)


def clean_text(text):
    """Normalise one raw article string for tokenization.

    The text is lower-cased, then bracketed spans, URLs, HTML-like tags,
    ASCII punctuation, newline characters and every word containing a digit
    are deleted, in that order. Matches are removed outright (not replaced by
    a space), so surrounding whitespace is left as it was.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            float NaN pandas uses for an empty CSV cell) are treated as
            missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Missing values used to raise AttributeError on .lower(); map them to an
    # empty document instead so one bad row cannot abort the whole column.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # the words on either side of a line break get joined. Behaviour is kept
    # unchanged here; confirm whether that is intended.
    for pattern in _CLEANING_STEPS:
        text = pattern.sub('', text)
    return text

# Overwrite the 'text' column with its cleaned version, one article at a time.
df['text'] = df['text'].map(clean_text)

In [None]:

# Train / test split
# Features are the cleaned article text, targets are the 0/1 labels.
# 30% of the rows are held out for testing; random_state pins the shuffle.
X, y = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Tokenization
# The tokenizer is fitted on the training texts only; that same fitted
# tokenizer then encodes both the training and the test split, with "<OOV>"
# configured as the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
)

In [None]:
# Bring every encoded article to the same length (max_length) so the model
# receives uniformly shaped input; both splits use identical padding options.
max_length = 200
padding_options = {'maxlen': max_length, 'padding': 'post'}
X_train_padded = pad_sequences(X_train_sequences, **padding_options)
X_test_padded = pad_sequences(X_test_sequences, **padding_options)

In [None]:
# Persist the fitted tokenizer to disk, then offer the file for download
# through the Colab `files` helper.
tokenizer_file = 'tokenizer.pickle'
with open(tokenizer_file, mode='wb') as pickle_out:
    pickle.dump(tokenizer, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

files.download(tokenizer_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Model Architecture
# Embedding -> two stacked bidirectional LSTMs -> dense head with a single
# sigmoid unit for the binary fake/real decision.
#
# * input_dim is read from the tokenizer instead of repeating the literal
#   5000, so the embedding table always matches the vocabulary cap that was
#   used to encode the text.
# * The `input_length` argument is no longer passed: it is deprecated in
#   current Keras releases (it only triggers a warning there) and the sequence
#   length is picked up from the padded input when the model is first fitted.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    # return_sequences=True so the second recurrent layer receives one
    # vector per time step rather than only the final state.
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(128)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output), and accuracy reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the model on the padded training sequences for 5 epochs in batches of
# 64, with validation_split=0.2 reserving part of the training data for the
# per-epoch validation metrics.
model.fit(
    X_train_padded,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 49ms/step - accuracy: 0.8873 - loss: 0.2296 - val_accuracy: 0.9704 - val_loss: 0.0866
Epoch 2/5
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 48ms/step - accuracy: 0.9777 - loss: 0.0664 - val_accuracy: 0.9734 - val_loss: 0.0778
Epoch 3/5
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 51ms/step - accuracy: 0.9906 - loss: 0.0298 - val_accuracy: 0.9819 - val_loss: 0.0610
Epoch 4/5
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 51ms/step - accuracy: 0.9968 - loss: 0.0120 - val_accuracy: 0.9855 - val_loss: 0.0553
Epoch 5/5
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 50ms/step - accuracy: 0.9972 - loss: 0.0091 - val_accuracy: 0.9793 - val_loss: 0.0579


<keras.src.callbacks.history.History at 0x7a14db4cff70>

In [None]:
# Write the trained model to 'fake_news_model.h5' in the working directory.
save_model(model, 'fake_news_model.h5')



In [None]:

# Offer both saved artifacts for download, model first and then the tokenizer.
for artifact_name in ('fake_news_model.h5', 'tokenizer.pickle'):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Predictions on the test set
# The model's sigmoid outputs are thresholded at 0.5: values above it become
# 1, everything else 0, stored as int32.
test_scores = model.predict(X_test_padded)
y_pred = (test_scores > 0.5).astype("int32")

[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step


In [None]:

# Evaluate performance
# Score the thresholded test predictions against the true test labels with
# each metric in turn, then print the four results rounded to two decimals.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n'.join([
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1 Score: {f1:.2f}',
]))

Accuracy: 0.98
Precision: 0.99
Recall: 0.97
F1 Score: 0.98
