In [3]:
import os
import pandas as pd # Import pandas here


files_in_content = os.listdir('/content/')
print("Files in /content/ directory:", files_in_content)

data_file_name = 'FakeNewsNet.csv'

if data_file_name in files_in_content:
    data_file_path = os.path.join('/content/', data_file_name)
    print(f"\nIdentified potential data file: {data_file_path}")
    df = pd.read_csv(data_file_path)
    display(df.head())
    print(df.shape)
else:
    print(f"\nData file '{data_file_name}' not found in /content/ directory.")


Files in /content/ directory: ['.config', 'FakeNewsNet.csv', 'FakeNewsClassifierUsingLSTM.ipynb', 'sample_data']

Identified potential data file: /content/FakeNewsNet.csv


Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


(23196, 5)


In [4]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Handle missing values in the 'title' column
df['title'] = df['title'].fillna('')

# Preprocessing function
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters
    return text

import re # Import the re module

df['cleaned_title'] = df['title'].apply(preprocess_text)

# Tokenization
max_words = 10000 # Maximum number of words to keep
tokenizer = Tokenizer(num_words=max_words, oov_token="<oov>")
tokenizer.fit_on_texts(df['cleaned_title'])
sequences = tokenizer.texts_to_sequences(df['cleaned_title'])

# Padding
max_length = 100 # Maximum length of sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

print("Original title:", df['title'][0])
print("Cleaned title:", df['cleaned_title'][0])
print("Tokenized sequence:", sequences[0])
print("Padded sequence:", padded_sequences[0])
print("Shape of padded sequences:", padded_sequences.shape)

Original title: Kandi Burruss Explodes Over Rape Accusation on 'Real Housewives of Atlanta' Reunion (Video)
Cleaned title: kandi burruss explodes over rape accusation on real housewives of atlanta reunion video
Tokenized sequence: [4548, 5098, 3760, 45, 1030, 5099, 8, 117, 273, 5, 1100, 236, 67]
Padded sequence: [4548 5098 3760   45 1030 5099    8  117  273    5 1100  236   67    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
Shape of padded sequences: (23196, 100)


In [5]:
from sklearn.model_selection import train_test_split

# Assuming 'real' is the target variable (0 for fake, 1 for real)
labels = df['real'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (18556, 100)
Shape of X_test: (4640, 100)
Shape of y_train: (18556,)
Shape of y_test: (4640,)


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

embedding_dim = 16

model = Sequential([
    Embedding(max_words, embedding_dim), # Removed input_length
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(6, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid') # Sigmoid for binary classification
])

model.summary()

In [8]:
from tensorflow.keras.optimizers import Adam

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

model.summary()

In [9]:
num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))

Epoch 1/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.7515 - loss: 0.5536 - val_accuracy: 0.8220 - val_loss: 0.4169
Epoch 2/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.8163 - loss: 0.3884 - val_accuracy: 0.8321 - val_loss: 0.3852
Epoch 3/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 20ms/step - accuracy: 0.8614 - loss: 0.3265 - val_accuracy: 0.8351 - val_loss: 0.3915
Epoch 4/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.8727 - loss: 0.2917 - val_accuracy: 0.8364 - val_loss: 0.4417
Epoch 5/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20ms/step - accuracy: 0.8882 - loss: 0.2609 - val_accuracy: 0.8325 - val_loss: 0.4553
Epoch 6/10
[1m580/580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.8950 - loss: 0.2374 - val_accuracy: 0.8213 - val_loss: 0.4653
Epoch 7/10
[1m5

In [11]:
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8037 - loss: 0.6975
Test Loss: 0.7079
Test Accuracy: 0.8054


In [12]:
model.summary()