<a href="https://colab.research.google.com/github/NavanjanaLAV/SE4050-deeplearning-2025/blob/IT22113054LSTM/LMTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import os
os.listdir()

['.config', 'sample_data']

In [6]:
true_df = pd.read_csv("/content/sample_data/fake and true news/Fake.csv")
fake_df = pd.read_csv("/content/sample_data/fake and true news/True.csv")

In [7]:
# Add a label column: 1 for real, 0 for fake
true_df['label'] = 1
fake_df['label'] = 0

In [8]:
# Combine the two dataframes
data = pd.concat([true_df, fake_df], axis=0).reset_index(drop=True)

In [9]:
# Shuffle the rows to mix fake and real articles
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
# Display the first few rows
print(data.head())

# Check the label distribution
print(data['label'].value_counts())

                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  
label
1    23481
0    21417
Name: count, dtype: in

In [11]:
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = text.strip()
    return text

data['text'] = data['text'].apply(clean_text)


In [13]:
data.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,st century wire says ben stein reputable profe...,US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,washington reuters us president donald trump ...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,reuters puerto rico governor ricardo rossello...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,on monday donald trump once again embarrassed ...,News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,glasgow scotland reuters most us presidential...,politicsNews,"June 24, 2016",0


In [14]:
X = data['text'].values
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
vocab_size = 10000  # number of words to keep
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [16]:
max_length = 300  # typical for news text

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

print(X_train_pad.shape, X_test_pad.shape)


(35918, 300) (8980, 300)


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=300),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [18]:
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2
)


Epoch 1/5
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 1s/step - accuracy: 0.7585 - loss: 0.4916 - val_accuracy: 0.8064 - val_loss: 0.3731
Epoch 2/5
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 1s/step - accuracy: 0.8695 - loss: 0.3484 - val_accuracy: 0.9291 - val_loss: 0.2184
Epoch 3/5
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 1s/step - accuracy: 0.9052 - loss: 0.2705 - val_accuracy: 0.9582 - val_loss: 0.1296
Epoch 4/5
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 1s/step - accuracy: 0.9585 - loss: 0.1339 - val_accuracy: 0.9971 - val_loss: 0.0138
Epoch 5/5
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 1s/step - accuracy: 0.9948 - loss: 0.0224 - val_accuracy: 0.9975 - val_loss: 0.0084


In [19]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 130ms/step - accuracy: 0.9987 - loss: 0.0064
Test Accuracy: 1.00
