In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

### Load the Datasets

In [2]:
test = pd.read_csv('test_df_2.csv')

In [3]:
combined = pd.read_csv('combined_train_df_2.csv')

  combined = pd.read_csv('combined_train_df_2.csv')


In [4]:
test.head()

Unnamed: 0,id,dataset,text,text_clean,chunk_id,title,binary_label,label
0,LIAR 2_6178,LIAR 2,"According to the FBI, Mexican drug cartels are...",according to the fbi mexican drug cartels are ...,1,False,1,3.0
1,LIAR 2_18432,LIAR 2,Says Transportation Secretary Ray LaHood will ...,says transportation secretary ray lahood will ...,1,False,0,1.0
2,Kaggle 2 - News Project_53040,Kaggle 2 - News Project,the outlines of an agreement they say would st...,the outlines of an agreement they say would st...,3,False,1,
3,Kaggle 3 - Fake News Detection_19847,Kaggle 3 - Fake News Detection,asks why MGM didn't notice the behavior of gun...,asks why mgm didnt notice the behavior of gunm...,8,False,1,
4,Fakeddit_406006,Fakeddit,i made a us map and put a touch of red on her,i made a us map and put a touch of red on her,1,True,0,


In [5]:
combined.head()

Unnamed: 0,id,dataset,text,text_clean,chunk_id,title,binary_label,label
0,Fakeddit_0,Fakeddit,my walgreens offbrand mucinex was engraved wit...,my walgreens offbrand mucinex was engraved wit...,1.0,True,1.0,
1,Fakeddit_1,Fakeddit,this concerned sink with a tiny hat,this concerned sink with a tiny hat,1.0,True,0.0,
2,Fakeddit_2,Fakeddit,hackers leak emails from uae ambassador to us,hackers leak emails from uae ambassador to us,1.0,True,1.0,
3,Fakeddit_3,Fakeddit,this flower in my neighborhood,this flower in my neighborhood,1.0,True,1.0,
4,Fakeddit_4,Fakeddit,puppy taking in the view,puppy taking in the view,1.0,True,1.0,


### Data Processing

In [6]:
# Remove LIAR 2 middle category (half-true & half-false)
combined = combined[combined['label']!=3]
test = test[test['label']!=3]

### RNN

In [7]:
# Drop NAs and select relevant columns
combined = combined[['text_clean', 'binary_label', 'dataset']].dropna()
test = test[['text_clean', 'binary_label', 'dataset']].dropna()


# Extract components
train_texts = combined['text_clean'].tolist()
train_labels = combined['binary_label'].tolist()

val_texts = test['text_clean'].tolist()
val_labels = test['binary_label'].tolist()
val_sources = test['dataset'].tolist()  # for grouped eval

In [8]:
# Tokenize
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_seqs = tokenizer.texts_to_sequences(train_texts)
val_seqs = tokenizer.texts_to_sequences(val_texts)

max_len = 60

X_train = pad_sequences(train_seqs, maxlen=max_len, padding='post', truncating='post')
X_val = pad_sequences(val_seqs, maxlen=max_len, padding='post', truncating='post')

In [9]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d /content/


--2025-05-01 22:41:47--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-01 22:41:47--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-01 22:41:48--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [10]:
import os
print("/content/glove.6B.100d.txt" in os.listdir("/content"))

False


In [11]:
embedding_dim = 100
glove_path = '/content/glove.6B.100d.txt'

# Load GloVe
glove_dict = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_dict[word] = vector

# Build embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = glove_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [12]:
# Model Architecture
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-3),
    metrics=['accuracy']
)



In [13]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Fit the mdodel
history = model.fit(
    X_train, np.array(train_labels),
    validation_data=(X_val, np.array(val_labels)),
    batch_size=32,
    epochs=10,
    callbacks=[early_stop]
)

Epoch 1/10
[1m13102/13102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 9ms/step - accuracy: 0.7665 - loss: 0.4908 - val_accuracy: 0.5973 - val_loss: 0.7636
Epoch 2/10
[1m13102/13102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 9ms/step - accuracy: 0.8121 - loss: 0.4217 - val_accuracy: 0.5584 - val_loss: 0.8580
Epoch 3/10
[1m13102/13102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 9ms/step - accuracy: 0.8237 - loss: 0.4018 - val_accuracy: 0.5454 - val_loss: 0.9353


In [14]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Predict probabilities
y_val_probs = model.predict(X_val).ravel()
y_val_preds = (y_val_probs >= 0.5).astype(int)

# Classification metrics
print(classification_report(val_labels, y_val_preds, digits=4))

# AUC-ROC score
auc_score = roc_auc_score(val_labels, y_val_probs)
print(f"AUC-ROC: {auc_score:.4f}")


[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
              precision    recall  f1-score   support

           0     0.6042    0.4784    0.5340      9333
           1     0.5930    0.7080    0.6454     10017

    accuracy                         0.5973     19350
   macro avg     0.5986    0.5932    0.5897     19350
weighted avg     0.5984    0.5973    0.5917     19350

AUC-ROC: 0.6417


In [17]:
results_RNN = pd.DataFrame({
    'true_label': y_val_probs,
    'pred_label': y_val_preds,
    'dataset': val_sources
})