In [19]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

Additional Tasks
Task 5: Transformer-Based Embeddings and Classification

Downloading the dataset into Google Collab

In [20]:
!kaggle datasets download -d emineyetm/fake-news-detection-datasets

Dataset URL: https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets
License(s): unknown
fake-news-detection-datasets.zip: Skipping, found more recently modified local copy (use --force to force download)


Unzip the datasets

In [21]:
import zipfile
zip_ref = zipfile.ZipFile('/content/fake-news-detection-datasets.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

Importing libraries

In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Define the file paths
true_news_path = '/content/News _dataset/True.csv'
fake_news_path = '/content/News _dataset/Fake.csv'

# Read the datasets
true_news_df = pd.read_csv(true_news_path)
fake_news_df = pd.read_csv(fake_news_path)

In [25]:
# Print the contents of the True news dataset
print("True News Dataset:")
true_news_df.head()  # Print the first 5 rows

True News Dataset:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [26]:
# Print the contents of the Fake news dataset
print("\nFake News Dataset:")
fake_news_df.head()  # Print the first 5 rows


Fake News Dataset:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
# combining datasets
true_news_df['label'] = 1  # Label for true news
fake_news_df['label'] = 0   # Label for fake news
combined_df = pd.concat([true_news_df, fake_news_df], ignore_index=True)

In [6]:
combined_df.sample(5)

Unnamed: 0,title,text,subject,date,label
21853,Mother Of DACA Recipient Who Died Rescuing Fl...,Donald Trump is set to end the Deferred Action...,News,"September 4, 2017",0
2139,Trump blames 'both sides' for Virginia violenc...,WASHINGTON/NEW YORK (Reuters) - U.S. President...,politicsNews,"August 15, 2017",1
2467,Putin: We'll have to retaliate against 'illega...,"SAVONLINNA, Finland (Reuters) - President Vlad...",politicsNews,"July 27, 2017",1
37377,Keith Olbermann to Betsy DeVos: “The Hurricane...,The lefty lunatic of the day just can t keep h...,Government News,"Aug 26, 2017",0
10554,"After primary win, Senate Banking chair may mo...",WASHINGTON (Reuters) - Senate Banking Committe...,politicsNews,"March 3, 2016",1


In [7]:
# Define a function to clean the text
def clean_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply the cleaning function to the 'text' column
combined_df['cleaned_text'] = combined_df['text'].apply(clean_text)

In [8]:
combined_df['cleaned_text'][0]

'washington reuters head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint 2018 keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection trump administration already willing say going increase nondefense discretionary spending 7 percent meadows chairman small influential house freedom caucus said program democrats saying 

Implement and Evaluate Transformer-Based Embeddings

In [32]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [9]:
# Prepare the data
X = combined_df['cleaned_text'].values
y = combined_df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Tokenize and generate embeddings in batches
max_length = 128  # Adjust based on your needs
batch_size = 8  # Adjust as needed

def generate_embeddings(texts):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        input_ids = []
        attention_masks = []
        for text in batch_texts:
            encoded_dict = tokenizer.encode_plus(
                                text,
                                add_special_tokens=True,
                                max_length=max_length,
                                pad_to_max_length=True,
                                return_attention_mask=True,
                                return_tensors='tf',
                           )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        input_ids = tf.concat(input_ids, axis=0)
        attention_masks = tf.concat(attention_masks, axis=0)
        embeddings = bert_model(input_ids, attention_mask=attention_masks)[0][:, 0, :]
        all_embeddings.append(embeddings)

    # Concatenate all batch embeddings
    return tf.concat(all_embeddings, axis=0)

In [38]:
train_embeddings = generate_embeddings(X_train)



In [39]:
test_embeddings = generate_embeddings(X_test)

In [41]:
# Reshape the embeddings to include a timestep dimension
train_embeddings = tf.expand_dims(train_embeddings, axis=1)  # Add timestep dimension
test_embeddings = tf.expand_dims(test_embeddings, axis=1)  # Add timestep dimension

In [42]:
# Build and train the RNN model
model = Sequential()
model.add(LSTM(64, input_shape=(train_embeddings.shape[1], train_embeddings.shape[2])))  # input_shape is now (timesteps, features)
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_embeddings, y_train, epochs=3, batch_size=32, validation_split=0.2)

  super().__init__(**kwargs)


Epoch 1/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9063 - loss: 0.2253 - val_accuracy: 0.9674 - val_loss: 0.0883
Epoch 2/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9665 - loss: 0.0889 - val_accuracy: 0.9570 - val_loss: 0.1054
Epoch 3/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9715 - loss: 0.0785 - val_accuracy: 0.9761 - val_loss: 0.0677


<keras.src.callbacks.history.History at 0x799db96cbeb0>

In [43]:
# Evaluate the model
loss, accuracy = model.evaluate(test_embeddings, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9765 - loss: 0.0649
Test Loss: 0.0638928934931755
Test Accuracy: 0.975278377532959


In [44]:
# Build and train the GRU model
gru_model = Sequential()
gru_model.add(tf.keras.layers.GRU(64, input_shape=(train_embeddings.shape[1], train_embeddings.shape[2])))
gru_model.add(Dense(1, activation='sigmoid'))
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gru_model.fit(train_embeddings, y_train, epochs=3, batch_size=32, validation_split=0.2)

Epoch 1/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9046 - loss: 0.2227 - val_accuracy: 0.9325 - val_loss: 0.1678
Epoch 2/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9641 - loss: 0.0950 - val_accuracy: 0.9635 - val_loss: 0.0976
Epoch 3/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9697 - loss: 0.0794 - val_accuracy: 0.9737 - val_loss: 0.0697


<keras.src.callbacks.history.History at 0x799db393bfd0>

In [45]:
# Evaluate the GRU model
loss, accuracy = gru_model.evaluate(test_embeddings, y_test)
print(f'GRU Test Loss: {loss}')
print(f'GRU Test Accuracy: {accuracy}')

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9763 - loss: 0.0666
GRU Test Loss: 0.06627225875854492
GRU Test Accuracy: 0.975612461566925


In [46]:
# Build and train the Bi-LSTM model
bi_lstm_model = Sequential()
bi_lstm_model.add(tf.keras.layers.Bidirectional(LSTM(64), input_shape=(train_embeddings.shape[1], train_embeddings.shape[2])))
bi_lstm_model.add(Dense(1, activation='sigmoid'))
bi_lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
bi_lstm_model.fit(train_embeddings, y_train, epochs=3, batch_size=32, validation_split=0.2)

Epoch 1/3


  super().__init__(**kwargs)


[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.9110 - loss: 0.2135 - val_accuracy: 0.9605 - val_loss: 0.1022
Epoch 2/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9659 - loss: 0.0895 - val_accuracy: 0.9688 - val_loss: 0.0818
Epoch 3/3
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9709 - loss: 0.0733 - val_accuracy: 0.9747 - val_loss: 0.0684


<keras.src.callbacks.history.History at 0x799db3f76650>

In [47]:
# Evaluate the Bi-LSTM model
loss, accuracy = bi_lstm_model.evaluate(test_embeddings, y_test)
print(f'Bi-LSTM Test Loss: {loss}')
print(f'Bi-LSTM Test Accuracy: {accuracy}')

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9757 - loss: 0.0629
Bi-LSTM Test Loss: 0.06397845596075058
Bi-LSTM Test Accuracy: 0.9753897786140442


In [10]:
# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=1) # Binary classification

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize and prepare data for BERT
def prepare_data(texts, labels):
  input_ids = []
  attention_masks = []
  for text in texts:
      encoded_dict = tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=128,
          padding='max_length',
          truncation=True,
          return_attention_mask=True,
          return_tensors='tf'
      )
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])

  return tf.concat(input_ids, axis=0), tf.concat(attention_masks, axis=0), tf.convert_to_tensor(labels)

In [13]:
train_input_ids, train_attention_masks, train_labels = prepare_data(X_train, y_train)
val_input_ids, val_attention_masks, val_labels = prepare_data(X_val, y_val)
test_input_ids, test_attention_masks, test_labels = prepare_data(X_test, y_test)

In [14]:
# Define the loss function
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Define the metrics
metrics = [tf.keras.metrics.BinaryAccuracy('accuracy')]

# Define the optimizer
optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-5)

# Define a training step function
@tf.function
def train_step(inputs, labels):
  with tf.GradientTape() as tape:
    predictions = model(inputs, training=True)['logits']
    loss = loss_fn(labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  metrics[0].update_state(labels, tf.sigmoid(predictions))  # Update accuracy metric
  return loss

# Training loop
epochs = 3
batch_size = 64
for epoch in range(epochs):
  for i in range(0, len(train_input_ids), batch_size):
    batch_input_ids = train_input_ids[i:i + batch_size]
    batch_attention_masks = train_attention_masks[i:i + batch_size]
    batch_labels = train_labels[i:i + batch_size]
    loss = train_step({'input_ids': batch_input_ids, 'attention_mask': batch_attention_masks}, batch_labels)
    print(f"Epoch {epoch + 1}, Batch {i // batch_size + 1}, Loss: {loss.numpy()}")

  # Reset metrics for the next epoch
  metrics[0].reset_state()

Epoch 1, Batch 1, Loss: 0.7131232023239136
Epoch 1, Batch 2, Loss: 0.694682776927948
Epoch 1, Batch 3, Loss: 0.6841837763786316
Epoch 1, Batch 4, Loss: 0.6651768684387207
Epoch 1, Batch 5, Loss: 0.6429643630981445
Epoch 1, Batch 6, Loss: 0.6498124003410339
Epoch 1, Batch 7, Loss: 0.5913978219032288
Epoch 1, Batch 8, Loss: 0.5805033445358276
Epoch 1, Batch 9, Loss: 0.533775806427002
Epoch 1, Batch 10, Loss: 0.5502156019210815
Epoch 1, Batch 11, Loss: 0.5049439072608948
Epoch 1, Batch 12, Loss: 0.4859670400619507
Epoch 1, Batch 13, Loss: 0.46355754137039185
Epoch 1, Batch 14, Loss: 0.43990808725357056
Epoch 1, Batch 15, Loss: 0.4100261330604553
Epoch 1, Batch 16, Loss: 0.4206002354621887
Epoch 1, Batch 17, Loss: 0.3789950907230377
Epoch 1, Batch 18, Loss: 0.4101824164390564
Epoch 1, Batch 19, Loss: 0.3441901206970215
Epoch 1, Batch 20, Loss: 0.3613189458847046
Epoch 1, Batch 21, Loss: 0.29232674837112427
Epoch 1, Batch 22, Loss: 0.3285531997680664
Epoch 1, Batch 23, Loss: 0.2478114366531

In [16]:
# Make predictions
predictions = model.predict({'input_ids': test_input_ids, 'attention_mask': test_attention_masks})
predicted_labels = (tf.sigmoid(predictions.logits) > 0.5).numpy().astype(int).flatten()



In [17]:
# Evaluate the model
accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels)
recall = recall_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)

In [18]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.9986636971046771
Precision: 0.9990651293237769
Recall: 0.99813200498132
F1-score: 0.9985983491667965
