In [46]:
!pip install langdetect
!pip install contractions

# Libraries for general purpose
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text cleaning
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

# Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from langdetect import detect, LangDetectException
import contractions
from nltk.tokenize import word_tokenize

# Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# PyTorch LSTM
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Tokenization for LSTM
from collections import Counter
from gensim.models import Word2Vec

# Transformers library for BERT
import transformers
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, confusion_matrix

import time

# Set seed for reproducibility
import random
seed_value = 2042
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Set style for plots
sns.set_style("whitegrid")
sns.despine()
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

#
text_col_name = 'clear_text'
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<Figure size 640x480 with 0 Axes>

In [47]:
torch.cuda.is_available()

True

In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
device = torch.device("cuda")

In [50]:
device

device(type='cuda')

## Text preprocess for LSTM

In [51]:
data = pd.read_csv('drive//MyDrive/data/Data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,subject,date,label,all_text,clear_text
0,0,politicsNews,2017-12-31 00:00:00,real,"As U.S. budget fight looms, Republicans flip t...",as us budget fight looms republicans flip thei...
1,1,politicsNews,2017-12-29 00:00:00,real,U.S. military to accept transgender recruits o...,us military to accept transgender recruits on ...
2,2,politicsNews,2017-12-31 00:00:00,real,Senior U.S. Republican senator: 'Let Mr. Muell...,senior us republican senator let mr mueller do...
3,3,politicsNews,2017-12-30 00:00:00,real,FBI Russia probe helped by Australian diplomat...,fbi russia probe helped by australian diplomat...
4,4,politicsNews,2017-12-29 00:00:00,real,Trump wants Postal Service to charge 'much mor...,trump wants postal service to charge much more...


In [52]:
def tokenize(data):
    data['tokens'] = data.apply(lambda row: word_tokenize(row['clear_text'].lower()), axis=1)

In [53]:
data = data.replace({'label': {'real': 1, 'fake': 0}})
tokenize(data)

In [54]:
X = data['clear_text'].to_list()

data['label']
y = data['label'].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [56]:
(unique, counts) = np.unique(y_train, return_counts=True)
np.asarray((unique, counts)).T

array([[    0, 11175],
       [    1, 13558]])

In [57]:
def sorted_corpus(data):
    ##Create vocabulary of words from column
    corpus = []
    for text in data['tokens']:
      corpus.extend(text)
    count_words = Counter(corpus)
    return count_words.most_common()

In [58]:
sorted_words = sorted_corpus(data)

In [59]:
def vocab_to_int(sorted_vocab, column):
    vocab_to_int = {word:n+1 for n, (word, counter) in enumerate(sorted_words)}
    text_int = []
    for text in column:
        article_num = [vocab_to_int[word] for word in text]
        text_int.append(article_num)
    return text_int

In [60]:
text_int = vocab_to_int(sorted_words, data['tokens'])

In [61]:
def pad_tokens(text_int, seq_len):
  ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for n, article in enumerate(text_int):
        if len(article) <= seq_len:
            zeros = list(np.zeros(seq_len - len(article)))
            new = zeros + article
        else:
            new = article[: seq_len]
        features[n, :] = np.array(new)

    return features

In [62]:
tokenized_column = pad_tokens(text_int, 50)

In [63]:
tokenized_column.shape

(38647, 50)

## Word2Vec Embedding

In [64]:
Word2vec_train_data = list(map(lambda x: x.split(), X_train))
EMBEDDING_DIM = 50
word2vec_model = Word2Vec(Word2vec_train_data, vector_size=EMBEDDING_DIM)

In [65]:
print(f"Vocabulary size: {len(sorted_words) + 1}")

Vocabulary size: 226636


In [66]:
VOCAB_SIZE = len(sorted_words) + 1 #+1 for the padding

In [67]:
sorted_words[0]

('the', 864439)

In [68]:
# Define an empty embedding matrix of shape (VOCAB_SIZE, EMBEDDING_DIM)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Fill the embedding matrix with pre-trained values from word2vec
for n, (word, token) in enumerate(sorted_words):
    # Check if the word is present in the word2vec model's vocabulary
    if word in word2vec_model.wv.key_to_index:
        # If the word is present, retrieve its embedding vector and add it to the embedding matrix
        embedding_vector = word2vec_model.wv[word]
        embedding_matrix[n] = embedding_vector

# Print the shape of the embedding matrix
print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (226636, 50)


In [69]:
X = tokenized_column
y = data['label'].values

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=seed_value)

## TORCH LOADER ALLELUJA

In [71]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))

In [72]:
BATCH_SIZE = 32

In [73]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=BATCH_SIZE, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE, drop_last=True)

In [74]:
class Attention(nn.Module):
    def __init__(self, hidden_dim, is_bidirectional):
        super(Attention, self).__init__()
        self.is_bidirectional = is_bidirectional
        # The attention linear layer which transforms the input data to the hidden space
        self.attn = nn.Linear(hidden_dim * (4 if is_bidirectional else 2), hidden_dim * (2 if is_bidirectional else 1))
        # The linear layer that calculates the attention scores
        self.v = nn.Linear(hidden_dim * (2 if is_bidirectional else 1), 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.size(1)
        # Concatenate the last two hidden states in case of a bidirectional LSTM
        if self.is_bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        else:
            hidden = hidden[-1]
        # Repeat the hidden state across the sequence length
        hidden_repeated = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        # Calculate attention weights
        attn_weights = torch.tanh(self.attn(torch.cat((hidden_repeated, encoder_outputs), dim=2)))
        # Compute attention scores
        attn_weights = self.v(attn_weights).squeeze(2)
        # Apply softmax to get valid probabilities
        return nn.functional.softmax(attn_weights, dim=1)


In [75]:

class LSTM_Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, lstm_layers, dropout, is_bidirectional):
        super(LSTM_Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = lstm_layers
        self.is_bidirectional = is_bidirectional

        # The Embedding layer that converts input words to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer which processes the embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, lstm_layers, batch_first=True, bidirectional=is_bidirectional)
        # Attention layer to compute the context vector
        self.attention = Attention(hidden_dim, is_bidirectional)
        # Fully connected layer which classifies the context vector into classes
        self.fc = nn.Linear(hidden_dim * (2 if is_bidirectional else 1), num_classes)
        # Apply LogSoftmax to outputs for numerical stability
        self.softmax = nn.LogSoftmax(dim=1)
        # Dropout layer for regularisation
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        # Transform words to embeddings
        embedded = self.embedding(x)
        # Pass embeddings to LSTM
        out, hidden = self.lstm(embedded, hidden)
        # Calculate attention weights
        attn_weights = self.attention(hidden[0], out)
        # Calculate context vector by taking the weighted sum of LSTM outputs
        context = attn_weights.unsqueeze(1).bmm(out).squeeze(1)
        # Classify the context vector
        out = self.softmax(self.fc(context))
        return out, hidden

    def init_hidden(self, batch_size):
        # Factor determines the size of hidden states depending on bidirectionality
        factor = 2 if self.is_bidirectional else 1
        # Initial hidden and cell states are zero
        h0 = torch.zeros(self.num_layers * factor, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers * factor, batch_size, self.hidden_dim).to(device)
        return (h0, c0)

In [76]:
NUM_CLASSES = 2 #We are dealing with a multiclass classification of 5 classes
HIDDEN_DIM = 50 #number of neurons of the internal state (internal neural network in the LSTM)
LSTM_LAYERS = 1 #Number of stacked LSTM layers

IS_BIDIRECTIONAL = False # Set this to False for unidirectional LSTM, and True for bidirectional LSTM

LR = 4e-4 #Learning rate
DROPOUT = 0.5 #LSTM Dropout
EPOCHS = 10 #Number of training epoch

model = LSTM_Classifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, LSTM_LAYERS, DROPOUT, IS_BIDIRECTIONAL)

model = model.to(device)

# Initialize the embedding layer with the previously defined embedding matrix
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
# Allow the embedding matrix to be fine-tuned to better adapt to our dataset and get higher accuracy
model.embedding.weight.requires_grad = True

# Set up the criterion (loss function)
criterion = nn.NLLLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay = 5e-6)

print(model)

LSTM_Classifier(
  (embedding): Embedding(226636, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (attention): Attention(
    (attn): Linear(in_features=100, out_features=50, bias=True)
    (v): Linear(in_features=50, out_features=1, bias=False)
  )
  (fc): Linear(in_features=50, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=1)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [77]:
total_step = len(train_loader)
total_step_val = len(valid_loader)

early_stopping_patience = 4
early_stopping_counter = 0

valid_acc_max = 0 # Initialize best accuracy top 0

for e in range(EPOCHS):

    #lists to host the train and validation losses of every batch for each epoch
    train_loss, valid_loss  = [], []
    #lists to host the train and validation accuracy of every batch for each epoch
    train_acc, valid_acc  = [], []

    #lists to host the train and validation predictions of every batch for each epoch
    y_train_list, y_val_list = [], []

    #initalize number of total and correctly classified texts during training and validation
    correct, correct_val = 0, 0
    total, total_val = 0, 0
    running_loss, running_loss_val = 0, 0


    ####TRAINING LOOP####

    model.train()

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device) #load features and targets in device

        h = model.init_hidden(labels.size(0))

        model.zero_grad() #reset gradients

        output, h = model(inputs,h) #get output and hidden states from LSTM network

        loss = criterion(output, labels)
        loss.backward()

        running_loss += loss.item()

        optimizer.step()

        y_pred_train = torch.argmax(output, dim=1) #get tensor of predicted values on the training set
        y_train_list.extend(y_pred_train.squeeze().tolist()) #transform tensor to list and the values to the list

        correct += torch.sum(y_pred_train==labels).item() #count correctly classified texts per batch
        total += labels.size(0) #count total texts per batch

    train_loss.append(running_loss / total_step)
    train_acc.append(100 * correct / total)

    ####VALIDATION LOOP####

    with torch.no_grad():

        model.eval()

        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            val_h = model.init_hidden(labels.size(0))

            output, val_h = model(inputs, val_h)

            val_loss = criterion(output, labels)
            running_loss_val += val_loss.item()

            y_pred_val = torch.argmax(output, dim=1)
            y_val_list.extend(y_pred_val.squeeze().tolist())

            correct_val += torch.sum(y_pred_val==labels).item()
            total_val += labels.size(0)

        valid_loss.append(running_loss_val / total_step_val)
        valid_acc.append(100 * correct_val / total_val)

    #Save model if validation accuracy increases
    if np.mean(valid_acc) >= valid_acc_max:
        torch.save(model.state_dict(), './state_dict.pt')
        print(f'Epoch {e+1}:Validation accuracy increased ({valid_acc_max:.6f} --> {np.mean(valid_acc):.6f}).  Saving model ...')
        valid_acc_max = np.mean(valid_acc)
        early_stopping_counter=0 #reset counter if validation accuracy increases
    else:
        print(f'Epoch {e+1}:Validation accuracy did not increase')
        early_stopping_counter+=1 #increase counter if validation accuracy does not increase

    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', e+1)
        break

    print(f'\tTrain_loss : {np.mean(train_loss):.4f} Val_loss : {np.mean(valid_loss):.4f}')
    print(f'\tTrain_acc : {np.mean(train_acc):.3f}% Val_acc : {np.mean(valid_acc):.3f}%')

Epoch 1:Validation accuracy increased (0.000000 --> 99.838083).  Saving model ...
	Train_loss : 0.1023 Val_loss : 0.0117
	Train_acc : 95.114% Val_acc : 99.838%
Epoch 2:Validation accuracy did not increase
	Train_loss : 0.0138 Val_loss : 0.0124
	Train_acc : 99.761% Val_acc : 99.773%
Epoch 3:Validation accuracy increased (99.838083 --> 99.854275).  Saving model ...
	Train_loss : 0.0096 Val_loss : 0.0083
	Train_acc : 99.781% Val_acc : 99.854%
Epoch 4:Validation accuracy did not increase
	Train_loss : 0.0065 Val_loss : 0.0098
	Train_acc : 99.854% Val_acc : 99.790%
Epoch 5:Validation accuracy did not increase
	Train_loss : 0.0038 Val_loss : 0.0098
	Train_acc : 99.915% Val_acc : 99.757%
Epoch 6:Validation accuracy did not increase
	Train_loss : 0.0030 Val_loss : 0.0069
	Train_acc : 99.927% Val_acc : 99.838%
Epoch 7:Validation accuracy did not increase
	Train_loss : 0.0019 Val_loss : 0.0095
	Train_acc : 99.960% Val_acc : 99.822%
Epoch 8:Validation accuracy did not increase
Early stopped at ep

In [78]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

<All keys matched successfully>

In [79]:
def evaluate_model(model, test_loader):
    model.eval()
    y_pred_list = []
    y_test_list = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            test_h = model.init_hidden(labels.size(0))

            output, val_h = model(inputs, test_h)
            y_pred_test = torch.argmax(output, dim=1)
            y_pred_list.extend(y_pred_test.squeeze().tolist())
            y_test_list.extend(labels.squeeze().tolist())

    return y_pred_list, y_test_list

y_pred_list, y_test_list = evaluate_model(model, test_loader)

In [80]:
print('Classification Report for Bi-LSTM :\n', classification_report(y_test_list, y_pred_list, target_names=['fake', 'true']))

Classification Report for Bi-LSTM :
               precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3485
        true       1.00      1.00      1.00      4227

    accuracy                           1.00      7712
   macro avg       1.00      1.00      1.00      7712
weighted avg       1.00      1.00      1.00      7712



In [81]:
test_article = "Philosophers amongst you will be familiar with the work of Rene Descartes – a mathematician, epistemologist, and rationalist – much of his work laid the ground for modern philosophy and in particular the strand that has grown out of Hobbes and Locke that informs a lot of the 17th century and the formation of states and societies thereafter. \
There is one eerie and unsettling aspect of his life that is gaining greater attention. Descartes had a relationship with a servant (Helen van der Strom), and their relationship produced a young daughter Francine, to whom Descartes was very attached. Tragically, Francine died of scarlet fever, aged five, and so distraught was Descartes that he had a robot or automata (clockwork, lifelike doll) built in her likeness.\
He transported this ‘doll’ with him whenever he travelled (in a casket), and on a trip to visit Queen Christina of Sweden, the crew of the ship on which he was travelling became so alarmed (it was a stormy night) by the robot and Descartes murmurings with it, that they invaded his quarters, seized and broke the ‘doll’ and threw it overboard. Descartes was further traumatized, and whilst it is not clear the incident immediately impacted his health, he died soon after.\
Technology scares us Descartes ‘doll’ is enjoying renewed attention for what it suggests about the relations between humans and machines, how robots can potentially replace and even supplant humans in different ways and for the manner in which this can cause consternation.\
The relationship between human and machine is a theme that will cut through the advance (or decline) of the world, and we have written about it frequently (i.e. ‘Talos’). As my limited vision can perceive, will attempt a classification that says there are at least two aspects of this megatrend – the risks that machines take over our (human) world (AI), and the risks that machine led worlds start to exist outside the human one (Defi, Web3/metaverse).\
The bad news is that in the case of the former, there is an unknown risk that machines could injure the human race (weaponized AI, the use of AI by ‘bad’ humans and the use of robots in war not to mention the creation of chemical and biological weapons by AI that I referred to in ‘The Final Problem’)"

In [90]:
def vocab_to_int_one(sorted_vocab, text):
    vocab_to_int = {word:n+1 for n, (word, counter) in enumerate(sorted_words)}
    text_int = []
    for word in text:
      try:
        text_int.append(vocab_to_int[word])
      except KeyError:
        continue
    return text_int


def pad_tokens_one(article, seq_len):
  ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    if len(article) <= seq_len:
        zeros = list(np.zeros(seq_len - len(article)))
        new = zeros + article
    else:
        new = article[: seq_len]
    return new

def text_preprocess(txt, sorted_vocab):
    txt_int = vocab_to_int_one(sorted_vocab, test_article)
    ready = pad_tokens_one(txt_int, 50)
    return ready


In [92]:
test_article_int = text_preprocess(test_article, sorted_words)

In [131]:
test_h = model.init_hidden(1)
input = torch.LongTensor([test_article_int]).to(device)
# test_h = (torch.zeros(1, 50).to(device), torch.zeros(1, 50).to(device))

test_h



(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0.]]], device='cuda:0'),
 tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0.]]], device='cuda:0'))

In [132]:
output, val_h = model(input, test_h)
output

tensor([[-3.9117e-04, -7.8466e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)

In [133]:
y_pred_test = torch.argmax(output, dim=1)

In [134]:
y_pred_test

tensor([0], device='cuda:0')

In [136]:
def predict(model, txt, sorted_words):
    txt_int = text_preprocess(txt, sorted_words)
    test_h = model.init_hidden(1)
    input = torch.LongTensor([test_article_int]).to(device)
    output, val_h = model(input, test_h)
    y_pred_test = int(torch.argmax(output, dim=1))
    return 'fake' if  y_pred_test==0 else 'true'

predict(model, test_article, sorted_words)



'fake'