In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras as keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


In [None]:
from tensorflow.keras import layers

In [None]:
import tensorflow as tf

In [None]:
from google.colab import drive

# Make Google Drive visible to this runtime; the CSV files are read from it below.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Import the dataset, and merge the fake and true news.

In [None]:
# Each CSV holds articles of a single class.
fake_news = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fake.csv')
true_news = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/True.csv')

# Label encoding used by the rest of the notebook: 0 = fake, 1 = true.
for frame, label in ((fake_news, 0), (true_news, 1)):
    frame['target'] = label

# Stack both classes, then discard the columns that are not used for modelling.
data = pd.concat([fake_news, true_news]).drop(columns=['title', 'subject', 'date'])
data.head()

Unnamed: 0,text,target
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


Convert the text to lowercase.

In [None]:
# Fold case so that e.g. "Trump" and "trump" are treated as the same word.
data['text'] = data['text'].map(str.lower)
data['text'].head()

0    donald trump just couldn t wish all americans ...
1    house intelligence committee chairman devin nu...
2    on friday, it was revealed that former milwauk...
3    on christmas day, donald trump announced that ...
4    pope francis used his annual christmas day mes...
Name: text, dtype: object

In [None]:
# Features are the article texts; labels are the 0/1 `target` column.
x, y = data['text'], data['target']

First, split the data: 50% is used for training and the other 50% is held out for the test and validation sets.

In [None]:
# Half of the data is used for training; the other half (x_temp / y_temp)
# is held out and divided again in the next step.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    train_size=0.5,
    test_size=0.5,
    random_state=334,
)


Split the held-out half so that 20% of the full dataset becomes the test set and the remaining 30% becomes the validation set.

In [None]:
# train_test_split returns the `train_size` portion first and the `test_size`
# portion second.  The held-out half is therefore divided 60/40:
#   validation = 0.6 * 50% = 30% of all data
#   test       = 0.4 * 50% = 20% of all data
# (The previous 0.2 / 0.8 arguments produced 10% validation and 40% test.)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    train_size=0.6,
    test_size=0.4,
    random_state=334,
)

Tokenize the texts first.

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')  # extra WordNet data needed for lemmatization

NUM_WORDS = 10000   # only the most frequent words are kept when encoding
MAX_SEQ_LEN = 200   # every article is padded / truncated to this many tokens

lemm = WordNetLemmatizer()
_WORD_RE = re.compile(r"[^\W\d_]+")  # a run of letters, i.e. one word
_lemma_cache = {}                    # word -> lemma, so each distinct word is looked up once


def lemmatize_text(text):
    """Lowercase `text` and replace every word in it by its WordNet lemma.

    Lemmatization groups the inflected forms of a word so they can be analysed
    as a single item.  `WordNetLemmatizer.lemmatize` expects ONE word; handing
    it a whole article returns the article unchanged, so the words are
    substituted one at a time here.  Punctuation, digits and whitespace are
    left exactly as they were.
    """
    def _lemma(match):
        word = match.group(0)
        lemma = _lemma_cache.get(word)
        if lemma is None:
            lemma = _lemma_cache[word] = lemm.lemmatize(word)
        return lemma

    return _WORD_RE.sub(_lemma, text.lower())


def encode_and_pad(tokenizer, texts):
    """Turn `texts` into integer sequences and a fixed-width padded matrix.

    Returns a `(sequences, padded)` pair.  Padding and truncation both happen
    at the END of a sequence ("post"); shorter articles are filled with zeros.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences, padding="post", truncating="post", maxlen=MAX_SEQ_LEN
    )
    return sequences, padded


x_train = x_train.apply(lemmatize_text)
x_test = x_test.apply(lemmatize_text)
x_val = x_val.apply(lemmatize_text)

# Unknown words are replaced by the <OOV> token.
token = Tokenizer(num_words=NUM_WORDS, oov_token="<OOV>")
# Fit on the training texts ONLY: building the vocabulary from the test or
# validation texts would leak information from the evaluation data.
token.fit_on_texts(x_train)

# word_index holds every distinct word seen during fitting (not just the top
# NUM_WORDS); the NUM_WORDS limit is applied in texts_to_sequences.
word_index = token.word_index

train_seq, train_padd = encode_and_pad(token, x_train)
test_seq, test_padd = encode_and_pad(token, x_test)
val_seq, val_padd = encode_and_pad(token, x_val)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
vocab_size = 20_000  # rows in the token-embedding table; token ids must stay below this
maxlen = 200  # model input length: number of tokens per news article

Building a transformer block.

In [None]:
class TransformerBlock(layers.Layer):
    """One transformer encoder block: self-attention plus a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input; also used as `key_dim`
            of the attention layer and as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`), which is
            required for the layer to be re-created from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        `training` is handed to the dropout layers.  It defaults to None so
        the layer can also be called without it, in which case the dropout
        layers decide for themselves whether they are in training mode.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`), which is
            required for the layer to be re-created from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can report the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        # Length of the last axis of the actual input (named differently from
        # the constructor argument to avoid shadowing it).
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings are broadcast across the leading axes of x.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

Set the model parameters.

In [None]:
# embed_dim: embedding size for each token
# num_heads: number of attention heads
# ff_dim:    hidden layer size of the feed-forward network inside the transformer
embed_dim, num_heads, ff_dim = 32, 2, 32

# Symbolic model input: `maxlen` token ids per example.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen, vocab_size=vocab_size, embed_dim=embed_dim
)


In [None]:
# Apply the token + position embedding to the symbolic input.
# NOTE: this rebinds `x`, which earlier in the notebook held the text column.
x = embedding_layer(inputs)

Build and fit the model.

In [None]:
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: pool over the sequence axis, then a small dense
# network with dropout before and after the hidden layer.
head_layers = (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
)
for head_layer in head_layers:
    x = head_layer(x)

# Two softmax outputs: column 0 = fake, column 1 = true.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Integer class labels + softmax outputs -> sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(
    x=train_padd,
    y=y_train,
    validation_data=(val_padd, y_val),
    epochs=2,
    batch_size=32,
)

Epoch 1/2
Epoch 2/2


Check the test accuracy. The model achieves about 99.9% accuracy on the test set.

In [None]:
# Displays [loss, accuracy] computed on the test split.
model.evaluate(x=test_padd, y=y_test)



[0.005689647514373064, 0.9988307356834412]

Get the predicted labels.

In [None]:
# One row of class probabilities per test article; the predicted label is
# the column holding the larger probability.
predict = model.predict(test_padd)
pred_labels = predict.argmax(axis=1)



Reset the index of y_test and x_test so that it runs from 0 to n-1 and lines up with the positions in `pred_labels`.

In [None]:
# Drop the original row labels so both series are indexed 0, 1, 2, ...
y_test_reset, x_test_reset = (
    series.reset_index(drop=True) for series in (y_test, x_test)
)

Find the index where the predicted labels and the true labels are different.

In [None]:
# Positions in the test set where the prediction disagrees with the true label.
index = np.where(pred_labels != y_test_reset.values)[0]

Here are some examples of incorrectly classified cases. For example, the first article is fake but is classified as true news. This type of expression can confuse the model: "Iran s Supreme Leader Ayatollah Ali Khamenei on Thursday said there is no guarantee a full agreement will be reached by the end of June, the AFP news agency reported."

In the second example, the true label is true news but the model says it is fake news. Unlike typical true news, it does not cite any reliable source; instead it conveys historical facts in a narrative tone. The content is also negative, with expressions like 'GOP leaders have unleashed a stunning level of vitriol against their party’s most successful presidential candidate.', so I think the model might consider it fake news of the kind often used for negative campaigning in politics.

In [None]:
# Predicted labels, true labels and texts of the misclassified test articles.
for mistakes in (pred_labels[index], y_test_reset[index], x_test_reset[index]):
    print(mistakes)

[1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1]
2108     0
3291     1
4155     0
4264     0
5873     0
5917     1
6577     1
6749     0
7854     1
8440     1
9351     0
10971    1
12419    0
12524    0
12540    0
12946    0
15128    0
17213    0
17391    0
17558    0
17877    0
Name: target, dtype: int64
2108      the nuclear industry is a necessity, for ener...
3291     gop leaders have unleashed a stunning level of...
4155     to hear donald trump tell it, he s the best, h...
4264     u.s.-led coalition air strikes killed dozens o...
5873      reuters is reporting: u.s. president donald t...
5917     charleston, s.c. (reuters) - the white former ...
6577     pyongyang - north koreans stage a demonstratio...
6749      a number of european politicians, including j...
7854     u.s. president barack obama visited a street m...
8440     (reuters) - the  class of 2012  grew up with m...
9351     donald trump has been fundraising like crazy f...
10971                                            

Take only the first 2000 samples from the test data.

In [None]:
N_MC_INPUTS = 2000  # number of test articles used for the MC-dropout analysis
test_2000 = test_padd[:N_MC_INPUTS]

Apply MC dropout.

In [None]:

N_MC_PASSES = 100  # stochastic forward passes per article

# training=True is forwarded to the model on every pass (this is what makes
# the dropout layers active at prediction time).  The passes are stacked
# along a new leading axis.
y_samples = np.stack(
    [model(test_2000, training=True) for _ in range(N_MC_PASSES)]
)

Find the empirical lower and upper bound for the prediction intervals.

In [None]:
# 2.5th and 97.5th percentiles across the MC passes (axis 0), computed in one
# call per class.  Class 0 = fake (suffix _1), class 1 = true (suffix _2).
lower_bound_1, upper_bound_1 = np.percentile(y_samples[:, :, 0], [2.5, 97.5], axis=0)
lower_bound_2, upper_bound_2 = np.percentile(y_samples[:, :, 1], [2.5, 97.5], axis=0)

95% prediction intervals for the fake-news and true-news probabilities, respectively, for each case.

In [None]:
# One row per article: [lower bound, upper bound].
pi_95_fake, pi_95_true = (
    np.column_stack(bounds)
    for bounds in (
        (lower_bound_1, upper_bound_1),
        (lower_bound_2, upper_bound_2),
    )
)

Find the widest 95% interval for the true-news probability.

In [None]:
# Interval width = upper bound - lower bound; select the article with the widest one.
argmax = (pi_95_true[:, 1] - pi_95_true[:, 0]).argmax(axis=0)
for shown in (pi_95_true[argmax], y_test_reset[argmax], x_test_reset[argmax]):
    print(shown)

[0.33392321 0.76636347]
1
 (the dec. 1 story was refiled to correct gates’ first name to richard in paragraph 3) (reuters) - michael flynn’s plea of guilty on friday to lying to the federal bureau of investigation made him the fourth person known to be charged in a u.s. justice department investigation of ties between president donald trump’s 2016 election campaign and russia. the other three known to be charged by special counsel robert mueller’s probe are: ** former trump campaign manager paul manafort and richard gates. a grand jury in october indicted manafort, a longtime republican political consultant, and gates, a business associate.  the two men pleaded not guilty on oct. 30 to the 12-count indictment, whose charges include conspiracy to launder money, conspiracy against the united states and failing to register as foreign agents of ukraine’s former pro-russian government.  manafort has agreed to an $11.65 million bail deal that would result in his release from house arrest and

The widest 95% interval for predicting fake news gives the same result (this is expected: the two softmax outputs sum to 1, so the two intervals always have the same width). The original text includes expressions like 'transportation minister yisrael katz told the jerusalem post'. Though its true label is fake news, it can fool machines by using this kind of expression. I found that it also fooled ChatGPT. (I re-ran the notebook to check that everything works before submitting it, so the text shown here will probably not be the same as the one I describe, but what I found was the fake news with the title 'ISRAEL WILL NAME New Train Station Near Western Wall After President Donald Trump'.)

In [None]:
# Interval width = upper bound - lower bound; select the article with the widest one.
argmax = (pi_95_fake[:, 1] - pi_95_fake[:, 0]).argmax(axis=0)
for shown in (pi_95_fake[argmax], y_test_reset[argmax], x_test_reset[argmax]):
    print(shown)

[0.2336365  0.66607672]
1
 (the dec. 1 story was refiled to correct gates’ first name to richard in paragraph 3) (reuters) - michael flynn’s plea of guilty on friday to lying to the federal bureau of investigation made him the fourth person known to be charged in a u.s. justice department investigation of ties between president donald trump’s 2016 election campaign and russia. the other three known to be charged by special counsel robert mueller’s probe are: ** former trump campaign manager paul manafort and richard gates. a grand jury in october indicted manafort, a longtime republican political consultant, and gates, a business associate.  the two men pleaded not guilty on oct. 30 to the 12-count indictment, whose charges include conspiracy to launder money, conspiracy against the united states and failing to register as foreign agents of ukraine’s former pro-russian government.  manafort has agreed to an $11.65 million bail deal that would result in his release from house arrest and