In [1]:
!pip install bert-for-tf2
!pip install sentencepiece



In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import pandas as pd
import re
import csv
import numpy as np
import math

In [3]:
print(tf.__version__)

2.4.1


In [4]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [5]:
train_all = pd.read_csv("all.tsv", delimiter = "\t")
train_all.isnull().values.any()
train_all.shape

(12791, 14)

In [6]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [7]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
articles = []
labels = []

with open("all.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        labels.append(row[2])
        if(len(row)>13):
            article = row[2]+row[3]+row[4]+row[5]+row[7]+row[13]
        else:
            article = row[1]
        articles.append(preprocess_text(article))

print(len(articles))

12791


In [9]:
def tokenize_articles(txt):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))

In [10]:
tokenized_articles = [tokenize_articles(article) for article in articles]
print(len(tokenized_articles))

12791


In [11]:
y = train_all['label']
print(y)
label_val = ['true', 'halfTrue', 'mostlyTrue', 'barelyTrue', 'false', 'pantsFire']
#label_val = ['real', 'fake']
label_token = [0, 1, 2, 3, 4, 5]
y = np.array(list(map(lambda x: label_val.index(x), y)))
print(y)

0             false
1          halfTrue
2        mostlyTrue
3             false
4          halfTrue
            ...    
12786      halfTrue
12787    barelyTrue
12788    barelyTrue
12789    barelyTrue
12790         false
Name: label, Length: 12791, dtype: object
[4 1 2 ... 3 3 4]


In [12]:
input_with_len = [[tokenized_articles[i], y[i], len(tokenized_articles[i])]
                 for i in range(0, len(articles))]

In [13]:
input_with_len.sort(key=lambda x: x[2])
print(input_with_len[0])

[[6270], 4, 1]


In [14]:
import random

sorted_articles_labels = [(article_lab[0], article_lab[1]) for article_lab in input_with_len]
print(len(sorted_articles_labels))
print(sorted_articles_labels[0])
random.shuffle(sorted_articles_labels)

12791
([6270], 4)


In [15]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_articles_labels, output_types=(tf.int32, tf.int32))

In [26]:
BATCH_SIZE = 512
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
next(iter(batched_dataset))

(<tf.Tensor: shape=(512, 469), dtype=int32, numpy=
 array([[ 2026,  2767,  2057, ...,     0,     0,     0],
        [19369,  2317, 15666, ...,     0,     0,     0],
        [ 1037,  3660, 12211, ...,     0,     0,     0],
        ...,
        [ 2758,  2039,  2000, ...,     0,     0,     0],
        [ 2758,  1999,  4419, ...,     0,     0,     0],
        [ 2076,  1996,  3841, ...,     0,     0,     0]])>,
 <tf.Tensor: shape=(512,), dtype=int32, numpy=
 array([2, 1, 5, 0, 1, 1, 0, 2, 3, 3, 3, 4, 4, 2, 4, 0, 3, 1, 0, 0, 0, 2,
        2, 1, 0, 1, 4, 4, 1, 1, 3, 4, 4, 0, 3, 4, 1, 1, 1, 5, 2, 4, 2, 2,
        3, 0, 3, 4, 1, 0, 1, 5, 1, 1, 4, 5, 2, 1, 1, 1, 4, 3, 4, 5, 0, 3,
        3, 0, 3, 4, 5, 5, 3, 4, 1, 0, 4, 4, 3, 0, 0, 4, 2, 1, 4, 5, 0, 0,
        0, 4, 4, 1, 4, 3, 0, 4, 2, 2, 2, 4, 2, 3, 1, 3, 5, 4, 4, 2, 0, 1,
        0, 2, 1, 1, 1, 1, 4, 3, 4, 4, 4, 1, 0, 1, 5, 4, 1, 3, 3, 5, 1, 2,
        0, 1, 4, 4, 0, 2, 0, 1, 1, 2, 2, 3, 2, 3, 5, 1, 5, 0, 2, 3, 5, 3,
        1, 1, 5, 5, 4, 2, 

In [27]:
TOTAL_BATCHES = math.ceil(len(sorted_articles_labels) / BATCH_SIZE)
print(TOTAL_BATCHES)
TEST_BATCHES = TOTAL_BATCHES // 20
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

25


In [28]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Bidirectional(LSTM(embedding_dimensions))
        self.pool = layers.GlobalMaxPool1D()

        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [29]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 6

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [30]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [31]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [32]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x14f0cc12470>

In [33]:
results = text_model.evaluate(test_data)
print(results)

[2.2361536026000977, 0.234375]


In [25]:
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6104400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
bidirectional (Bidirectional multiple                  641600    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  153856    
_________________________________________________________________
dropout (Dropout)            multiple                  0