In [1]:
!pip install bert-for-tf2
!pip install sentencepiece



In [3]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import pandas as pd
import re
import csv
import numpy as np
import math

In [3]:
print(tf.__version__)

2.4.1


In [4]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [4]:
train_all = pd.read_csv("all.tsv", delimiter = "\t")
train_all.isnull().values.any()
train_all.shape

(12791, 14)

In [6]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [7]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
articles = []
labels = []

with open("all.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        labels.append(row[2])
        if(len(row)>13):
            article = row[2]+row[3]+row[4]+row[5]+row[7]+row[13]
        else:
            article = row[1]
        articles.append(preprocess_text(article))

print(len(articles))

12791


In [5]:
print(train_all.label.unique())

['false' 'halfTrue' 'mostlyTrue' 'true' 'barelyTrue' 'pantsFire']


In [9]:
def tokenize_articles(txt):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))

In [10]:
tokenized_articles = [tokenize_articles(article) for article in articles]
print(len(tokenized_articles))

12791


In [11]:
y = train_all['label']
label_val = ['true', 'halfTrue', 'mostlyTrue', 'barelyTrue', 'false', 'pantsFire']
#label_val = ['real', 'fake']
label_token = [0, 0, 0, 1, 1, 1]
y = np.array(list(map(lambda x: label_token[label_val.index(x)], y)))
print(y)

[1 0 0 ... 1 1 1]


In [12]:
input_with_len = [[tokenized_articles[i], y[i], len(tokenized_articles[i])]
                 for i in range(0, len(articles))]

In [13]:
input_with_len.sort(key=lambda x: x[2])
print(input_with_len[0])

[[6270], 1, 1]


In [14]:
sorted_articles_labels = [(article_lab[0], article_lab[1]) for article_lab in input_with_len]
print(len(sorted_articles_labels))
print(sorted_articles_labels[0])

12791
([6270], 1)


In [15]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_articles_labels, output_types=(tf.int32, tf.int32))

In [16]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 16), dtype=int32, numpy=
 array([[ 6270,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [ 6270,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [ 2006,  8639,  2510,  5558,  7295, 19186,  2226,  5205,  2890,
         14289, 16558,  5555,  2078,     0,     0,     0],
        [10210,  2102, 19615,  2038,  4013,  3601,  2501, 11324, 21197,
          6644, 10643,  8540,  2063,     0,     0,     0],
        [ 2006, 12195, 15827,  2943, 20709,  3600,  8112, 28994,  5178,
          3372,  3207,  5302, 23185,     0,     0,     0],
        [ 8112, 25316, 22555,  5160,  2005,  2012,  3425,  3423,  3314,
         20740,  6856,  8540,  5243,  2739,  2713,     0],
        [ 3032, 18897,  8112,  5747,  3097,  3343,  2510, 20444,  2078,
          9056, 24147, 22084,  1056, 28394,  2102,     0],
        [ 2019, 10812,  5653,  275

In [17]:
TOTAL_BATCHES = math.ceil(len(sorted_articles_labels) / BATCH_SIZE)
print(TOTAL_BATCHES)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

400


In [18]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [19]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 20

In [20]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [21]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [22]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x204857d0ba8>

In [23]:
results = text_model.evaluate(test_data)
print(results)

[2.592012882232666, 0.5859375]


In [24]:
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6104400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0

In [6]:
from collections import Counter
print(Counter(train_all['label'].values))

Counter({'halfTrue': 2627, 'false': 2507, 'mostlyTrue': 2454, 'barelyTrue': 2103, 'true': 2053, 'pantsFire': 1047})
