Fact Checking system using Bert.<br>
To test the system just run all the cells :) (This version takes approximately 1 hour to run)

## Imports

In [0]:
# Necessary to install on Google Colab, outside it more will be necessary! (e.g. Tensorflow,Keras)

!pip install bert-tensorflow
!pip install nltk
!pip install google
!pip install readability-lxml
!pip install h5py

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 3.4MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1
Collecting readability-lxml
  Downloading https://files.pythonhosted.org/packages/af/a7/8ea52b2d3de4a95c3ed8255077618435546386e35af8969744c0fa82d0d6/readability-lxml-0.7.1.tar.gz
Collecting cssselect (from readability-lxml)
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Building wheels for collected packages: readability-lxml
  Building wheel for readability-lxml (setup.py) ... [?25l[?25hdone
  Created wheel for readability-lxml: filename=readability_lxml-0.7.1-cp36-none-any.whl size=16480 sha256=d1bff3453e0f23f3e2dde7b0c93e508a

In [0]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from googlesearch import search 
import json
import requests
from bs4 import BeautifulSoup
from readability import Document
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt')

sess = tf.Session()

bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
max_seq_length = 256

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Bert Tokenization Implementation Code


In [0]:
class PaddingInputExample(object):
  pass
class InputExample(object):
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

def create_tokenizer_from_hub_module():
    bert_module =  hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in examples:
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        #np.array(labels).reshape(-1, 1),
        np.array(labels),
    )

def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples
  
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()  
  
def convert_single_example_m(tokenizer, example, max_seq_length=256):
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)
      
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)    

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features_m(tokenizer, examples, max_seq_length=256):
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in examples:
        input_id, input_mask, segment_id, label = convert_single_example_m(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        #np.array(labels).reshape(-1, 1),
        np.array(labels),
    )

def convert_text_to_examples_m(texts,texts2, labels):
    InputExamples = []
    for text,text2, label in zip(texts,texts2, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=" ".join(text2), label=label)
        )
    return InputExamples

## Check Worthiness Dataset and Embeddings

Binary Text Classifier that learns to distinguish between what is a claim and what is a fact, claims are considered check-worthy. The dataset is better explained in my paper at http://ceur-ws.org/Vol-2380/paper_119.pdf where the fact that this dataset approximates well the concept of check-worthy is also proved.

In [0]:
!wget https://github.com/LFavano/utils/raw/master/mega.zip
!unzip mega.zip

dataset = pd.read_csv('mega.csv', sep='\t',error_bad_lines=False)

dataset['label']=dataset['label'].astype('int8')
dataset.info()
dataset = dataset.sample(frac=1).reset_index(drop=True)

totalRows = 50000
trainRatio = 0.8
splitIndex = int(trainRatio*totalRows)

dataset = dataset[:totalRows]

train_df = dataset[:splitIndex]
test_df = dataset[splitIndex:]

train_df.info()
test_df.info()

--2019-09-28 15:03:11--  https://github.com/LFavano/utils/raw/master/mega.zip
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/LFavano/utils/master/mega.zip [following]
--2019-09-28 15:03:11--  https://raw.githubusercontent.com/LFavano/utils/master/mega.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75380769 (72M) [application/zip]
Saving to: ‘mega.zip’


2019-09-28 15:03:12 (169 MB/s) - ‘mega.zip’ saved [75380769/75380769]

Archive:  mega.zip
  inflating: mega.csv                
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207328 entries, 0 to 2207327
Data columns (total 2 columns):
h

In [0]:
# Create datasets
train_text = train_df['headline_text'].tolist()
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = np.asarray(pd.get_dummies(train_df.label), dtype = np.int8)

test_text = test_df['headline_text'].tolist()
test_text = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = np.asarray(pd.get_dummies(test_df.label), dtype = np.int8)

In [0]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Convert to features
(train_input_ids, train_input_masks, train_segment_ids, train_labels 
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
(test_input_ids, test_input_masks, test_segment_ids, test_labels
) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








## Evidence Detection Dataset and Embeddings

Binary Text classifier that learn to distinguish between evidences that are supporting a claim versus evidences that are confuting it. 

In [0]:
# An experimental dataset that contains the complete articles too in addition to the claims, extracted from Politifact. I do not own this.
!wget https://github.com/LFavano/utils/raw/master/politifact10.zip 
!unzip politifact10.zip

dataset = pd.read_csv('politifact10.csv', sep='\t',error_bad_lines=False)

dataset.info()

# Just a way to set as 0 the labels Pants on Fire, False and Mostly False, while as 1 the labels True, Half True, Barely True
dataset.label = 5-dataset.label
dataset.label[dataset.label.isin([0,1,2])] = 0
dataset.label[dataset.label.isin([3,4,5])] = 1

train_df = dataset[:(int(trainRatio*len(dataset)))]
test_df = dataset[(int(trainRatio*len(dataset))):]

train_df.info()
test_df.info()

--2019-09-28 15:03:55--  https://github.com/LFavano/utils/raw/master/politifact10.zip
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/LFavano/utils/master/politifact10.zip [following]
--2019-09-28 15:03:55--  https://raw.githubusercontent.com/LFavano/utils/master/politifact10.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19220984 (18M) [application/zip]
Saving to: ‘politifact10.zip’


2019-09-28 15:03:56 (130 MB/s) - ‘politifact10.zip’ saved [19220984/19220984]

Archive:  politifact10.zip
  inflating: politifact10.csv        
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10464 entries, 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8371 entries, 0 to 8370
Data columns (total 7 columns):
index                      8371 non-null int64
Speaker                    8371 non-null object
label                      8371 non-null int64
claim                      8371 non-null object
PolitiFact explanations    8371 non-null object
url                        8371 non-null object
body                       8371 non-null object
dtypes: int64(2), object(5)
memory usage: 457.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093 entries, 8371 to 10463
Data columns (total 7 columns):
index                      2093 non-null int64
Speaker                    2093 non-null object
label                      2093 non-null int64
claim                      2093 non-null object
PolitiFact explanations    2093 non-null object
url                        2093 non-null object
body                       2093 non-null object
dtypes: int64(2), object(5)
memory usage: 114.5+ KB


In [0]:
# Create datasets
train_text = train_df['claim'].tolist()
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

train_text2 = train_df['body'].tolist()
train_text2 = [' '.join(t.split()[0:max_seq_length]) for t in train_text2]
train_text2 = np.array(train_text2, dtype=object)[:, np.newaxis]

train_label = np.asarray(pd.get_dummies(train_df.label), dtype = np.int8)

test_text = test_df['claim'].tolist()
test_text = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]

test_text2 = test_df['body'].tolist()
test_text2 = [' '.join(t.split()[0:max_seq_length]) for t in test_text2]
test_text2 = np.array(test_text2, dtype=object)[:, np.newaxis]

test_label = np.asarray(pd.get_dummies(test_df.label), dtype = np.int8)

In [0]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples_m(train_text, train_text2, train_label)
test_examples = convert_text_to_examples_m(test_text, test_text2, test_label)

# Convert to features
(train_input_ids2, train_input_masks2, train_segment_ids2, train_labels2 
) = convert_examples_to_features_m(tokenizer, train_examples, max_seq_length=max_seq_length)
(test_input_ids2, test_input_masks2, test_segment_ids2, test_labels2
) = convert_examples_to_features_m(tokenizer, test_examples, max_seq_length=max_seq_length)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


## Relevance Detection Dataset and Embeddings

Very simple text classifier that assign a score to how related the text of the article is with respect to the claim. The dataset is part of the "FakeNewsChallenge". http://www.fakenewschallenge.org/

In [0]:
!wget https://github.com/LFavano/utils/raw/master/train_bodies.csv
!wget https://github.com/LFavano/utils/raw/master/train_stances.csv
  
_COL_1 = ['Headline', 'Body ID', 'Stance']
_COL_2 = ['Body ID', 'articleBody']
_COL_3 = ['Headline', 'Body ID', 'Stance', 'articleBody']

link = pd.read_csv("train_stances.csv", index_col=None, names=_COL_1, sep=',')
bodies = pd.read_csv("train_bodies.csv", index_col=None, names=_COL_2, sep=',')

link.Stance[link.Stance=='agree'] = 1
link.Stance[link.Stance=='disagree'] = 1
link.Stance[link.Stance=='discuss'] = 1
link.Stance[link.Stance=='unrelated'] = 0

link = link[1:]
bodies = bodies [1:]

complete = link.merge(bodies, on='Body ID')
complete = complete[1:]
complete.info()

complete = complete.sample(frac=1).reset_index(drop=True)

train_df = complete[:10000]
test_df = complete[10000:12000]

train_df.info()
test_df.info()

--2019-09-28 15:04:53--  https://github.com/LFavano/utils/raw/master/train_bodies.csv
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/LFavano/utils/master/train_bodies.csv [following]
--2019-09-28 15:04:54--  https://raw.githubusercontent.com/LFavano/utils/master/train_bodies.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3752301 (3.6M) [text/plain]
Saving to: ‘train_bodies.csv’


2019-09-28 15:04:54 (44.6 MB/s) - ‘train_bodies.csv’ saved [3752301/3752301]

--2019-09-28 15:04:55--  https://github.com/LFavano/utils/raw/master/train_stances.csv
Resolving github.com (github.com)... 192.30.253.1

In [0]:
# Create datasets (Only take up to max_seq_length words for memory)
train_text = train_df['Headline'].tolist()
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

train_text2 = train_df['articleBody'].tolist()
train_text2 = [' '.join(t.split()[0:max_seq_length]) for t in train_text2]
train_text2 = np.array(train_text2, dtype=object)[:, np.newaxis]

train_label = np.asarray(pd.get_dummies(train_df.Stance), dtype = np.int8)

test_text = test_df['Headline'].tolist()
test_text = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]

test_text2 = test_df['articleBody'].tolist()
test_text2 = [' '.join(t.split()[0:max_seq_length]) for t in test_text2]
test_text2 = np.array(test_text2, dtype=object)[:, np.newaxis]

test_label = np.asarray(pd.get_dummies(test_df.Stance), dtype = np.int8)

In [0]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples_m(train_text, train_text2, train_label)
test_examples = convert_text_to_examples_m(test_text, test_text2, test_label)

# Convert to features
(train_input_ids3, train_input_masks3, train_segment_ids3, train_labels3
) = convert_examples_to_features_m(tokenizer, train_examples, max_seq_length=max_seq_length)
(test_input_ids3, test_input_masks3, test_segment_ids3, test_labels3
) = convert_examples_to_features_m(tokenizer, test_examples, max_seq_length=max_seq_length)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


## Model

A very simple FFNN model that let us obtain three text classifiers, trained on a small subset of the whole dataset, for a single epoch (highly improvable).

The use of GPU is highly suggested.

In [0]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(self, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        trainable_vars = self.bert.variables
        
        # Remove unused layers
        trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        
        # Select how many layers to fine tune
        trainable_vars = trainable_vars[-self.n_fine_tune_layers :]
        
        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        
        # Add non-trainable weights
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [0]:
# Build a very simple FFNN model
def build_model(max_seq_length): 
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)
    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    pred = tf.keras.layers.Dense(2, activation='softmax')(dense)
    

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

In [0]:
modelWorthy = build_model(max_seq_length)

initialize_vars(sess)

modelWorthy.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          178565115   input_ids[0][0]                  
                                                                 input_masks[0][0]            

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




<tensorflow.python.keras.callbacks.History at 0x7f40350710f0>

In [0]:
tf.keras.models.save_model(modelWorthy,'worthy.h5')

In [0]:
del modelWorthy
sess.close()
K.clear_session()
sess = tf.Session()

In [0]:
modelEvidences = build_model(max_seq_length)

initialize_vars(sess)

modelEvidences.fit(
    [train_input_ids2, train_input_masks2, train_segment_ids2], 
    train_labels2,
    validation_data=([test_input_ids2, test_input_masks2, test_segment_ids2], test_labels2),
    epochs=1,
    batch_size=32
)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          178565115   input_ids[0][0]                  
                                                                 input_masks[0][0]            

<tensorflow.python.keras.callbacks.History at 0x7f403193ffd0>

In [0]:
tf.keras.models.save_model(modelEvidences,'evidences.h5')

In [0]:
del modelEvidences
sess.close()
K.clear_session()
sess = tf.Session()

In [0]:
modelRelevance = build_model(max_seq_length)

initialize_vars(sess)

modelRelevance.fit(
    [train_input_ids3, train_input_masks3, train_segment_ids3], 
    train_labels3,
    validation_data=([test_input_ids3, test_input_masks3, test_segment_ids3], test_labels3),
    epochs=1,
    batch_size=32
)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          178565115   input_ids[0][0]                  
                                                                 input_masks[0][0]            

<tensorflow.python.keras.callbacks.History at 0x7f4027161b70>

In [0]:
tf.keras.models.save_model(modelRelevance,'relevance.h5')

In [0]:
del modelRelevance
sess.close()
K.clear_session()
sess = tf.Session()

## Testing

The three models are used together to evaluate the evidences collected from google.

In [0]:
# Tokenizer is initialized in case that the previous code was not run
tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# The three different models are loaded
modelWorthy = tf.keras.models.load_model('worthy.h5',custom_objects={'BertLayer': BertLayer})
modelRelevance = tf.keras.models.load_model('relevance.h5',custom_objects={'BertLayer': BertLayer})
modelEvidences = tf.keras.models.load_model('evidences.h5',custom_objects={'BertLayer': BertLayer})

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# A couple of paragraphs of a speech from Donald Trump
speech = "Madam President, Mr. Secretary-General, world leaders, ambassadors, and distinguished delegates: One year ago, I stood before you for the first time in this grand hall. I addressed the threats facing our world, and I presented a vision to achieve a brighter future for all of humanity. Today, I stand before the United Nations General Assembly to share the extraordinary progress we’ve made. In less than two years, my administration has accomplished more than almost any administration in the history of our country. Didn’t expect that reaction, but that’s okay. (Laughter and applause.) America’s economy is booming like never before. Since my election, we’ve added $10 trillion in wealth. The stock market is at an all-time high in history, and jobless claims are at a 50-year low. African American, Hispanic American, and Asian American unemployment have all achieved their lowest levels ever recorded. We’ve added more than 4 million new jobs, including half a million manufacturing jobs. We have passed the biggest tax cuts and reforms in American history. We’ve started the construction of a major border wall, and we have greatly strengthened border security. We have secured record funding for our military — $700 billion this year, and $716 billion next year. Our military will soon be more powerful than it has ever been before. In other words, the United States is stronger, safer, and a richer country than it was when I assumed office less than two years ago. We are standing up for America and for the American people. And we are also standing up for the world. This is great news for our citizens and for peace-loving people everywhere. We believe that when nations respect the rights of their neighbors, and defend the interests of their people, they can better work together to secure the blessings of safety, prosperity, and peace. Each of us here today is the emissary of a distinct culture, a rich history, and a people bound together by ties of memory, tradition, and the values that make our homelands like nowhere else on Earth. That is why America will always choose independence and cooperation over global governance, control, and domination. I honor the right of every nation in this room to pursue its own customs, beliefs, and traditions. The United States will not tell you how to live or work or worship. We only ask that you honor our sovereignty in return."

In [0]:
# Tokenization into sentences
sentences = sent_tokenize(speech)

In [0]:
# Each sentence is evaluated for check-worthiness and ranked
ranked = {}

for sentence in sentences:
  test_examples = convert_text_to_examples([sentence], [0])
  (test_input_ids, test_input_masks, test_segment_ids, test_labels) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

  pred = modelWorthy.predict([test_input_ids, test_input_masks, test_segment_ids])
  
  ranked[sentence]= pred[0][1]
  
import operator
sortedRank = sorted(ranked.items(), key=operator.itemgetter(1),reverse=True)

In [0]:
# Each claim is used as a query for google, importance is obtained based on relevance of results, supportive/confuting stance of evidences is evaluated. 
# 3 articles are evaluated for each claim.
proofsNumber = 3

# Only the top 15% of claims are considered
for sentence in sortedRank[:(int(0.15*len(sortedRank)))]:
  sentence = sentence[0]
  print("\n--> Sentence:",sentence)
  proofsFound = 0
  query = sentence
  query = re.sub('"', ' ', query)
  
  for link in search(query, tld="co.uk", num=50, start=0, pause=2): 
    pdf=0
    print("Analyzed Link: ",link)
      
    try:
      response = requests.get(link, timeout=5)
    except:
      continue
      
    content_type = response.headers.get('content-type')

    try:
      if 'application/pdf' in content_type:
        continue
        pdf=1
    except:
      continue
    
    if(pdf==1):
      try:
        f = io.BytesIO(response.content)
        reader = PdfFileReader(f)
        summary = reader.getPage(0).extractText().split('\n')
        summary2 = reader.getPage(1).extractText().split('\n')
        summary3 = reader.getPage(2).extractText().split('\n')
        summary = str(summary)
        summary2 = str(summary2)
        summary3 = str(summary3)
        summary = summary + summary2+ summary3
      except:
        continue
    else:
      try:
        response = requests.get(link, timeout=10)
      except:
        continue
      try:
        doc = Document(response.text)
      except:
        continue
      try:
        summary = doc.summary()
      except:
        continue

    body = re.sub('<.*?>', ' ', summary)
    body = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', body, flags=re.MULTILINE)
    body = re.sub('&#13;', ' ', body)
    body = re.sub('[^A-Za-z0-9.,!? ]+', ' ', body)
    body = re.sub(' +', ' ', body)

    print("Article Content is:",body[:100],"...")
    if(len(body)<250):
      continue
    
    if(body=='' or body==' ' or body.startswith(" 403") or body.startswith(" Forbidden") or body.startswith(" 404")):
      continue
    
    test_examples = convert_text_to_examples_m([sentence],[body], [0])
    (test_input_ids, test_input_masks, test_segment_ids, test_labels) = convert_examples_to_features_m(tokenizer, test_examples, max_seq_length=max_seq_length)

    relevance = modelRelevance.predict([test_input_ids, test_input_masks, test_segment_ids])
    
    stance = modelEvidences.predict([test_input_ids, test_input_masks, test_segment_ids])
   
    
    if(relevance[0][1]<0.5):
      print("--> Document Not Relevant")
      continue
    else:
      print("--> Document Is Relevant")
      
    print("--> Importance:",relevance[0][1])
    
    print("Likelihood to be supporting:",stance[0][1])
    
    if(stance[0][1]>0.5):
      print("--> Document is Supporting")
    else:
      print("--> Document is not Supporting")

    proofsFound += 1
    
    if(proofsFound>=proofsNumber):
      break


--> Sentence: The United States will not tell you how to live or work or worship.
Analyzed Link:  https://www.youtube.com/watch?v=esUMI1wnFhU
Article Content is:  Rating is available when the video has been rented. This feature is not available right now. Please ...
Analyzed Link:  https://www.cbsnews.com/live-news/donald-trump-un-speech-united-nations-general-assembly-address-today-live-updates-2018-09-25/
Article Content is:  Trump touts America first policies at U.N. General Assembly President Trump excoriated the ideology ...
--> Document Is Relevant
--> Importance: 0.7908329
Likelihood to be supporting: 0.47012958
--> Document is not Supporting
Analyzed Link:  https://www.whitehouse.gov/briefings-statements/remarks-president-trump-73rd-session-united-nations-general-assembly-new-york-ny/
Article Content is:  United Nations Headquarters New York, New York 10 38 A.M. EDT THE PRESIDENT Madam President, Mr. Se ...
--> Document Is Relevant
--> Importance: 0.9274754
Likelihood to be su