<h1>Import Packages</h1>

In [101]:
# Import Packages
from __future__ import division, print_function, unicode_literals
import argparse
import h5py
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
import os

<h1>Import Loss Functions and Models</h1>
<br><br>
Here we import loss functions and model functions from 'loss.py' and 'network.py' implemented by Yang et. al., <i>Investigating Capsule Networks with Dynamic Routing for Text Classification </i> (2018). Additionally, we implemented two more model functions, namely <i>short_text_capsule_model()</i> and <i>long_text_capsule_model()</i>, based on capsule_model_B, as per the models discussed in Goldani et. al., <i>Detecting Fake News with Capsule Neural Networks</i> (2020)

In [3]:
# Import loss functions and models
from loss import spread_loss, cross_entropy, margin_loss
from network import baseline_model_kimcnn, baseline_model_cnn, capsule_model_A, capsule_model_B, short_text_capsule_model, long_text_capsule_model

Using TensorFlow backend.


<h1>Load and Preprocess LIAR Dataset</h1>
<br><br>
We now load and preprocess the LIAR dataset to prepare it for GLoVe. Reference for code: <a href = "https://github.com/KunojiLym/metis_project_4">link</a> 

In [64]:
# read the LIAR dataset

liar_credit_hist_headers = ['speaker_bt', 'speaker_f', 'speaker_ht', 'speaker_mt', 'speaker_pof']
liar_speaker_headers = ['speaker', 'speaker_job', 'speaker_us_state', 'speaker_affiliation'] + liar_credit_hist_headers
liar_column_headers = ['id', 'label', 'statement', 'subjects']  + liar_speaker_headers + ['context']

# read the pre-prepared training, validation and test sets

liar_train = pd.read_csv("./data/liar_dataset/train.tsv", sep='\t', names=liar_column_headers, index_col='id')
liar_valid = pd.read_csv("./data/liar_dataset/valid.tsv", sep='\t', names=liar_column_headers, index_col='id')
liar_test = pd.read_csv("./data/liar_dataset/test.tsv", sep='\t', names=liar_column_headers, index_col='id')

# we will use the validation set for model selection; the test set is to be left for judging the final model

liar_train

Unnamed: 0_level_0,label,statement,subjects,speaker,speaker_job,speaker_us_state,speaker_affiliation,speaker_bt,speaker_f,speaker_ht,speaker_mt,speaker_pof,context
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
2342.json,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3.0,1.0,1.0,3.0,1.0,a press release.
153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
5602.json,half-true,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,organization,0.0,0.0,1.0,0.0,1.0,a website
9741.json,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,0.0,1.0,0.0,an online video


In [6]:
liar_label_order = ['true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire']

In [7]:
# Replace na with empty strings
liar_train['speaker_affiliation'].fillna('', inplace=True)
liar_valid['speaker_affiliation'].fillna('', inplace=True)
liar_test['speaker_affiliation'].fillna('', inplace=True)

liar_train['speaker_job'].fillna('', inplace=True)
liar_valid['speaker_job'].fillna('', inplace=True)
liar_test['speaker_job'].fillna('', inplace=True)

liar_train['context'].fillna('', inplace=True)
liar_valid['context'].fillna('', inplace=True)
liar_test['context'].fillna('', inplace=True)

In [8]:
# Explore data to show where inconsistent entries are present
liar_speakers_full = liar_train[liar_speaker_headers]

liar_speakers = liar_speakers_full.drop_duplicates().sort_values(by='speaker')
#liar_speakers['count_in_set'] = liar_speakers_full.groupby(liar_speaker_headers, as_index=False).size().values

liar_speakers['speaker'].value_counts()

kasim-reed                                  2
danny-tarkanian                             2
robert-puente                               2
mary-olson                                  1
mary-jordan                                 1
todd-tiahrt                                 1
pink-pony                                   1
georgia-state-road-and-tollway-authority    1
donzella-james                              1
james-florio                                1
cory-booker                                 1
christine-gilbert                           1
myra-crownover                              1
jon-kyl                                     1
bill-mccollum                               1
town-hall-audience-member                   1
candy-crowley                               1
ameripac                                    1
j-james-rohack                              1
gayle-smith                                 1
john-thune                                  1
kirk-cox                          

In [9]:
# Kasim Reed
liar_train[liar_train['speaker']=='kasim-reed'][liar_speaker_headers].drop_duplicates()

Unnamed: 0_level_0,speaker,speaker_job,speaker_us_state,speaker_affiliation,speaker_bt,speaker_f,speaker_ht,speaker_mt,speaker_pof
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4659.json,kasim-reed,,,democrat,1.0,0.0,5.0,7.0,1.0
13163.json,kasim-reed,Atlanta Mayor,Georgia,democrat,0.0,0.0,1.0,0.0,0.0


In [10]:
# Danny Tarkanian
liar_train[liar_train['speaker']=='danny-tarkanian'][liar_speaker_headers].drop_duplicates()

Unnamed: 0_level_0,speaker,speaker_job,speaker_us_state,speaker_affiliation,speaker_bt,speaker_f,speaker_ht,speaker_mt,speaker_pof
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1815.json,danny-tarkanian,Businessman,Nevada,republican,0.0,0.0,0.0,1.0,0.0
12280.json,danny-tarkanian,,Nevada,republican,1.0,0.0,1.0,0.0,0.0


In [11]:
# Robert Puente
liar_train[liar_train['speaker']=='robert-puente'][liar_speaker_headers].drop_duplicates()

Unnamed: 0_level_0,speaker,speaker_job,speaker_us_state,speaker_affiliation,speaker_bt,speaker_f,speaker_ht,speaker_mt,speaker_pof
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10887.json,robert-puente,,,none,0.0,0.0,1.0,0.0,0.0
8784.json,robert-puente,"CEO, San Antonio Water System",Texas,democrat,0.0,1.0,0.0,0.0,0.0


In [12]:
# Replace NaNs with actual values
liar_train['speaker_affiliation'].mask(liar_train['speaker_affiliation'] == 'none', inplace=True)

for speaker in ['danny-tarkanian', 'kasim-reed', 'robert-puente']:
    liar_train.loc[liar_train['speaker']==speaker, liar_speaker_headers] = \
        liar_train.loc[liar_train['speaker']==speaker, liar_speaker_headers].fillna(method='bfill')
    liar_train.loc[liar_train['speaker']==speaker, liar_speaker_headers] = \
        liar_train.loc[liar_train['speaker']==speaker, liar_speaker_headers].fillna(method='ffill')

In [36]:
# Replace NaNs with empty strings
for header in liar_column_headers:
    if header not in ['id', 'label', 'statement']:
        liar_train[header] = liar_train[header].fillna('')

liar_train.head()

Unnamed: 0_level_0,label,statement,subjects,speaker,speaker_job,speaker_us_state,speaker_affiliation,speaker_bt,speaker_f,speaker_ht,speaker_mt,speaker_pof,context
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer
10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN


<h2>Combine Features for Passing into Neural Network</h2>
<br><br>
We must now combine all the meta-data into the sentence features. (Reference: <a href = "https://www.kaggle.com/code/therealcyberlord/fake-news-detection-using-rnn/notebook">link</a>) Then, we will create batch-wise lookup tables of these sentences, during training, to pass as the input to our capsule networks.

In [130]:
X_train = liar_train
# news_df.drop('title', axis=1, inplace=True)

for header in liar_column_headers:
    if header not in ['id', 'label']:
        X_train['statement'] = X_train['statement'] + str(X_train[header])

In [131]:
drop_cols = [header for header in liar_column_headers if header not in ['id', 'label', 'statement']]

In [132]:
# X_train.drop(columns = ['id', 'subjects', 'speaker', 'speaker_job', 'speaker_us_state', 'speaker_affiliation', 'speaker_bt', 'speaker_f', 'speaker_ht', 'speaker_mt', 'speaker_pof', 'context'])
X_train = X_train.drop(columns = drop_cols)
X_train.head()

Unnamed: 0_level_0,label,statement
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2635.json,false,Says the Annies List political group supports ...
10540.json,half-true,When did the decline of coal start? It started...
324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
1123.json,false,Health care reform legislation is likely to ma...
9028.json,half-true,The economic turnaround started at the end of ...


In [133]:
Y_train = X_train['label']
X_train = X_train.drop('label', axis = 1)
X_train.head()

Unnamed: 0_level_0,statement
id,Unnamed: 1_level_1
2635.json,Says the Annies List political group supports ...
10540.json,When did the decline of coal start? It started...
324.json,"Hillary Clinton agrees with John McCain ""by vo..."
1123.json,Health care reform legislation is likely to ma...
9028.json,The economic turnaround started at the end of ...


In [134]:
Y_train.head()

id
2635.json           false
10540.json      half-true
324.json      mostly-true
1123.json           false
9028.json       half-true
Name: label, dtype: object

In [135]:
labelEnc = LabelEncoder()
labelEnc.fit(Y_train)
labelEnc.classes_

array(['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire',
       'true'], dtype=object)

In [136]:
labels = ['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true']

In [137]:
Y_train = labelEnc.transform(Y_train)

In [140]:
Y_train.shape

(10240,)

In [138]:
oneHotEnc = OneHotEncoder()
oneHotEnc.fit(Y_train)



OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [143]:
oneHotEnc.categories__

AttributeError: 'OneHotEncoder' object has no attribute 'categories__'

<h2>Load GloVe</h2>

In [14]:
from gensim.models import KeyedVectors, Doc2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

In [17]:
glove_file = './data/glove.6B.300d.txt'
tmp_file = './data/glovetmp.txt'

if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)

<h2>Convert GLoVe Model into an Embedding Matrix in Tensorflow</h2>
<br><br>
We now convert Glove Model into an embedding matrix in tensorflow. Reference: <a href = "https://stackoverflow.com/questions/53353978/how-to-project-my-word2vec-model-in-tensorflow">link</a>
<br><br>
This embedding matrix will later be used to create our embedding lookup tables, as implemented by Yang et. al.

In [20]:
vec_size = glove_model.vector_size
vocab_size = len(glove_model.vocab)

# Create the embedding matrix where words are indexed alphabetically
embedding_mat = np.zeros(shape=(vocab_size, vec_size), dtype='int32')
for idx, word in enumerate(sorted(glove_model.vocab)):
    embedding_mat[idx] = glove_model.get_vector(word)

# Setup the embedding matrix for tensorflow
# Static embeddings, i.e., non-trainable embeddings for short_text_capsule_model
static_embeddings = tf.Variable(embedding_mat, trainable = False)

# Non-static embeddings, i.e. trainable embeddings for long_text_capsule_model
nonstatic_embeddings = tf.Variable(embedding_mat, trainable = True)

In [None]:
# Create empty lookup tables (Will be filled batch-wise during training)
max_sent = len(liar_train[0])

In [None]:
class BatchGenerator(object):
    """Generate and hold batches."""
    def __init__(self, dataset,label, batch_size,input_size, is_shuffle=True):
      self._dataset = dataset
      self._label = label
      self._batch_size = batch_size    
      self._cursor = 0      
      self._input_size = input_size      
      
      if is_shuffle:
          index = np.arange(len(self._dataset))
          np.random.shuffle(index)
          self._dataset = np.array(self._dataset)[index]
          self._label = np.array(self._label)[index]
      else:
          self._dataset = np.array(self._dataset)
          self._label = np.array(self._label)
    def next(self):
      if self._cursor + self._batch_size > len(self._dataset):
          self._cursor = 0
      """Generate a single batch from the current cursor position in the data."""      
      batch_x = self._dataset[self._cursor : self._cursor + self._batch_size,:]
      batch_y = self._label[self._cursor : self._cursor + self._batch_size]
      self._cursor += self._batch_size
      return batch_x, batch_y