<h1>Import Packages</h1>

In [432]:
# Import Packages
from __future__ import division, print_function, unicode_literals
import argparse
import h5py
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tqdm
import time
import operator
import os

<h1>Import Loss Functions and Models</h1>
<br><br>
Here we import loss functions and model functions from 'loss.py' and 'network.py' implemented by Yang et. al., <i>Investigating Capsule Networks with Dynamic Routing for Text Classification </i> (2018). Additionally, we implemented two more model functions, namely <i>short_text_capsule_model()</i> and <i>long_text_capsule_model()</i>, based on capsule_model_B, as per the models discussed in Goldani et. al., <i>Detecting Fake News with Capsule Neural Networks</i> (2020)

In [433]:
# Import loss functions and models
from loss import spread_loss, cross_entropy, margin_loss
from network import baseline_model_kimcnn, baseline_model_cnn, capsule_model_A, capsule_model_B, short_text_capsule_model, long_text_capsule_model

<h1>Load and Preprocess LIAR Dataset</h1>
<br><br>
We now load and preprocess the LIAR dataset to prepare it for GLoVe. Reference for code: <a href = "https://github.com/KunojiLym/metis_project_4">link</a> <a href = "https://www.kaggle.com/code/hendrixwilsonj/liar-data-analysis/notebook">link</a>

In [434]:
# read the LIAR dataset
def read_dataframe(tsv_file):
    
    # creates a "dataframe" or "df" for short. This is similar to a 2-D python dict.
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = [
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]
    
    return df

liar_credit_hist_headers = ['speaker_bt', 'speaker_f', 'speaker_ht', 'speaker_mt', 'speaker_pof']
liar_speaker_headers = ['speaker', 'speaker_job', 'speaker_us_state', 'speaker_affiliation'] + liar_credit_hist_headers
liar_column_headers = ['id', 'label', 'statement', 'subjects']  + liar_speaker_headers + ['context']

# read the pre-prepared training, validation and test sets

# liar_train = pd.read_csv("./data/liar_dataset/train.tsv", sep='\t', names=liar_column_headers, index_col='id')
# liar_valid = pd.read_csv("./data/liar_dataset/valid.tsv", sep='\t', names=liar_column_headers, index_col='id')
# liar_test = pd.read_csv("./data/liar_dataset/test.tsv", sep='\t', names=liar_column_headers, index_col='id')

liar_train = read_dataframe("data/liar_dataset/train.tsv")
liar_valid = read_dataframe("data/liar_dataset/valid.tsv")
liar_test = read_dataframe("data/liar_dataset/test.tsv")

# we will use the validation set for model selection; the test set is to be left for judging the final model

liar_train

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece
5,2342.json,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3,1,1,3,1,a press release.
6,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70,71,160,163,9,"a Democratic debate in Philadelphia, Pa."
7,5602.json,half-true,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,organization,0,0,1,0,1,a website
8,9741.json,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0,0,0,1,0,an online video
9,7115.json,mostly-true,"For the first time in history, the share of th...",elections,robert-menendez,U.S. Senator,New Jersey,democrat,1,3,1,3,0,a speech


In [435]:
liar_train['statement'].iloc[1280]

'Says Hillary Clintons campaign hasnt been clear about when she wiped herserverof her work emails.'

In [436]:
liar_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10239 entries, 0 to 10238
Data columns (total 14 columns):
id                   10239 non-null object
label                10239 non-null object
statement            10239 non-null object
subjects             10239 non-null object
speaker              10239 non-null object
speaker_job_title    10239 non-null object
state_info           10239 non-null object
party_affiliation    10239 non-null object
count_1              10239 non-null object
count_2              10239 non-null object
count_3              10239 non-null object
count_4              10239 non-null object
count_5              10239 non-null object
context              10239 non-null object
dtypes: object(14)
memory usage: 1.1+ MB


In [437]:
liar_label_order = ['true', 'mostly-true', 'half-true', 'barely-true', 'false', 'pants-fire']

In [438]:
liar_train['count_5'].isna() == True

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
10209    False
10210    False
10211    False
10212    False
10213    False
10214    False
10215    False
10216    False
10217    False
10218    False
10219    False
10220    False
10221    False
10222    False
10223    False
10224    False
10225    False
10226    False
10227    False
10228    False
10229    False
10230    False
10231    False
10232    False
10233    False
10234    False
10235    False
10236    False
10237    False
10238    False
Name: count_5, Length: 10239, dtype: bool

In [439]:
# Replace na with empty strings
liar_train['state_info'].fillna('', inplace=True)
liar_valid['state_info'].fillna('', inplace=True)
liar_test['state_info'].fillna('', inplace=True)

liar_train['speaker'].fillna('', inplace=True)
liar_valid['speaker'].fillna('', inplace=True)
liar_test['speaker'].fillna('', inplace=True)

liar_train['subjects'].fillna('', inplace=True)
liar_valid['subjects'].fillna('', inplace=True)
liar_test['subjects'].fillna('', inplace=True)

liar_train['party_affiliation'].fillna('', inplace=True)
liar_valid['party_affiliation'].fillna('', inplace=True)
liar_test['party_affiliation'].fillna('', inplace=True)

liar_train['speaker_job_title'].fillna('', inplace=True)
liar_valid['speaker_job_title'].fillna('', inplace=True)
liar_test['speaker_job_title'].fillna('', inplace=True)

liar_train['context'].fillna('', inplace=True)
liar_valid['context'].fillna('', inplace=True)
liar_test['context'].fillna('', inplace=True)

In [440]:
liar_train['statement'].head()

0    When did the decline of coal start? It started...
1    Hillary Clinton agrees with John McCain "by vo...
2    Health care reform legislation is likely to ma...
3    The economic turnaround started at the end of ...
4    The Chicago Bears have had more starting quart...
Name: statement, dtype: object

In [441]:
liar_train = liar_train.reset_index(drop = True)
liar_train.head()

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece


In [442]:
liar_train['statement'].str.split().str.len().max()

467

In [443]:
liar_train['statement'].str.split().str.len().idxmax()

1279

In [444]:
liar_train['statement'].iloc[1279]

'Hospitals, doctors, MRIs, surgeries and so forth are more extensively used and far more expensive in this country than they are in many other countries.\'\'\thealth-care\tmitt-romney\tFormer governor\tMassachusetts\trepublican\t34\t32\t58\t33\t19\ta Fox News Sunday interview\n9874.json\tbarely-true\tObamacare cuts seniors Medicare.\thealth-care,medicare\ted-gillespie\tRepublican strategist\tWashington, D.C.\trepublican\t2\t3\t2\t2\t1\ta campaign email.\n3072.json\tmostly-true\tThe refusal of many federal employees to fly coach costs taxpayers $146 million annually.\tgovernment-efficiency,transparency\tnewsmax\tMagazine and website\tFlorida\tnone\t0\t0\t0\t1\t0\tan e-mail solicitation\n2436.json\tmostly-true\tFlorida spends more than $300 million a year just on children repeating pre-K through 3rd grade.\teducation\talex-sink\t\tFlorida\tdemocrat\t1\t2\t2\t4\t0\tfigures cites on campaign website\n9721.json\ttrue\tMilwaukee County Sheriff David A. Clarke Jr. advised citizens to point th

In [445]:
liar_train = liar_train.drop(index = 1279, axis = 1)
liar_train = liar_train.reset_index(drop = True)

In [446]:
len(liar_train['statement'])

10238

In [447]:
liar_train['statement'].iloc[1279]

'Says Hillary Clintons campaign hasnt been clear about when she wiped herserverof her work emails.'

In [448]:
liar_train['statement'].str.split().str.len().max()

309

In [449]:
liar_train['statement'].str.split().str.len().idxmax()

7548

In [450]:
liar_train['statement'].iloc[7548]

'The vast majority of the money I got was from small donors all across the country.\'\'\tcampaign-finance\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\tan interview with CNBC\'s John Harwood \n5802.json\ttrue\tThe Democrat-controlled Senate, it hasnt passed a budget in more than 1,000 days.\tdeficit,federal-budget,job-accomplishments\tjohn-boehner\tSpeaker of the House of Representatives\tOhio\trepublican\t13\t22\t11\t4\t2\ta video address\n6153.json\thalf-true\tThe median income in America has dropped by 10 percent in the last four years.\teconomy,income,jobs\tmitt-romney\tFormer governor\tMassachusetts\trepublican\t34\t32\t58\t33\t19\ta speech\n6515.json\tbarely-true\tSays U.S. Senate hopeful Tammy Baldwin voted for a $1 trillion stimulus bill that included a wasteful $800,000 to replace light bulbs.\tenergy,federal-budget,stimulus\tcrossroads-gps\tConservative advocacy group\t\trepublican\t9\t1\t4\t1\t2\ta television ad\n385.json\ttrue\tSays Clinton did not read

In [451]:
liar_train = liar_train.drop(index = 7548, axis = 1)
liar_train = liar_train.reset_index(drop = True)

In [452]:
liar_train['statement'].str.split().str.len().max()

235

In [453]:
liar_train['statement'].str.split().str.len().idxmax()

6117

In [454]:
liar_train['statement'].iloc[6117]

'Georgia has the most restrictive ballot access laws in the country.\telections\tmary-n\t\t\tindependent\t0\t0\t0\t0\t0\ta television interview\n3666.json\tfalse\tOn whether the federal debt limit should be raised\tfederal-budget\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\tan interview with George Stephanopoulos\n4608.json\thalf-true\tWhen George W. Bush was governor of Texas, the percentage uninsured went down. Under his successor, Rick Perry, its gone up.\tchildren,health-care,new-hampshire-2012\tmitt-romney\tFormer governor\tMassachusetts\trepublican\t34\t32\t58\t33\t19\ta Republicam debate in Hanover, N.H.\n11023.json\tfalse\tSays his book,The Art of the Deal,is the No. 1 selling business book of all time.\tcandidates-biography,job-accomplishments\tdonald-trump\tPresident-Elect\tNew York\trepublican\t63\t114\t51\t37\t61\tremarks on CNN.\n2096.json\thalf-true\tDemocrats in the 1930s wanted President Franklin Roosevelt to be more conservative.\thistory,pundits,

In [455]:
liar_train = liar_train.drop(index = 6117, axis = 1)
liar_train = liar_train.reset_index(drop = True)

In [456]:
liar_train['statement'].str.split().str.len().max()

88

In [457]:
liar_train['statement'].str.split().str.len().idxmax()

2140

In [458]:
liar_train['statement'].iloc[2140]

'The fact is that although we have had a president who is opposed to abortion over the last eight years, abortions have not gone down.\'\'\tabortion\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\ta TV interview with megachurch pastor Rick Warren in Lake Forest, Calif.\n2724.json\ttrue\tMost of the jobs that we lost were lost before the economic policies we put in place had any effect.\teconomy,job-accomplishments,jobs,stimulus\tbarack-obama\tPresident\tIllinois\tdemocrat\t70\t71\t160\t163\t9\tan interview on The Daily Show with Jon Stewart"'

In [459]:
liar_train = liar_train.drop(index = 2140, axis = 1)
liar_train = liar_train.reset_index(drop = True)

In [460]:
liar_train['statement'].str.split().str.len().max()

66

In [461]:
liar_train['statement'].str.split().str.len().idxmax()

4190

In [465]:
liar_train['statement'].iloc[4190]

"Let's pay attention to kids who are not going to college, which ends up being about 60 percent of the kids... and get them trained for the jobs that are there. Because, you know, there are auto mechanic jobs paying $50,000, $60,000 that they can't get filled. There are airline mechanic jobs paying a lot of money that can't get filled. Remarks at AFSCME forum 6/19/2007"

In [468]:
liar_train['context'].iloc[4190] = liar_train['statement'].iloc[4190][-33:]
liar_train['context'].iloc[4190]

'Remarks at AFSCME forum 6/19/2007'

In [470]:
liar_train['statement'].iloc[4190] = liar_train['statement'].iloc[4190][0:-34]
print(liar_train['statement'].iloc[4190])

Let's pay attention to kids who are not going to college, which ends up being about 60 percent of the kids... and get them trained for the jobs that are there. Because, you know, there are auto mechanic jobs paying $50,000, $60,000 that they can't get filled. There are airline mechanic jobs paying a lot of money that can't get filled.


<h2>Preprocess for GloVe</h2>
<br><br>
Reference: <a href = "https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-for-glove-part1-eda/notebook">link</a>

In [473]:
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, preprocess_documents

In [474]:
liar_train.head()

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece


In [475]:
train_size = len(liar_train['statement'])
tokenized_train_statement, tokenized_train_subjects, tokenized_train_speaker, tokenized_train_job_title, tokenized_train_state, tokenized_train_party, tokenized_train_context = [], [], [], [], [], [], []
for i in range(0, train_size):
    tokenized_train_statement.append(remove_stopwords(liar_train['statement'].iloc[i]))


# tokenized_train_statement = preprocess_string([remove_stopwords(liar_train['statement'].iloc[i] for i in range(len(liar_train['statement'])))])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447


6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759


8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742


In [107]:
tokenized_train_statement[1:5]

[u'When decline coal start? It started natural gas took started begin (President George W.) Bushs administration.',
 u'Hillary Clinton agrees John McCain "by voting George Bush benefit doubt Iran."',
 u'Health care reform legislation likely mandate free sex change surgeries.',
 u'The economic turnaround started end term.',
 u'The Chicago Bears starting quarterbacks 10 years total number tenured (UW) faculty fired decades.',
 u'Jim Dunnam lived district represents years now.',
 u"I'm person stage worked actively year passing, Russ Feingold, toughest ethics reform Watergate.",
 u'However, took $19.5 million Oregon Lottery funds Port Newport eventually land new NOAA Marine Operations Center-Pacific.',
 u'Says GOP primary opponents Glenn Grothman Joe Leibham cast compromise vote cost $788 million higher electricity costs.',
 u'For time history, share national popular vote margin smaller Latino vote margin.',
 u'Since 2000, nearly 12 million Americans slipped middle class poverty.',
 u'When

<h2>Load GloVe</h2>

In [476]:
from gensim.models import KeyedVectors, Doc2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

In [477]:
glove_file = './data/glove.6B.300d.txt'
tmp_file = './data/glovetmp.txt'

if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [478]:
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += glove_model[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [479]:
type(tokenized_train_statement)

list

In [480]:
# statement_glove_train = liar_train['statement'].apply(lambda s: buildWordVector(s.split(' '), 300)[0])
statement_glove_train = pd.Series(tokenized_train_statement).apply(lambda s: buildWordVector(s.split(' '), 300)[0])

In [64]:
statement_glove_train.size

10240

In [66]:
len(statement_glove_train.iloc[5])

300

<h2>Combine Features for Passing into Neural Network</h2>
<br><br>
We must now combine all the meta-data into the sentence features. (Reference: <a href = "https://www.kaggle.com/code/therealcyberlord/fake-news-detection-using-rnn/notebook">link</a>) Then, we will create batch-wise lookup tables of these sentences, during training, to pass as the input to our capsule networks.

In [195]:
X_train = liar_train
# news_df.drop('title', axis=1, inplace=True)

for header in liar_column_headers:
    if header not in ['id', 'label']:
        X_train['statement'] = X_train['statement'] + str(X_train[header])

In [196]:
drop_cols = [header for header in liar_column_headers if header not in ['id', 'label', 'statement']]

In [197]:
# X_train.drop(columns = ['id', 'subjects', 'speaker', 'speaker_job', 'speaker_us_state', 'speaker_affiliation', 'speaker_bt', 'speaker_f', 'speaker_ht', 'speaker_mt', 'speaker_pof', 'context'])
X_train = X_train.drop(columns = drop_cols)
X_train.head()

Unnamed: 0_level_0,label,statement
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2635.json,false,Says the Annies List political group supports ...
10540.json,half-true,When did the decline of coal start? It started...
324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
1123.json,false,Health care reform legislation is likely to ma...
9028.json,half-true,The economic turnaround started at the end of ...


In [198]:
Y_train = X_train['label']
X_train = X_train.drop('label', axis = 1)
X_train.head()

Unnamed: 0_level_0,statement
id,Unnamed: 1_level_1
2635.json,Says the Annies List political group supports ...
10540.json,When did the decline of coal start? It started...
324.json,"Hillary Clinton agrees with John McCain ""by vo..."
1123.json,Health care reform legislation is likely to ma...
9028.json,The economic turnaround started at the end of ...


In [199]:
Y_train.head()

id
2635.json           false
10540.json      half-true
324.json      mostly-true
1123.json           false
9028.json       half-true
Name: label, dtype: object

In [200]:
labelEnc = LabelEncoder()
labelEnc.fit(Y_train)
labelEnc.classes_

array(['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire',
       'true'], dtype=object)

In [201]:
labels = ['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true']

In [202]:
Y_train = labelEnc.transform(Y_train)

In [203]:
Y_train.shape

(10240,)

In [204]:
Y_train = Y_train.reshape(-1, 1)

In [205]:
oneHotEnc = OneHotEncoder(handle_unknown='ignore', sparse = False)
oneHotEnc.fit(Y_train)

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown=u'ignore', n_values='auto', sparse=False)

In [206]:
oneHotEnc.fit(Y_train)
Y_train = oneHotEnc.transform(Y_train)

In [207]:
Y_train.shape

(10240, 6)

In [208]:
Y_train[0, :]

array([0., 1., 0., 0., 0., 0.])

In [209]:
num_classes = 6

In [213]:
X_train['statement'].str.split().str.len().idxmax()

'1606.json'

In [216]:
X_train['statement'].loc['1606.json']

'Hospitals, doctors, MRIs, surgeries and so forth are more extensively used and far more expensive in this country than they are in many other countries.\'\'\thealth-care\tmitt-romney\tFormer governor\tMassachusetts\trepublican\t34\t32\t58\t33\t19\ta Fox News Sunday interview\n9874.json\tbarely-true\tObamacare cuts seniors Medicare.\thealth-care,medicare\ted-gillespie\tRepublican strategist\tWashington, D.C.\trepublican\t2\t3\t2\t2\t1\ta campaign email.\n3072.json\tmostly-true\tThe refusal of many federal employees to fly coach costs taxpayers $146 million annually.\tgovernment-efficiency,transparency\tnewsmax\tMagazine and website\tFlorida\tnone\t0\t0\t0\t1\t0\tan e-mail solicitation\n2436.json\tmostly-true\tFlorida spends more than $300 million a year just on children repeating pre-K through 3rd grade.\teducation\talex-sink\t\tFlorida\tdemocrat\t1\t2\t2\t4\t0\tfigures cites on campaign website\n9721.json\ttrue\tMilwaukee County Sheriff David A. Clarke Jr. advised citizens to point th

<h2>Convert GLoVe Model into an Embedding Matrix in Tensorflow</h2>
<br><br>
We now convert Glove Model into an embedding matrix in tensorflow. Reference: <a href = "https://stackoverflow.com/questions/53353978/how-to-project-my-word2vec-model-in-tensorflow">link</a>
<br><br>
This embedding matrix will later be used to create our embedding lookup tables, as implemented by Yang et. al.

In [20]:
vec_size = glove_model.vector_size
vocab_size = len(glove_model.vocab)

# Create the embedding matrix where words are indexed alphabetically
embedding_mat = np.zeros(shape=(vocab_size, vec_size), dtype='int32')
for idx, word in enumerate(sorted(glove_model.vocab)):
    embedding_mat[idx] = glove_model.get_vector(word)

# Setup the embedding matrix for tensorflow
# Static embeddings, i.e., non-trainable embeddings for short_text_capsule_model
static_embeddings = tf.Variable(embedding_mat, trainable = False)

# Non-static embeddings, i.e. trainable embeddings for long_text_capsule_model
nonstatic_embeddings = tf.Variable(embedding_mat, trainable = True)

In [173]:
# Create empty lookup tables (Will be filled batch-wise during training)
max_sent = len(liar_train[0])

KeyError: 0

In [None]:
class BatchGenerator(object):
    """Generate and hold batches."""
    def __init__(self, dataset,label, batch_size,input_size, is_shuffle=True):
      self._dataset = dataset
      self._label = label
      self._batch_size = batch_size    
      self._cursor = 0      
      self._input_size = input_size      
      
      if is_shuffle:
          index = np.arange(len(self._dataset))
          np.random.shuffle(index)
          self._dataset = np.array(self._dataset)[index]
          self._label = np.array(self._label)[index]
      else:
          self._dataset = np.array(self._dataset)
          self._label = np.array(self._label)
    def next(self):
      if self._cursor + self._batch_size > len(self._dataset):
          self._cursor = 0
      """Generate a single batch from the current cursor position in the data."""      
      batch_x = self._dataset[self._cursor : self._cursor + self._batch_size,:]
      batch_y = self._label[self._cursor : self._cursor + self._batch_size]
      self._cursor += self._batch_size
      return batch_x, batch_y