# LIAR Fake News Detection

# Setup

## Install Libraries

In [None]:
%pip install -r requirements.txt
!spacy download en_core_web_sm

## Import Libraries

In [2]:
from nltk.corpus import stopwords
import nltk
import numpy as np
import spacy
import pandas as pd
import tensorflow as tf
import os.path
import pickle

nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')
# get the GPU device name
tf.config.list_physical_devices()

2024-11-24 14:40:27.584373: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-24 14:40:27.745262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732439427.827700     740 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732439427.852570     740 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 14:40:28.017720: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load Data

In [3]:
train_data = pd.read_table('Data/train.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])
test_data = pd.read_table('Data/test.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])
valid_data = pd.read_table('Data/valid.tsv', names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

### Data Info

In [4]:
print("Training Data Info:")
print(train_data.info())
print("Testing Data Info:")
print(test_data.info())
print("Validation Data Info:")
print(valid_data.info())
print(train_data.label.unique())
print(train_data.head())

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10240 non-null  object 
 1   label        10240 non-null  object 
 2   statement    10240 non-null  object 
 3   subject      10238 non-null  object 
 4   speaker      10238 non-null  object 
 5   job          7342 non-null   object 
 6   state        8030 non-null   object 
 7   party        10238 non-null  object 
 8   barely-true  10238 non-null  float64
 9   false        10238 non-null  float64
 10  half-true    10238 non-null  float64
 11  mostly-true  10238 non-null  float64
 12  pants-fire   10238 non-null  float64
 13  venue        10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB
None
Testing Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267 entries, 0 to 1266
Data columns (total 14 columns):
 #   Column     

# Preprocessing

## Convert Categorical Data to Numerical

### Output Labels

In [5]:
# based on degree of truthfullness
y_label_dict = {"pants-fire" : 0, "false" : 1, "barely-true" : 2, "half-true" : 3, "mostly-true" : 4, "true" : 5}

train_data['output'] = train_data['label'].apply(lambda i: y_label_dict[i])
valid_data['output'] = valid_data['label'].apply(lambda i: y_label_dict[i])
test_data['output'] = test_data['label'].apply(lambda i: y_label_dict[i])


### Speakers

In [6]:
# take number of top speakers to consider a parameter
no_speaker = 25
# based on the frequency of the label (only consider the top no_speaker speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_speakers = train_data['speaker'].value_counts().reset_index()[:no_speaker].to_dict()['speaker']
frequent_speakers = dict((v, k) for k, v in frequent_speakers.items())
print(frequent_speakers)

def convert_speaker_to_num(speaker):
  # speaker not in the top 20, assign it to the 21st category
  other = no_speaker
  if isinstance(speaker, str):
    if speaker in frequent_speakers:
      return frequent_speakers[speaker]
    else:
      return other
  else:
    return other

train_data['speaker_id'] = train_data['speaker'].apply(convert_speaker_to_num)
valid_data['speaker_id'] = valid_data['speaker'].apply(convert_speaker_to_num)
test_data['speaker_id'] = test_data['speaker'].apply(convert_speaker_to_num)
train_data['speaker_id'].value_counts()

{'barack-obama': 0, 'donald-trump': 1, 'hillary-clinton': 2, 'mitt-romney': 3, 'scott-walker': 4, 'john-mccain': 5, 'rick-perry': 6, 'chain-email': 7, 'marco-rubio': 8, 'rick-scott': 9, 'ted-cruz': 10, 'bernie-s': 11, 'chris-christie': 12, 'facebook-posts': 13, 'charlie-crist': 14, 'newt-gingrich': 15, 'jeb-bush': 16, 'joe-biden': 17, 'blog-posting': 18, 'paul-ryan': 19, 'sarah-palin': 20, 'john-boehner': 21, 'michele-bachmann': 22, 'rick-santorum': 23, 'national-republican-congressional-committee': 24}


speaker_id
25    7295
0      488
1      273
2      239
3      176
4      149
5      148
7      142
6      142
8      117
9      115
10      93
11      88
13      78
12      78
14      70
15      69
17      63
16      63
18      59
19      56
20      52
21      49
24      46
22      46
23      46
Name: count, dtype: int64

### Job Titles

In [7]:
# take number of top jobs to consider a parameter
no_jobs = 25
# based on the frequency of the label (only consider the top no_jobs speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_jobs = train_data['job'].value_counts().reset_index()[:no_jobs].to_dict()['job']
frequent_jobs = dict((v, k) for k, v in frequent_jobs.items())
print(frequent_jobs)

def convert_job_to_num(job):
  # job not in the top jobs, assign it to the last category
  other = no_jobs
  if isinstance(job, str):
    if job in frequent_jobs:
      return frequent_jobs[job]
    else:
      return other
  else:
    return other

train_data['job_id'] = train_data['job'].apply(convert_job_to_num)
valid_data['job_id'] = valid_data['job'].apply(convert_job_to_num)
test_data['job_id'] = test_data['job'].apply(convert_job_to_num)
train_data['job_id'].value_counts()

{'President': 0, 'U.S. Senator': 1, 'Governor': 2, 'President-Elect': 3, 'U.S. senator': 4, 'Presidential candidate': 5, 'Former governor': 6, 'U.S. Representative': 7, 'Milwaukee County Executive': 8, 'Senator': 9, 'State Senator': 10, 'U.S. representative': 11, 'U.S. House of Representatives': 12, 'Attorney': 13, 'Congressman': 14, 'Social media posting': 15, 'Governor of New Jersey': 16, 'Co-host on CNN\'s "Crossfire"': 17, 'State Representative': 18, 'State representative': 19, 'U.S. Congressman': 20, 'Congresswoman': 21, 'Speaker of the House of Representatives': 22, 'State senator': 23, 'state representative': 24}


job_id
25    6348
0      492
1      479
2      391
3      273
4      263
5      254
6      176
7      172
8      149
9      147
10     108
11     103
12     102
13      81
14      80
15      78
16      78
17      73
18      72
19      66
20      63
22      50
21      50
23      48
24      44
Name: count, dtype: int64

### Parties

In [8]:
# take number of top parties to consider a parameter
no_party = 9
# based on the frequency of the label (only consider the top no_party speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_party = train_data['party'].value_counts().reset_index()[:no_party].to_dict()['party']
frequent_party = dict((v, k) for k, v in frequent_party.items())
print(frequent_party)

def convert_party_to_num(party):
  # party not in the top parties, assign it to the last category
  other = no_party
  if isinstance(party, str):
    if party in frequent_party:
      return frequent_party[party]
    else:
      return other
  else:
    return other

train_data['party_id'] = train_data['party'].apply(convert_party_to_num)
valid_data['party_id'] = valid_data['party'].apply(convert_party_to_num)
test_data['party_id'] = test_data['party'].apply(convert_party_to_num)
train_data['party_id'].value_counts()

{'republican': 0, 'democrat': 1, 'none': 2, 'organization': 3, 'independent': 4, 'newsmaker': 5, 'libertarian': 6, 'activist': 7, 'journalist': 8}


party_id
0    4497
1    3336
2    1744
3     219
4     147
9     124
5      56
6      40
7      39
8      38
Name: count, dtype: int64

### States

In [9]:
# take number of top states to consider a parameter
no_state = 30
# based on the frequency of the label (only consider the top no_state speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_state = train_data['state'].value_counts().reset_index()[:no_state].to_dict()['state']
frequent_state = dict((v, k) for k, v in frequent_state.items())
print(frequent_state)

def convert_state_to_num(state):
  # state not in the top states, assign it to the last category
  other = no_state
  if isinstance(state, str):
    if state in frequent_state:
      return frequent_state[state]
    else:
      return other
  else:
    return other

train_data['state_id'] = train_data['state'].apply(convert_state_to_num)
valid_data['state_id'] = valid_data['state'].apply(convert_state_to_num)
test_data['state_id'] = test_data['state'].apply(convert_state_to_num)
train_data['state_id'].value_counts()

{'Texas': 0, 'Florida': 1, 'Wisconsin': 2, 'New York': 3, 'Illinois': 4, 'Ohio': 5, 'Georgia': 6, 'Virginia': 7, 'Rhode Island': 8, 'New Jersey': 9, 'Oregon': 10, 'Massachusetts': 11, 'Arizona': 12, 'California': 13, 'Washington, D.C.': 14, 'Vermont': 15, 'Pennsylvania': 16, 'New Hampshire': 17, 'Arkansas': 18, 'Tennessee': 19, 'Kentucky': 20, 'Maryland': 21, 'Delaware': 22, 'Alaska': 23, 'Minnesota': 24, 'North Carolina': 25, 'Nevada': 26, 'Indiana': 27, 'Missouri': 28, 'New Mexico': 29}


state_id
30    2539
0     1009
1      997
2      713
3      657
4      556
5      447
6      426
7      407
8      369
9      241
10     239
11     206
12     182
13     159
14     120
15      98
16      90
17      86
18      84
19      75
20      74
21      69
22      68
23      65
24      56
25      56
26      48
27      38
28      36
29      30
Name: count, dtype: int64

### Subjects

In [10]:
# take number of top subjects to consider a parameter
no_subject = 30
# based on the frequency of the label (only consider the top no_subject speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_subject = train_data['subject'].value_counts().reset_index()[:no_subject].to_dict()['subject']
frequent_subject = dict((v, k) for k, v in frequent_subject.items())
print(frequent_subject)

def convert_subject_to_num(subject):
  # subject not in the top subjects, assign it to the last category
  other = no_subject
  if isinstance(subject, str):
    if subject in frequent_subject:
      return frequent_subject[subject]
    else:
      return other
  else:
    return other

train_data['subject_id'] = train_data['subject'].apply(convert_subject_to_num)
valid_data['subject_id'] = valid_data['subject'].apply(convert_subject_to_num)
test_data['subject_id'] = test_data['subject'].apply(convert_subject_to_num)
train_data['subject_id'].value_counts()

{'health-care': 0, 'taxes': 1, 'immigration': 2, 'elections': 3, 'education': 4, 'candidates-biography': 5, 'economy': 6, 'guns': 7, 'economy,jobs': 8, 'federal-budget': 9, 'jobs': 10, 'energy': 11, 'abortion': 12, 'foreign-policy': 13, 'state-budget': 14, 'education,state-budget': 15, 'transportation': 16, 'crime': 17, 'ethics': 18, 'iraq': 19, 'campaign-finance': 20, 'terrorism': 21, 'environment': 22, 'history': 23, 'job-accomplishments': 24, 'legal-issues': 25, 'social-security': 26, 'deficit,federal-budget': 27, 'state-budget,taxes': 28, 'energy,environment': 29}


subject_id
30    6910
0      381
1      308
2      253
3      252
4      237
5      190
6      137
7      130
8      125
9      121
10      98
11      94
12      92
13      85
14      75
15      69
16      64
17      59
18      58
19      55
20      53
21      53
22      52
24      45
23      45
25      42
27      40
26      40
28      39
29      38
Name: count, dtype: int64

### Venues

In [11]:
# take number of top venues to consider a parameter
no_venue = 30
# based on the frequency of the label (only consider the top no_venue speakers as relevent, after 20, rest have less than 50 data points, not relevent)
frequent_venue = train_data['venue'].value_counts().reset_index()[:no_venue].to_dict()['venue']
frequent_venue = dict((v, k) for k, v in frequent_venue.items())
print(frequent_venue)

def convert_venue_to_num(venue):
  # venue not in the top venues, assign it to the last category
  other = no_venue
  if isinstance(venue, str):
    if venue in frequent_venue:
      return frequent_venue[venue]
    else:
      return other
  else:
    return other

train_data['venue_id'] = train_data['venue'].apply(convert_venue_to_num)
valid_data['venue_id'] = valid_data['venue'].apply(convert_venue_to_num)
test_data['venue_id'] = test_data['venue'].apply(convert_venue_to_num)
train_data['venue_id'].value_counts()

{'a news release': 0, 'an interview': 1, 'a press release': 2, 'a speech': 3, 'a TV ad': 4, 'a tweet': 5, 'a campaign ad': 6, 'a television ad': 7, 'a radio interview': 8, 'a debate': 9, 'a news conference': 10, 'a Facebook post': 11, 'a campaign commercial': 12, 'a television interview': 13, 'a press conference': 14, 'a speech.': 15, 'a press release.': 16, 'a TV interview': 17, 'a radio ad': 18, 'a chain e-mail': 19, 'an interview on CNN': 20, 'a TV ad.': 21, 'a campaign mailer': 22, 'comments on ABC\'s "This Week"': 23, 'an interview on Fox News': 24, 'an interview.': 25, 'a campaign TV ad': 26, 'a news release.': 27, 'a TV interview.': 28, 'an ad': 29}


venue_id
30    7569
0      241
1      229
2      223
3      214
4      180
5      156
6      132
7      123
8      106
9       92
10      85
11      74
12      73
13      68
14      65
15      57
16      49
17      47
18      45
19      41
21      40
20      40
23      39
22      39
24      38
25      37
26      36
27      35
28      34
29      33
Name: count, dtype: int64

## Tokenizing Content

### Word Frequency Tokenization

In [12]:
vocab_dict = {}
if not os.path.exists('vocab_dict.pkl'):
  tokenizer = tf.keras.preprocessing.text.Tokenizer()
  tokenizer.fit_on_texts(train_data['statement'])
  vocab_dict = tokenizer.word_index
  pickle.dump(vocab_dict, open('vocab_dict.pkl', 'wb'))
else:
  vocab_dict = pickle.load(open('vocab_dict.pkl', 'rb'))

def convert_statement_to_vec(statement):
  stmnt = ''.join(word for word in statement.split() if word not in stopwords.words('english'))
  text = tf.keras.preprocessing.text.text_to_word_sequence(stmnt)
  return [vocab_dict[word] for word in text if word in vocab_dict]

train_data['statement_freq'] = train_data['statement'].apply(convert_statement_to_vec)
valid_data['statement_freq'] = valid_data['statement'].apply(convert_statement_to_vec)
test_data['statement_freq'] = test_data['statement'].apply(convert_statement_to_vec)

### Part of Speech Tagging

In [13]:
"""
pos_tags = {'ADJ': 'adjective', 'ADP': 'adposition', 'ADV': 'adverb',
            'AUX': 'auxiliary verb', 'CONJ': 'coordinating conjunction',
            'DET': 'determiner', 'INTJ': 'interjection', 'NOUN': 'noun',
            'NUM': 'numeral', 'PART': 'particle', 'PRON': 'pronoun',
            'PROPN': 'proper noun', 'PUNCT': 'punctuation', 'X': 'other',
            'SCONJ': 'subord conjunction', 'SYM': 'symbol', 'VERB': 'verb'}
"""
# create a dictionary to convert the pos tags to numbers, arbitrary
pos_dict = {'NOUN' : 0, 'VERB' : 1, 'ADP' : 2, 'PROPN' : 3, 'PUNCT' : 4,
            'DET' : 5, 'ADJ' : 6, 'NUM' : 7, 'ADV' : 8, 'PRON' : 9}
other = len(pos_dict.values())  # fpr all other pos tags

def convert_sentence_to_pos(sentence: str):
  doc = nlp(sentence)
  return [pos_dict.get(token.pos_, other) for token in doc]

train_data['statement_pos'] = train_data['statement'].apply(convert_sentence_to_pos)
valid_data['statement_pos'] = valid_data['statement'].apply(convert_sentence_to_pos)
test_data['statement_pos'] = test_data['statement'].apply(convert_sentence_to_pos)

### Dependency Parsing

In [14]:
"""
all dependencies:
dep_dict = {'ACL' : 0, 'ACOMP' : 1, 'ADVCL' : 2, 'ADVMOD' : 3, 'AGENT' : 4,
            'AMOD' : 5, 'APPOS' : 6, 'ATTR' : 7, 'AUX' : 8, 'AUXPASS' : 9,
            'CASE' : 10, 'CC' : 11, 'CCOMP' : 12, 'COMPOUND' : 13, 'CONJ' : 14,
            'CSUBJ' : 15, 'CSUBJPASS' : 16, 'DATIVE' : 17, 'DEP' : 18,
            'DET' : 19, 'DOBJ' : 20, 'EXPL' : 21, 'INTJ' : 22, 'MARK' : 23,
            'META' : 24, 'NEG' : 25, 'NOUNMOD' : 26, 'NPMOD' : 27, 'NSUBJ' : 28,
            'NSUBJPASS' : 29, 'NUMMOD' : 30, 'OPRD' : 31, 'PARATAXIS' : 32,
            'PCOMP' : 33, 'POBJ' : 34, 'POSS' : 35, 'PRECONJ' : 36, 'PREDET' : 37,
            'PREP' : 38, 'PRT' : 39, 'PUNCT' : 40, 'QUANTMOD' : 41,
            'RELCL' : 42, 'ROOT' : 43, 'XCOMP' : 44}
"""
# create a dictionary to convert the dep tags to numbers, arbitrary
dep_dict = {'punct' : 0, 'prep' : 1, 'pobj' : 2, 'compound' : 3, 'det' : 4,
            'nsubj' : 5, 'ROOT' : 6, 'amod' : 7, 'dobj' : 8, 'aux' : 9}
other = len(dep_dict.values())  # for all other dep tags

def convert_sentence_to_dep(sentence):
  doc = nlp(sentence)
  return [dep_dict.get(token.dep_, other) for token in doc]

train_data['statement_dep'] = train_data['statement'].apply(convert_sentence_to_dep)
valid_data['statement_dep'] = valid_data['statement'].apply(convert_sentence_to_dep)
test_data['statement_dep'] = test_data['statement'].apply(convert_sentence_to_dep)

In [15]:
train_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,...,output,speaker_id,job_id,party_id,state_id,subject_id,venue_id,statement_freq,statement_pos,statement_dep
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,1,25,19,0,0,12,30,[],"[1, 5, 3, 3, 6, 0, 1, 6, 4, 0, 0, 2, 0, 4]","[6, 4, 10, 10, 7, 5, 10, 7, 0, 3, 8, 1, 2, 0]"
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,3,25,25,1,7,30,30,[],"[10, 1, 5, 0, 2, 0, 0, 4, 9, 1, 10, 6, 0, 1, 2...","[10, 6, 4, 5, 1, 3, 2, 0, 5, 6, 10, 7, 5, 10, ..."
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,4,0,0,1,4,13,30,[],"[3, 3, 1, 2, 3, 3, 4, 2, 1, 10, 1, 3, 3, 5, 0,...","[3, 5, 6, 1, 3, 2, 0, 1, 10, 9, 10, 3, 10, 4, ..."
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,...,1,18,25,2,30,0,0,[],"[0, 0, 0, 0, 10, 6, 10, 1, 6, 0, 0, 0, 4]","[3, 3, 3, 5, 6, 10, 9, 10, 7, 3, 3, 8, 0]"
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,3,14,25,1,1,8,20,[],"[5, 6, 0, 1, 2, 5, 0, 2, 9, 0, 4]","[4, 7, 5, 6, 1, 4, 2, 1, 10, 2, 0]"


## Embeddings
We use the pretrained GloVe embeddings to convertwords into embeddings

In [39]:
embed_dim = 100
embeddings = {}
word = ''
try:
  with open('glove.6B.100d.txt', 'r') as f:
    for line in f:
      values = line.split()
      word = values[0].lower()
      embeddings[word] = np.asarray(values[1:], dtype='float32')
except FileNotFoundError:
  print('File glove.6B.100d.txt was not found in this directory')
  print('Get the file from the references provided in README.md')
  raise FileNotFoundError
print(len(embeddings), ": Embeddings loaded")
print(embed_dim, ": Embedding dimension")

num_words = len(vocab_dict) + 1
embed_matrix = np.zeros((num_words, embed_dim))
for word, i in vocab_dict.items():
  embed_vector = embeddings.get(word)
  if embed_vector is not None:
    embed_matrix[i] = embed_vector

pos_embeddings = np.identity(max(pos_dict.values())+1, dtype=int)
dep_embeddings = np.identity(max(dep_dict.values())+1, dtype=int)

400000 : Embeddings loaded
100 : Embedding dimension


## Global Hyperparameters

In [17]:
vocab_length = len(vocab_dict.keys())
lstm_size = 100
num_steps = 15
num_epochs = 30
batch_size = 40

#Hyperparams for CNN
kernel_sizes = [3,3,3]
filter_size = 128

#Meta data related hyper params
num_party = len(train_data.party_id.unique())
num_state = len(train_data.state_id.unique())
num_venue = len(train_data.venue_id.unique())
num_job = len(train_data.job_id.unique())
num_sub = len(train_data.subject_id.unique())
num_speaker = len(train_data.speaker_id.unique())

## Prepare Sentence Info (Padding)

In [18]:
X_train = train_data['statement_freq']
X_val = valid_data['statement_freq']
X_test = test_data['statement_freq']

Y_train = tf.keras.utils.to_categorical(train_data['output'], num_classes=6)
Y_val = tf.keras.utils.to_categorical(valid_data['output'], num_classes=6)
Y_test = list(test_data['output'])

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=num_steps, padding='post', truncating='post')
X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val, maxlen=num_steps, padding='post', truncating='post')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=num_steps, padding='post', truncating='post')

X_train_pos = train_data['statement_pos']
X_val_pos = valid_data['statement_pos']
X_test_pos = test_data['statement_pos']

X_train_pos = tf.keras.preprocessing.sequence.pad_sequences(X_train_pos, maxlen=num_steps, padding='post', truncating='post')
X_val_pos = tf.keras.preprocessing.sequence.pad_sequences(X_val_pos, maxlen=num_steps, padding='post', truncating='post')
X_test_pos = tf.keras.preprocessing.sequence.pad_sequences(X_test_pos, maxlen=num_steps, padding='post', truncating='post')

X_train_dep = train_data['statement_dep']
X_val_dep = valid_data['statement_dep']
X_test_dep = test_data['statement_dep']

X_train_dep = tf.keras.preprocessing.sequence.pad_sequences(X_train_dep, maxlen=num_steps, padding='post', truncating='post')
X_val_dep = tf.keras.preprocessing.sequence.pad_sequences(X_val_dep, maxlen=num_steps, padding='post', truncating='post')
X_test_dep = tf.keras.preprocessing.sequence.pad_sequences(X_test_dep, maxlen=num_steps, padding='post', truncating='post')

## Meta Data Preparation

In [19]:
party_train = tf.keras.utils.to_categorical(train_data['party_id'], num_classes=num_party)
party_val = tf.keras.utils.to_categorical(valid_data['party_id'], num_classes=num_party)
party_test = tf.keras.utils.to_categorical(test_data['party_id'], num_classes=num_party)

state_train = tf.keras.utils.to_categorical(train_data['state_id'], num_classes=num_state)
state_val = tf.keras.utils.to_categorical(valid_data['state_id'], num_classes=num_state)
state_test = tf.keras.utils.to_categorical(test_data['state_id'], num_classes=num_state)

venue_train = tf.keras.utils.to_categorical(train_data['venue_id'], num_classes=num_venue)
venue_val = tf.keras.utils.to_categorical(valid_data['venue_id'], num_classes=num_venue)
venue_test = tf.keras.utils.to_categorical(test_data['venue_id'], num_classes=num_venue)

job_train = tf.keras.utils.to_categorical(train_data['job_id'], num_classes=num_job)
job_val = tf.keras.utils.to_categorical(valid_data['job_id'], num_classes=num_job)
job_test = tf.keras.utils.to_categorical(test_data['job_id'], num_classes=num_job)

subject_train = tf.keras.utils.to_categorical(train_data['subject_id'], num_classes=num_sub)
subject_val = tf.keras.utils.to_categorical(valid_data['subject_id'], num_classes=num_sub)
subject_test = tf.keras.utils.to_categorical(test_data['subject_id'], num_classes=num_sub)

speaker_train = tf.keras.utils.to_categorical(train_data['speaker_id'], num_classes=num_speaker)
speaker_val = tf.keras.utils.to_categorical(valid_data['speaker_id'], num_classes=num_speaker)
speaker_test = tf.keras.utils.to_categorical(test_data['speaker_id'], num_classes=num_speaker)

X_train_meta = np.hstack((party_train, state_train, venue_train, job_train, subject_train, speaker_train))
X_val_meta = np.hstack((party_val, state_val, venue_val, job_val, subject_val, speaker_val))
X_test_meta = np.hstack((party_test, state_test, venue_test, job_test, subject_test, speaker_test))

## Check Matrix Shapes

In [40]:
print(X_train_meta.shape, X_val_meta.shape, X_test_meta.shape)
print(X_train.shape, X_val.shape, X_test.shape)
print(Y_train.shape, Y_val.shape)
print(X_train_pos.shape, X_val_pos.shape, X_test_pos.shape)
print(X_train_dep.shape, X_val_dep.shape, X_test_dep.shape)

(10240, 155) (1284, 155) (1267, 155)
(10240, 15) (1284, 15) (1267, 15)
(10240, 6) (1284, 6)
(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)


# Defining Functions

## Train Function

In [47]:
def train(model: tf.keras.models.Model, model_file_name: str, use_pos = False, use_meta = False, use_dep = False):
  sgd = tf.keras.optimizers.SGD(learning_rate=0.025, clipvalue=0.3, nesterov=True)
  # adam = tf.keras.optimizers.Adam(lr=0.000075, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
  model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
  tb = tf.keras.callbacks.TensorBoard()
  csv_logger = tf.keras.callbacks.CSVLogger('train.log')
  filepath = model_file_name + '_weights.keras'
  checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')

  train_input = [X_train]
  valid_input = [X_val]
  if use_pos:
    train_input.append(X_train_pos)
    valid_input.append(X_val_pos)
  if use_dep:
    train_input.append(X_train_dep)
    valid_input.append(X_val_dep)
  if use_meta:
    train_input.append(X_train_meta)
    valid_input.append(X_val_meta)
  model.fit(train_input, [Y_train], epochs=num_epochs, batch_size=batch_size, validation_data=(valid_input, [Y_val]), callbacks=[tb, csv_logger, checkpoint])

## Test Function

In [61]:
# TODO: Check this function
def test(model_file_name: str, use_pos = False, use_meta = False, use_dep = False):
  model: tf.keras.models.Model = tf.keras.models.load_model(model_file_name + '_weights.keras')
  input = [X_test]
  if use_pos:
    input.append(X_test_pos)
  if use_dep:
    input.append(X_test_dep)
  if use_meta:
    input.append(X_test_meta)
  predictions = model.predict(input, batch_size=batch_size, verbose=1)
  n = len(predictions)

  false_worst = [-1]*n
  true_best = [-1]*n
  for p in range(n):
    if np.argmax(predictions[p]) == 0:
      false_worst[p] = predictions[p][0]
    elif np.argmax(predictions[p]) == 5:
      true_best[p] = predictions[p][5]
  print(n == len(Y_test))
  correct = np.sum(np.argmax(predictions, axis=1) == Y_test)
  print("Correctly predicted: ", correct, "out of", n)
  print("Accuracy: ", correct*100/n)
  pickle.dump(predictions, open(model_file_name + '_predictions.pkl', 'wb'))

  print("Printing the worst false predictions")
  for i, f in enumerate(false_worst):
    if f != -1:
      print(f)
      print(test_data.loc[i])
  print("Printing the best true predictions")
  for i, t in enumerate(true_best):
    if t != -1:
      print(t)
      print(test_data.loc[i])

# Building Models

## Shared Hyperparameters

In [23]:
use_pos = False
use_meta = True
use_dep = True

## CNN

In [48]:
statement_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='main_input')
x_stmt = tf.keras.layers.Embedding(vocab_length+1, embed_dim, weights=[embed_matrix], trainable=False)(statement_input)

pos_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='pos_input')
x_pos = tf.keras.layers.Embedding(max(pos_dict.values())+1, max(pos_dict.values())+1, weights=[pos_embeddings], trainable=False)(pos_input)

dep_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='dep_input')
x_dep = tf.keras.layers.Embedding(max(dep_dict.values())+1, max(dep_dict.values())+1, weights=[dep_embeddings], trainable=False)(dep_input)

meta_input = tf.keras.layers.Input(shape=(X_train_meta.shape[1],), name='aux_input')
x_meta = tf.keras.layers.Dense(64, activation='relu')(meta_input)

kernel_stmt = []
kernel_pos = []
kernel_dep = []
for kernel in kernel_sizes:
  x_1 = tf.keras.layers.Conv1D(filter_size, kernel)(x_stmt)
  x_1 = tf.keras.layers.GlobalMaxPooling1D()(x_1)
  kernel_stmt.append(x_1)

  x_2 = tf.keras.layers.Conv1D(filter_size, kernel)(x_pos)
  x_2 = tf.keras.layers.GlobalMaxPooling1D()(x_2)
  kernel_pos.append(x_2)

  x_3 = tf.keras.layers.Conv1D(filter_size, kernel)(x_dep)
  x_3 = tf.keras.layers.GlobalMaxPooling1D()(x_3)
  kernel_dep.append(x_3)

conv_in1 = tf.keras.layers.concatenate(kernel_stmt)
conv_in1 = tf.keras.layers.Dropout(0.6)(conv_in1)
conv_in1 = tf.keras.layers.Dense(128, activation='relu')(conv_in1)

conv_in2 = tf.keras.layers.concatenate(kernel_pos)
conv_in2 = tf.keras.layers.Dropout(0.6)(conv_in2)
conv_in2 = tf.keras.layers.Dense(128, activation='relu')(conv_in2)

conv_in3 = tf.keras.layers.concatenate(kernel_dep)
conv_in3 = tf.keras.layers.Dropout(0.6)(conv_in3)
conv_in3 = tf.keras.layers.Dense(128, activation='relu')(conv_in3)

lays = [conv_in1]
if use_pos:
  lays.append(conv_in2)
if use_dep:
  lays.append(conv_in3)
if use_meta:
  lays.append(x_meta)
x = tf.keras.layers.concatenate(lays)

main_output = tf.keras.layers.Dense(6, activation='softmax', name='main_output')(x)
inputs = [statement_input]
if use_pos:
  inputs.append(pos_input)
if use_dep:
  inputs.append(dep_input)
if use_meta:
  inputs.append(meta_input)
model_cnn = tf.keras.models.Model(inputs=inputs, outputs=[main_output])
print(model_cnn.summary())

None


## LSTM

In [45]:
model_lstm = tf.keras.models.Sequential()
hidden_size = embed_dim
model_lstm.add(tf.keras.layers.Embedding(vocab_length+1, hidden_size))
model_lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size)))
model_lstm.add(tf.keras.layers.Dense(6, activation='softmax'))

statement_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='main_input')
x_stmt = tf.keras.layers.Embedding(vocab_length+1, embed_dim, weights=[embed_matrix], trainable=False)(statement_input)
lstm_in = tf.keras.layers.LSTM(lstm_size, dropout=0.2)(x_stmt)

pos_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='pos_input')
x_pos = tf.keras.layers.Embedding(max(pos_dict.values())+1, max(pos_dict.values())+1, weights=[pos_embeddings], trainable=False)(pos_input)
lstm_in2 = tf.keras.layers.LSTM(lstm_size, dropout=0.2)(x_pos)

dep_input = tf.keras.layers.Input(shape=(num_steps,), dtype='int32', name='dep_input')
x_dep = tf.keras.layers.Embedding(max(dep_dict.values())+1, max(dep_dict.values())+1, weights=[dep_embeddings], trainable=False)(dep_input)
lstm_in3 = tf.keras.layers.LSTM(lstm_size, dropout=0.2)(x_dep)

meta_input = tf.keras.layers.Input(shape=(X_train_meta.shape[1],), name='aux_input')
x_meta = tf.keras.layers.Dense(64, activation='relu')(meta_input)

lays = [lstm_in]
if use_pos:
  lays.append(lstm_in2)
if use_dep:
  lays.append(lstm_in3)
if use_meta:
  lays.append(x_meta)
x = tf.keras.layers.concatenate(lays)

main_output = tf.keras.layers.Dense(6, activation='softmax', name='main_output')(x)
inputs = [statement_input]
if use_pos:
  inputs.append(pos_input)
if use_dep:
  inputs.append(dep_input)
if use_meta:
  inputs.append(meta_input)
model_lstm = tf.keras.models.Model(inputs=inputs, outputs=[main_output])
print(model_lstm.summary())

None


# Training Models

## CNN

In [62]:
train(model_cnn, 'cnn', use_pos, use_meta, use_dep)

Epoch 1/30
[1m255/256[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - categorical_accuracy: 0.3139 - loss: 1.6275
Epoch 1: val_categorical_accuracy improved from -inf to 0.26402, saving model to cnn_weights.keras
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - categorical_accuracy: 0.3138 - loss: 1.6275 - val_categorical_accuracy: 0.2640 - val_loss: 1.6899
Epoch 2/30
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - categorical_accuracy: 0.3098 - loss: 1.6377
Epoch 2: val_categorical_accuracy improved from 0.26402 to 0.26480, saving model to cnn_weights.keras
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - categorical_accuracy: 0.3098 - loss: 1.6376 - val_categorical_accuracy: 0.2648 - val_loss: 1.6890
Epoch 3/30
[1m255/256[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.3134 - loss: 1.6260
Epoch 3: val_categorical_accuracy did not improv

## LSTM

In [63]:
train(model_lstm,'lstm', use_pos, use_meta, use_dep)

Epoch 1/30
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - categorical_accuracy: 0.1994 - loss: 1.7718
Epoch 1: val_categorical_accuracy improved from -inf to 0.21262, saving model to lstm_weights.keras
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 42ms/step - categorical_accuracy: 0.1994 - loss: 1.7718 - val_categorical_accuracy: 0.2126 - val_loss: 1.7539
Epoch 2/30
[1m255/256[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 42ms/step - categorical_accuracy: 0.2216 - loss: 1.7457
Epoch 2: val_categorical_accuracy improved from 0.21262 to 0.23053, saving model to lstm_weights.keras
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 45ms/step - categorical_accuracy: 0.2216 - loss: 1.7457 - val_categorical_accuracy: 0.2305 - val_loss: 1.7440
Epoch 3/30
[1m234/256[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 41ms/step - categorical_accuracy: 0.2334 - loss: 1.7402
Epoch 3: val_categorical_accuracy did not im

# Testing Models

## CNN

In [64]:
test('cnn', use_pos, use_meta, use_dep)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
True
Correctly predicted:  313 out of 1267
Accuracy:  24.704025256511443
Printing the worst false predictions
0.28164738
id                                                       11200.json
label                                                         false
statement         Says 57 percent of federal spending goes to th...
subject                             federal-budget,military,poverty
speaker                                              facebook-posts
job                                            Social media posting
state                                                           NaN
party                                                          none
barely-true                                                      14
false                                                            18
half-true                                                        15
mostly-true                                              

## LSTM

In [65]:
test('lstm', use_pos, use_meta, use_dep)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
True
Correctly predicted:  301 out of 1267
Accuracy:  23.756906077348066
Printing the worst false predictions
0.21284907
id                                                       11200.json
label                                                         false
statement         Says 57 percent of federal spending goes to th...
subject                             federal-budget,military,poverty
speaker                                              facebook-posts
job                                            Social media posting
state                                                           NaN
party                                                          none
barely-true                                                      14
false                                                            18
half-true                                                        15
mostly-true                                              