In [1]:
import csv
import unicodecsv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [3]:
vocab_size = 5000 # make the top list of words (common words)
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # OOV = Out of Vocabulary

In [4]:
articles = []
val_articles = []
labels = []
val_labels = []

with open("train.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        labels.append(row[1])
        if(len(row)>13):
            article = row[2]+row[3]+row[7]+row[13]
        else:
            article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)
        
train_articles = articles[0: 10239]
train_labels = labels[0: 10239]
        
with open("valid.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        val_labels.append(row[1])
        labels.append(row[1])
        if(len(row)>3):
            val_article = row[2]+row[3]
        else:
            val_article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            val_article = val_article.replace(token, ' ')
            val_article = val_article.replace(' ', ' ')
        val_articles.append(val_article)
        
validation_articles = val_articles[0: 1283]
validation_labels = val_labels[0: 1283]

In [5]:
print(len(train_articles))
print(len(train_labels))
print(len(validation_articles))
print(len(validation_labels))
print(len(labels))

10239
10239
1283
1283
11522


In [6]:
print(len(train_articles))
print(len(train_labels))
print(len(validation_articles))
print(len(validation_labels))

10239
10239
1283
1283


In [7]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [8]:
train_sequences = tokenizer.texts_to_sequences(train_articles)

In [9]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [12]:
print(set(labels))

{'true', 'pantsFire', 'false', 'barelyTrue', 'halfTrue', 'mostlyTrue'}


In [13]:
label_tokenizer.word_index

{'halftrue': 1,
 'false': 2,
 'mostlytrue': 3,
 'barelytrue': 4,
 'true': 5,
 'pantsfire': 6}

In [14]:
print(training_label_seq.shape)
print('-------------')
print(validation_label_seq.shape)

(10239, 1)
-------------
(1283, 1)


In [118]:
print(labels)

['halfTrue', 'mostlyTrue', 'false', 'halfTrue', 'true', 'barelyTrue', 'halfTrue', 'halfTrue', 'mostlyTrue', 'mostlyTrue', 'halfTrue', 'false', 'mostlyTrue', 'barelyTrue', 'halfTrue', 'true', 'barelyTrue', 'halfTrue', 'mostlyTrue', 'false', 'mostlyTrue', 'mostlyTrue', 'halfTrue', 'barelyTrue', 'false', 'mostlyTrue', 'halfTrue', 'false', 'mostlyTrue', 'true', 'barelyTrue', 'false', 'mostlyTrue', 'mostlyTrue', 'true', 'true', 'true', 'pantsFire', 'true', 'false', 'halfTrue', 'pantsFire', 'pantsFire', 'false', 'halfTrue', 'pantsFire', 'false', 'pantsFire', 'true', 'barelyTrue', 'barelyTrue', 'false', 'true', 'barelyTrue', 'halfTrue', 'halfTrue', 'barelyTrue', 'false', 'mostlyTrue', 'false', 'mostlyTrue', 'true', 'mostlyTrue', 'false', 'pantsFire', 'true', 'false', 'barelyTrue', 'barelyTrue', 'true', 'barelyTrue', 'mostlyTrue', 'mostlyTrue', 'mostlyTrue', 'true', 'false', 'barelyTrue', 'false', 'halfTrue', 'true', 'halfTrue', 'false', 'false', 'true', 'halfTrue', 'mostlyTrue', 'mostlyTrue',

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(7, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 64)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
Total params: 386,951
Trainable params: 386,951
Non-trainable params: 0
_________________________________________________________________


In [17]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [18]:
num_epochs = 5
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 10239 samples, validate on 1283 samples
Epoch 1/5
10239/10239 - 134s - loss: 1.7639 - acc: 0.2157 - val_loss: 1.7525 - val_acc: 0.2510
Epoch 2/5
10239/10239 - 128s - loss: 1.6611 - acc: 0.2745 - val_loss: 1.7205 - val_acc: 0.2190
Epoch 3/5
10239/10239 - 129s - loss: 1.5430 - acc: 0.3587 - val_loss: 1.7865 - val_acc: 0.2229
Epoch 4/5
10239/10239 - 129s - loss: 1.4015 - acc: 0.4424 - val_loss: 1.8576 - val_acc: 0.2315
Epoch 5/5
10239/10239 - 129s - loss: 1.2637 - acc: 0.5124 - val_loss: 2.0154 - val_acc: 0.2190


In [19]:
test_labels = []
test_articles = []

with open("test.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        test_labels.append(row[1])
        if(len(row)>3):
            test_article = row[2]+row[3]
        else:
            test_article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            test_article = test_article.replace(token, ' ')
            test_article = test_article.replace(' ', ' ')
        test_articles.append(test_article)

In [20]:
print(len(test_labels))
print(len(test_articles))

1266
1266


In [21]:
i = 0
for i in range(len(test_articles)):
    txt = [];
    txt.append(test_articles[i])
    seq = tokenizer.texts_to_sequences(txt)
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    print(np.argmax(pred))

4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4


In [22]:
print(test_articles[2])

Suzanne Bonamici supports plan cut choice Medicare Advantage seniors.medicare,message-machine-2012,campaign-advertising


In [23]:
txt = [];
txt.append(test_articles[1])
print(txt)

['Says John McCain done nothing help vets.military,veterans,voting-record']


In [24]:
txt = ["I suck"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_length)
pred = model.predict(padded)
labels = ['halfTrue', 'false', 'mostlyTrue', 'barelyTrue', 'true', 'pantsFire'] #orig

print(txt[0])
print(labels[np.argmax(pred)-1])

I suck
barelyTrue


In [25]:
print(test_labels[0])

false
