In [15]:
import csv
import unicodecsv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

In [16]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [17]:
vocab_size = 5000 # make the top list of words (common words)
embedding_dim = 50
max_length = 20
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # OOV = Out of Vocabulary

In [18]:
articles = []
val_articles = []
labels = []
val_labels = []

with open("train.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        labels.append(row[1])
        if(len(row)>13):
            article = row[2]+row[7]+row[13]
        else:
            article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)
        
train_articles = articles[0: 10239]
train_labels = labels[0: 10239]
        
with open("valid.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        val_labels.append(row[1])
        labels.append(row[1])
        if(len(row)>3):
            val_article = row[2]+row[3]
        else:
            val_article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            val_article = val_article.replace(token, ' ')
            val_article = val_article.replace(' ', ' ')
        val_articles.append(val_article)
        
validation_articles = val_articles[0: 1283]
validation_labels = val_labels[0: 1283]

In [19]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [20]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [21]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [22]:
print(set(labels))

{'barelyTrue', 'true', 'false', 'pantsFire', 'mostlyTrue', 'halfTrue'}


In [23]:
label_tokenizer.word_index

{'halftrue': 1,
 'false': 2,
 'mostlytrue': 3,
 'barelytrue': 4,
 'true': 5,
 'pantsfire': 6}

In [24]:
print(training_label_seq.shape)
print('-------------')
print(validation_label_seq.shape)

(10239, 1)
-------------
(1283, 1)


In [31]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
#model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(7, activation='softmax'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 50)          250000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100)               40400     
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 707       
Total params: 291,107
Trainable params: 291,107
Non-trainable params: 0
_________________________________________________________________


In [32]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
num_epochs = 50
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Train on 10239 samples, validate on 1283 samples
Epoch 1/50
10239/10239 - 7s - loss: 1.7630 - acc: 0.2157 - val_loss: 1.7293 - val_acc: 0.2463
Epoch 2/50
10239/10239 - 6s - loss: 1.6245 - acc: 0.3111 - val_loss: 1.7149 - val_acc: 0.2330
Epoch 3/50
10239/10239 - 6s - loss: 1.4350 - acc: 0.4204 - val_loss: 1.8254 - val_acc: 0.2369
Epoch 4/50
10239/10239 - 7s - loss: 1.2413 - acc: 0.5175 - val_loss: 2.0762 - val_acc: 0.2323
Epoch 5/50
10239/10239 - 7s - loss: 1.0440 - acc: 0.6109 - val_loss: 2.2388 - val_acc: 0.2401
Epoch 6/50
10239/10239 - 7s - loss: 0.8543 - acc: 0.6891 - val_loss: 2.5353 - val_acc: 0.2369
Epoch 7/50
10239/10239 - 7s - loss: 0.6846 - acc: 0.7624 - val_loss: 2.8405 - val_acc: 0.2369
Epoch 8/50
10239/10239 - 7s - loss: 0.5541 - acc: 0.8131 - val_loss: 3.2735 - val_acc: 0.2214
Epoch 9/50
10239/10239 - 7s - loss: 0.4460 - acc: 0.8536 - val_loss: 3.6921 - val_acc: 0.2276
Epoch 10/50
10239/10239 - 7s - loss: 0.3666 - acc: 0.8822 - val_loss: 3.9406 - val_acc: 0.2214
Epoch 11/5

In [28]:
test_labels = []
test_articles = []

with open("test.tsv", 'r', encoding = 'utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)
    for row in reader:
        test_labels.append(row[1])
        if(len(row)>3):
            test_article = row[2]+row[3]
        else:
            test_article = row[2]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            test_article = test_article.replace(token, ' ')
            test_article = test_article.replace(' ', ' ')
        test_articles.append(test_article)

In [51]:
print(len(test_labels))
print(len(test_articles))

1266
1266


In [53]:
print(test_articles[0])

Wisconsin pace double number layoffs year.jobs


In [54]:
tokenizer.fit_on_texts(test_articles)
test_sequences = tokenizer.texts_to_sequences(test_articles)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [55]:
svm = svm_wrapper(train_padded, test_padded)

NameError: name 'svm_wrapper' is not defined

In [29]:
i = 0
correct = 0.0;
total = 0.0;
for i in range(len(test_articles)):
    txt = [];
    txt.append(test_articles[i])
    seq = tokenizer.texts_to_sequences(txt)
    padded = pad_sequences(seq, maxlen=30)
    pred = model.predict(padded)
    labels = ['halfTrue', 'false', 'mostlyTrue', 'barelyTrue', 'true', 'pantsFire']
    if(labels[np.argmax(pred)-1] == test_labels[i]):
        print("true")
        correct += 1;
    else:
        print("false")
    total += 1;

false
false
false
false
false
true
false
false
true
true
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
true
false
true
true
false
false
false
false
false
false
true
false
false
false
true
false
false
false
false
false
false
false
false
false
false
true
false
false
false
false
false
false
false
false
true
true
true
false
false
false
false
false
false
false
false
false
false
false
false
false
false
true
false
false
false
false
false
false
false
true
true
false
false
false
false
false
true
false
false
false
false
false
true
true
false
false
false
true
false
true
true
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
false
true
false
false
false
true
false
false
false
true
false
false
true
false
false
false
false
true
false
true
false
false
false
false
false
false
false
false
false
false
false
false
true
false
false
false
false
true
false
fal

In [30]:
print(correct/total)

0.165086887835703


In [65]:
txt = [];
txt.append(test_articles[1])
print(txt)

['Says John McCain done nothing help vets.military,veterans,voting-record']


In [40]:
txt = ["I suck"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_length)
pred = model.predict(padded)
labels = ['halfTrue', 'false', 'mostlyTrue', 'barelyTrue', 'true', 'pantsFire'] #orig

print(txt[0])
print(labels[np.argmax(pred)-1])

I suck
mostlyTrue


In [67]:
print(test_labels[0])

false
