<a href="https://colab.research.google.com/github/MNaplesDevelopment/ACM-Fake-News/blob/master/RNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, Embedding
from keras.optimizers import RMSprop, Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
# Load data
import pickle
with open('real.pkl', 'rb') as f:
    real_titles = pickle.load(f)
with open('fake.pkl', 'rb') as f:
    fake_titles = pickle.load(f)
with open('embs.pkl', 'rb') as f:
    embs = pickle.load(f)

In [0]:
# Store into array, at this point the titles are still strings.
real = np.asarray(real_titles)[:len(fake_titles)]
fake = np.asarray(fake_titles)
# Create labels for each title, one for fake and 0 for real
fake_labels = np.ones(len(fake))
real_labels = np.zeros(len(real))

In [0]:
# The embeddings are currently stored in a dictionary and we need to turn it into a 2-D array of size (# words, Embedding Length)
# word_to_int will take a word and convert it to it's index into the embedding matrix
# int_to_word does the opposite - takes an index and converts it back to a word
embedding_matrix = []
int_to_word = []
word_to_int = {}
i = 0
for word, emb in embs.items():
    embedding_matrix.append(emb)
    int_to_word.append(word)
    word_to_int[word] = i
    i += 1
    
embedding_matrix.append(np.zeros(100)) # For unknown words we use an array of zeros.
embedding_matrix = np.asarray(embedding_matrix)

In [0]:
print(word_to_int['and'])
print(int_to_word[3])
print(np.array_equal(embs['and'], embedding_matrix[3]))

3
and
True


In [0]:
# Concat fake and real title into 1 array- same with the labels
train_data = np.concatenate((real, fake), axis=0)
train_labels = np.concatenate((real_labels, fake_labels), axis=0)
# "Randomly" shuffle data with the same seed to ensure the 2 arrays maintain their parallel relationship
np.random.seed(3)
np.random.shuffle(train_data)
np.random.seed(3)
np.random.shuffle(train_labels)

In [0]:
# Chop off 1000 examples for testing
test_data = train_data[train_data.shape[0]-1000:]
test_labels = train_labels[train_labels.shape[0]-1000:]
train_data = train_data[:train_data.shape[0]-1000]
train_labels = train_labels[:train_labels.shape[0]-1000]

num_words = len(embs)

In [0]:
# Converts each title of strings into integers - each word is turned into it's index into the embedding matrix.
train_data_tokens = []
test_data_tokens = []
num_words_missed = 0
num_words_found = 0
for i in range(train_data.shape[0]):
    train_data_tokens.append([])
    for word in train_data[i].split():
        if word.lower() in embs:
            train_data_tokens[i].append(word_to_int[word.lower()])
            num_words_found += 1
        else:
            train_data_tokens[i].append(-1)
            num_words_missed += 1
for i in range(test_data.shape[0]):
    test_data_tokens.append([])
    for word in test_data[i].split():
        if word.lower() in embs:
            test_data_tokens[i].append(word_to_int[word.lower()])
            num_words_found += 1
        else:
            test_data_tokens[i].append(embedding_matrix.shape[0]-1)
            num_words_missed += 1
print("Number of words embedding found %d" % num_words_found)
print("Number of words embedding missing %d" % num_words_missed)

Number of words embedding found 226332
Number of words embedding missing 40147


In [0]:
# Function for taking a title thats been converting into indeces back into strings.
print(train_data_tokens[3])
int_to_word.append("unknown")
def tokens_to_string(tokens):
    words = [int_to_word[token] for token in tokens if token != 0]
    text = " ".join(words)
    return text
print(tokens_to_string(train_data_tokens[3]))
# See the how the array of ints gets converted into words.

[-1, 11548, 9, 341, 18, 61, 14, 2976, 194, 3605, 441, 3911]
unknown agony is far from over as syrian general seeks further battles


In [0]:
# Unimportant
num_tokens = [len(tokens) for tokens in train_data_tokens + test_data_tokens]
num_tokens = np.asarray(num_tokens)

In [0]:
max_tokens = np.max(num_tokens)
print(max_tokens)

68


In [0]:
# Tensorflow requires each title is the same length - that's all this function does.
pad = 'pre'
train_data_pad = pad_sequences(train_data_tokens, maxlen=max_tokens,
                              padding=pad, truncating=pad)
test_data_pad = pad_sequences(test_data_tokens, maxlen=max_tokens,
                             padding=pad, truncating=pad)

In [0]:
# Create Network
from keras.layers import Dropout
num_words = len(int_to_word)
model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0],
                   output_dim=embedding_matrix.shape[1],
                   input_length=max_tokens,
                   weights=[embedding_matrix],
                   trainable=False,
                   name='embedding_layer'))   
model.add(CuDNNLSTM(16, return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(8))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
             optimizer=optimizer,
             metrics=['accuracy'])

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 68, 100)           3604700   
_________________________________________________________________
cu_dnnlstm_9 (CuDNNLSTM)     (None, 68, 16)            7552      
_________________________________________________________________
dropout_9 (Dropout)          (None, 68, 16)            0         
_________________________________________________________________
cu_dnnlstm_10 (CuDNNLSTM)    (None, 8)                 832       
_________________________________________________________________
dropout_10 (Dropout)         (None, 8)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 3,613,093
Trainable params: 8,393
Non-trainable params: 3,604,700
______________________________________________________________

In [0]:
# Train
%%time
model.fit(train_data_pad, train_labels,
         validation_split=0.05, epochs=3, batch_size=64)

Train on 22615 samples, validate on 1191 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 24 s, sys: 6.6 s, total: 30.6 s
Wall time: 30.4 s


<keras.callbacks.History at 0x7f950f8ec7b8>

In [0]:
# Test on testing data
result = model.evaluate(test_data_pad, test_labels)



In [0]:
print("accuracy: {0:.2%}".format(result[1]))

accuracy: 86.80%


In [0]:
title = np.asarray(['US Suspends Nuclear Arms Control Treaty With Russia'.lower()])
real_or_fake = 'real'
label = "0" if real_or_fake == 'real' else '1'
wrong = 'real' if real_or_fake == 'fake' else 'fake'

In [0]:
num_words_found = 0
num_words_missed = 0
title_tokens = []
for i in range(title.shape[0]):
    title_tokens.append([])
    for word in title[i].split():
        if word in embs:
            title_tokens[i].append(word_to_int[word.lower()])
            num_words_found += 1
        else:
            title_tokens[i].append(-1)
            num_words_missed += 1
print(num_words_missed)
print(num_words_found)

0
8


In [0]:
title_pad = pad_sequences(title_tokens, maxlen=max_tokens,
                              padding=pad, truncating=pad)

In [0]:
result = model.evaluate(title_pad, np.array([label]))



In [0]:
print("accuracy: {0:.2%}".format(result[1]))
print('Prediction: ' + real_or_fake if result[1] == 1 else wrong)

accuracy: 0.00%
fake
