In [1]:
import preprocessing

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import StratifiedKFold

import numpy as np
import os
import re
import pickle as pkl
import collections
import codecs
from functools import reduce



In [2]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# replace this with your data directory
data_dir = '/Users/luu22/Desktop/fake-real-news/train'

# this is where you would store your pickle
pkl_dir = '/Users/luu22/Desktop/fake-real-news/pickle'

preprocessing.load_data(data_dir)
preprocessing.process_content(pkl_dir)

In [4]:
with open(pkl_dir + '/processed.pickle', 'rb') as f:
    data = pkl.load(f)
    contents = data['contents']
    labels = data['labels']

In [5]:
vocabulary_size = 20000

all_words = reduce(lambda w1, w2: w1 + w2, contents)
word_num = len(all_words)
print('There are {} words in the dataset'.format(word_num))

count = [['UNK', -1]]
count.extend(collections.Counter(all_words).most_common(vocabulary_size - 1))

There are 1304658 words in the dataset


In [6]:
len(count)

20000

In [7]:
dictionary = dict()
for word, _ in count:
    dictionary[word] = len(dictionary)
    
dictionary

{'UNK': 0,
 'trump': 1,
 'said': 2,
 'clinton': 3,
 'state': 4,
 'u': 5,
 'would': 6,
 'one': 7,
 'people': 8,
 'year': 9,
 'new': 10,
 'republican': 11,
 'time': 12,
 'president': 13,
 'obama': 14,
 'american': 15,
 'also': 16,
 'campaign': 17,
 'hillary': 18,
 'like': 19,
 'say': 20,
 'election': 21,
 'party': 22,
 'could': 23,
 'even': 24,
 'country': 25,
 'right': 26,
 'many': 27,
 'two': 28,
 'government': 29,
 'day': 30,
 'candidate': 31,
 'first': 32,
 'political': 33,
 'get': 34,
 'make': 35,
 'house': 36,
 'way': 37,
 'white': 38,
 'world': 39,
 'voter': 40,
 'news': 41,
 'vote': 42,
 'know': 43,
 'percent': 44,
 'going': 45,
 'think': 46,
 'sander': 47,
 'donald': 48,
 'last': 49,
 'war': 50,
 'may': 51,
 'presidential': 52,
 'want': 53,
 'democratic': 54,
 'take': 55,
 'policy': 56,
 'much': 57,
 'told': 58,
 'law': 59,
 'well': 60,
 'email': 61,
 'united': 62,
 'medium': 63,
 'group': 64,
 'america': 65,
 'back': 66,
 'week': 67,
 'support': 68,
 'national': 69,
 'go': 70,


In [8]:
# Smoothing: replace the infrequent words to UNKNOWN
for i, content in enumerate(contents):
        for word_idx, word in enumerate(content):
            if word in dictionary:
                index = dictionary[word]
            else:  # the word is considered as infrequent, represent using UNKNOWN
                index = 0
            contents[i][word_idx] = index

In [9]:
X = contents
y = data['labels']

In [10]:
print('---review---')
print(X[0])
print('---label---')
print(y[0])

---review---
[112, 4587, 5, 5571, 1086, 1758, 1343, 795, 1, 375, 6125, 10, 1578, 405, 10847, 94, 7281, 3530, 162, 4, 483, 1286, 115, 703, 471, 125, 470, 822, 0, 2235, 62, 4, 1758, 1343, 108, 937, 8887, 5571, 1086, 5, 1914, 552, 62, 8505, 740, 77, 29, 13, 1393, 48, 1, 567, 291, 2345, 3967, 1893, 94, 1979, 911, 16294, 165, 291, 2345, 2718, 3805, 96, 94, 0, 483, 1286, 5836, 418, 1192, 5572, 1105, 1333, 778, 3531, 69, 113, 256, 10847, 1241, 1611, 5571, 1086, 7, 16295, 344, 112, 578, 39, 10847, 403, 1369, 595, 339, 113, 94, 1759, 3531, 125, 1459, 4842, 5571, 1086, 10, 5, 713, 796, 3621, 596, 1700, 8506, 1086, 108, 137, 1879, 113, 256, 568, 427, 6851, 4279, 32, 543, 5571, 1086, 5705, 12371, 39, 94, 994, 12372, 498, 54, 29, 1932, 7055, 116, 2392, 1086, 667, 1612, 108, 543, 125, 3307, 1460, 18568, 86, 3531, 988, 7056, 2999, 130, 1339, 543, 5571, 1086, 94, 567, 774, 3390, 1086, 2934, 3268, 108, 12372, 498, 29, 849, 15, 330, 483, 363, 0, 6852, 2058, 1430, 116, 5350, 1993, 229, 19, 1932, 4665, 17

In [11]:
print('Maximum news length: {}'.format(
len(max((X), key=len))))

Maximum news length: 10290


In [12]:
print('Minimum news length: {}'.format(
len(min((X), key=len))))

Minimum news length: 2


In [13]:
lens = [len(s) for s in X]
print("average length: %0.1f" % np.mean(lens))

average length: 434.9


In [14]:
len(X)

3000

In [15]:
from keras.preprocessing import sequence

# limit the maximum review length to max_words by truncating longer news and padding shorter news with a null value (0). 
max_words = 400
X_train = sequence.pad_sequences(X, maxlen=max_words)

Using TensorFlow backend.


### A simple Word2Vec CBOW model

In [16]:
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D, Embedding
from keras.models import Model
from keras.utils.np_utils import to_categorical

EMBEDDING_DIM = 50

# input: a sequence of MAX_SEQUENCE_LENGTH integers
def create_model():
    sequence_input = Input(shape=(max_words,))

    embedding_layer = Embedding(20000, EMBEDDING_DIM,
                            input_length=max_words,
                            trainable=True)
    embedded_sequences = embedding_layer(sequence_input)

    average = GlobalAveragePooling1D()(embedded_sequences)
    predictions = Dense(2, activation='softmax')(average)

    model = Model(sequence_input, predictions)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', 
                  metrics=['acc'])
    return model

In [24]:
avg_score1 = []

In [25]:
kfold = StratifiedKFold(n_splits=10, shuffle=True)

i = 0
y = np.array(y)
y_train = to_categorical(np.asarray(y))
cv_score = []


for train, test in kfold.split(X_train, y):
    model = create_model()
#     model.summary()
    print(X_train[0].shape)
    
    i += 1
    print("Running Fold", i, "/", 10)
    model.fit(X_train[train], y_train[train], batch_size=64, epochs=5, verbose=1)
    print("cross_validation score: ", model.evaluate(X_train[test], y_train[test], verbose=0)[1])
    cv_score.append(model.evaluate(X_train[test], y_train[test], verbose=0)[1])
avg_score1.append(sum(cv_score) / 10)

(400,)
Running Fold 1 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.8366666674613953
(400,)
Running Fold 2 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.823333334128062
(400,)
Running Fold 3 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.7966666674613953
(400,)
Running Fold 4 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.7899999992052714
(400,)
Running Fold 5 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.7766666666666666
(400,)
Running Fold 6 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.83
(400,)
Running Fold 7 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.8133333333333334
(400,)
Running Fold 8 / 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
cross_validation score:  0.8533333341280619
(400,)
Running Fold 9 / 10
Epoch 1/5
Epoch 2/5


In [26]:
avg_score1

[0.8146666667461396]

### A more complex model : CNN - LSTM

In [27]:
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout

def create_cnn_lstm():
    # input: a sequence of 1000 integers
    sequence_input = Input(shape=(max_words,), dtype='int32')
    embedding_layer = Embedding(20000, EMBEDDING_DIM,
                        input_length=max_words,
                        trainable=True)
    embedded_sequences = embedding_layer(sequence_input)

    # 1D convolution with 64 output channels
    x = Conv1D(256, 5)(embedded_sequences)
    # MaxPool divides the length of the sequence by 5
    x = MaxPooling1D(5)(x)
    x = Dropout(0.2)(x)
    x = Conv1D(64, 5)(x)
    x = MaxPooling1D(5)(x)
    
    # LSTM layer with a hidden size of 64
    x = Dropout(0.2)(x)
    x = LSTM(16)(x)
    predictions = Dense(2, activation='softmax')(x)

    model = Model(sequence_input, predictions)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model

In [28]:
avg_score2 = []

In [29]:
kfold = StratifiedKFold(n_splits=10, shuffle=True)

i = 0
y = np.array(y)
y_train = to_categorical(np.asarray(y))
cv_score = []

for train, test in kfold.split(X_train, y):
    model = create_cnn_lstm()
#     model.summary()
    print(X_train[0].shape)
#     print(test)
    
    i += 1
    print("Running Fold", i, "/", 10)
    model.fit(X_train[train], y_train[train], batch_size=64, epochs=6, verbose=1)
    print("cross_validation score: ", model.evaluate(X_train[test], y_train[test], verbose=0)[1])
    cv_score.append(model.evaluate(X_train[test], y_train[test], verbose=0)[1])
avg_score2.append(sum(cv_score) / 10)

(400,)
Running Fold 1 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.9033333333333333
(400,)
Running Fold 2 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.9033333333333333
(400,)
Running Fold 3 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.8933333333333333
(400,)
Running Fold 4 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.86
(400,)
Running Fold 5 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.92
(400,)
Running Fold 6 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.91
(400,)
Running Fold 7 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.9033333333333333
(400,)
Running Fold 8 / 10
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
cross_validation score:  0.88000000079

In [31]:
avg_score2

[0.8966666668256125]