### Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential

### Loading in Data

In [None]:
# Earlier data source, kept for provenance:
# df = pd.read_csv("../input/fake-news-detection/data.csv")

# Each frame is named after the file it is read from (the original cell had
# the two names swapped: df_real held the fake set and vice versa).
df_fake = pd.read_csv("../input/news-articles/fake_news_set.csv")
df_real = pd.read_csv("../input/news-articles/real_news_set.csv")

# Stack both sets, shuffle the rows (sample(frac=1)) and rebuild a clean
# 0..n-1 index so the two sources are interleaved.
df = pd.concat([df_fake, df_real], axis=0).sample(frac=1).reset_index(drop=True)
df.head()

### Preprocessing
We use a Keras Tokenizer to convert each article into a sequence of word indices. Then, we use GloVe word embeddings to map those indices to embedding vectors.

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK; a set copy gives O(1) membership tests.
stop = stopwords.words('english')
_stop_lookup = set(stop)


def _strip_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace and each token is compared
    case-insensitively against the stop-word list. Non-string values are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return " ".join(tok for tok in text.split() if tok.lower() not in _stop_lookup)


# The original cell iterated over the *characters* of each article and threw
# the result away, so it had no effect. Filter whole words and write the
# result back to the column that the tokenizer reads.
df['body'] = df['body'].apply(_strip_stopwords)
df['body'].head()

In [19]:
# Text-to-sequence settings.
max_features = 25000   # passed to the Tokenizer as num_words
maxlen = 1000          # every sequence is padded/truncated to this length
embedding_size = 200   # width of each embedding vector

# Fit the vocabulary on the article bodies.
texts = list(df['body'])
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Encode the articles as integer sequences of uniform length.
sequences = tokenizer.texts_to_sequences(df['body'])
X = pad_sequences(sequences, maxlen=maxlen)

# Target column.
y = df['label']

In [20]:
EMBEDDING_FILE = '../input/glove200dtxt/glove.6B.200d.txt'


def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Parameters
    ----------
    word : the token (first field of an embedding line).
    *arr : the remaining fields, each convertible to a float.

    Returns
    -------
    tuple of (word, 1-D float32 numpy array built from `arr`).
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build {token: float32 vector} from the embedding file. Each kept line is
# split on single spaces: the first field is the token, the rest are the
# vector components (see get_coefs).
# - The file is opened in a `with` block so the handle is closed
#   deterministically, and with an explicit encoding so the result does not
#   depend on the platform default.
# - Lines of 100 characters or fewer are skipped (presumably header or
#   malformed lines - TODO confirm against the file).
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().split(" "))
        for line in embedding_file
        if len(line) > 100
    )

# Stack every pretrained vector into one 2-D array to get global statistics.
# np.stack needs a real sequence; passing the dict view directly is
# deprecated in NumPy, hence the explicit list().
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
if embed_size != embedding_size:
    # Fail early with a clear message instead of a broadcast error in the loop.
    raise ValueError(
        "Embedding file has %d dimensions but embedding_size is %d"
        % (embed_size, embedding_size)
    )

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding index), so a vocabulary of
# N words needs N + 1 rows. The original min(max_features, len(word_index))
# was one row short whenever the vocabulary was smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Allocate max_features rows so the matrix always matches an
# Embedding(max_features, embedding_size) layer. Rows without a pretrained
# vector keep a random draw with the same mean/std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embedding_size))
for word, i in word_index.items():
    # Indices at or above max_features are never produced by the tokenizer
    # (num_words=max_features), so they get no row.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  import sys


### Preparing the Embedding and LSTM
We create an embedding layer using the embedding matrix as weights. From there, we build a bidirectional LSTM with global max pooling to classify the news articles.

In [21]:
from keras.callbacks import EarlyStopping

# Training settings.
batch_size = 32
epochs = 30

# Classifier: frozen embedding initialised from embedding_matrix, a
# bidirectional LSTM that returns the full sequence, max-pooling over time,
# then two dropout-regularised dense layers and a sigmoid output.
model = Sequential([
    Embedding(max_features, embedding_size, trainable=False, weights=[embedding_matrix]),
    Bidirectional(CuDNNLSTM(128, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(40, activation="relu"),
    Dropout(0.7),
    Dense(20, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop on validation accuracy and roll back to the best weights seen.
early = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=25,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# validation_split=0.2 holds out a fifth of the data for validation.
history = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early],
)



Train on 14922 samples, validate on 3731 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30

KeyboardInterrupt: 

We can compare the output to the same bidirectional LSTM with no preset embedding weights.

In [None]:
from keras.callbacks import EarlyStopping

# Baseline: same architecture as the pretrained-embedding model, but the
# embedding layer starts from Keras' random initialisation.
# NOTE: this cell rebinds `model`, `early` and `history`; save the earlier
# run's history first if the two runs are to be compared.
model = Sequential()
# With no preset weights the embedding must be trainable. A frozen, randomly
# initialised embedding (the original trainable=False) never learns any word
# representation, so it is not a meaningful baseline.
model.add(Embedding(max_features, embedding_size, trainable=True))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(40, activation="relu"))
model.add(Dropout(0.7))
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop on validation accuracy and roll back to the best weights seen.
early = EarlyStopping(monitor='val_acc', min_delta=0, patience=25, verbose=1, mode='auto', restore_best_weights=True)


batch_size = 32
epochs = 30
# validation_split=0.2 holds out a fifth of the data for validation.
history = model.fit(X, y, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks = [early])