In [None]:
import numpy as np
import pandas as pd
import os
os.environ['KERAS_BACKEND']='tensorflow' 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras import Sequential
from keras.layers import (GRU,LSTM,
                          Embedding, 
                          Dense, 
                          Dropout, 
                          Bidirectional)
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re
from string import punctuation

In [None]:
# Read the labelled training articles; the 'id' column becomes the row index.
train_df = pd.read_csv("../input/fake-news/train.csv", index_col='id')

# Structural overview: dimensions, column names and number of distinct labels.
print('Shape of dataset ', train_df.shape)
print(train_df.columns)
label_values = set(train_df['label'])
print('No. of unique classes', len(label_values))
train_df.head()

In [None]:
# Per-column count of missing values in the training frame.
train_df.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

In [None]:
# Dimensions after the rows with missing values were dropped.
remaining_shape = train_df.shape
print('Shape of dataset ', remaining_shape)

# Preparing the text data

In [None]:
# Shared text-normalisation resources used by cleaning():
stem = PorterStemmer()                        # Porter stemmer instance
stop_words = stopwords.words('english')       # English stop words from the NLTK corpus

In [None]:
def cleaning(text):
    """Normalise one raw document into a string of stemmed tokens.

    Steps, in order:
      1. replace ``@handle`` mentions with a space;
      2. lower-case and split on whitespace;
      3. drop tokens found in ``stop_words`` and stem the rest with ``stem``;
      4. replace runs of digits with a space;
      5. delete every character listed in ``string.punctuation``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is not a string.
    """
    # re.sub raises TypeError on non-string input, so bail out early instead
    # of crashing the whole .apply() on a single missing cell.
    if not isinstance(text, str):
        return ''

    # Build a set once per call so each membership test is a hash lookup
    # rather than a scan of `stop_words`.
    stop_set = set(stop_words)

    text = re.sub('(@[A-Za-z0-9]+)', ' ', text)
    tokens = text.lower().split()
    # NOTE(review): punctuation is still attached to tokens at this point, so
    # e.g. "the," is not matched as a stop word. Order kept unchanged so the
    # produced features stay the same.
    stemmed = [stem.stem(word) for word in tokens if word not in stop_set]
    text = ' '.join(stemmed)
    text = re.sub(r"\d+", ' ', text)
    return ''.join(ch for ch in text if ch not in punctuation)

In [None]:
# Run every training article through cleaning() and keep the result in 'clean'.
train_df['clean'] = train_df['text'].map(cleaning)

In [None]:
# Peek at the first ten cleaned documents.
train_df['clean'].iloc[:10]

In [None]:
# Targets as a NumPy array, inputs as the Series of cleaned documents.
targets = np.asarray(train_df['label'])
texts = train_df['clean']

In [None]:
MAX_NB_WORDS = 20000  # vocabulary cap handed to the tokenizer as num_words

# Fit the tokenizer on the cleaned training texts, then encode those texts
# as lists of integer token ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # vocabulary learned from `texts`
sequences = tokenizer.texts_to_sequences(texts)
print('Number of Unique Tokens', len(word_index))

In [None]:
MAX_SEQUENCE_LENGTH = 1000  # fixed length of every encoded document

# Pad short sequences and cut long ones at the end ('post').
text_data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    truncating='post',
    padding='post',
)

In [None]:
EMBEDDING_DIM = 100  # size of each learned word vector

# Binary classifier:
#   Embedding -> Bidirectional LSTM(64) -> Dropout(0.3) -> Dense(64, relu) -> Dense(1, sigmoid)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Keep only the weights of the epoch with the best validation accuracy.
# The model is compiled with metrics=['accuracy'], which Keras >= 2.3 logs as
# 'val_accuracy'; monitoring 'val_acc' there makes the callback skip every
# save because the key is never present in the logs.
cp = ModelCheckpoint('model_Rnn.hdf5',
                     monitor='val_accuracy',
                     verbose=1,
                     save_best_only=True)

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
VALIDATION_SPLIT = 0.2  # fraction of the training partition used for validation
EPOCHS = 5

# Hold out 25% of the encoded documents for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    text_data,
    targets,
    test_size=0.25,
    shuffle=True,
    random_state=7,
)

# Train on the remaining partition; `cp` is passed as a callback.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=128,
    validation_split=VALIDATION_SPLIT,
    callbacks=[cp],
)

In [None]:
# Report accuracy on the training partition first, then on the held-out partition.
for eval_x, eval_y in ((X_train, y_train), (X_test, y_test)):
    loss, accuracy = model.evaluate(eval_x, eval_y, verbose=True)
    print("Accuracy: {:.4f}".format(accuracy))

## Predicting on the test set

In [None]:
# Read the test set; 'id' stays a regular column because it is needed for the submission.
test_df = pd.read_csv(filepath_or_buffer="../input/fake-news/test.csv")

In [None]:
# Structural overview of the test frame: dimensions, column names, first rows.
test_shape = test_df.shape
print('Shape of dataset ', test_shape)
print(test_df.columns)
test_df.head()

In [None]:
# Per-column count of missing values in the test frame.
test_df.isna().sum()

In [None]:
# Every test row needs a prediction, so missing cells are filled, not dropped.
# Back-fill first (the strategy this notebook already used), then blank out
# whatever is still missing: back-fill cannot fill gaps at the very end of the
# frame, and a leftover NaN would reach cleaning() as a float.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated in
# recent pandas releases.
test_df = test_df.bfill().fillna('')

In [None]:
# Apply the same cleaning() used for training to the test articles.
test_df['clean'] = test_df['text'].map(cleaning)

In [None]:
# Row identifiers for the submission file and the cleaned documents to score.
test_id = test_df['id']
text_test = test_df['clean']

In [None]:
# Encode with the tokenizer fitted on the training texts, then pad/cut with
# the same settings that were used for the training sequences.
test_sequences = tokenizer.texts_to_sequences(text_test)
test_data = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    truncating='post',
    padding='post',
)

In [None]:
# Sequential.predict_classes() no longer exists in current Keras / TensorFlow
# releases. For a single sigmoid output it was equivalent to thresholding the
# predicted probability at 0.5, which is done explicitly here.
preds = (model.predict(test_data) > 0.5).astype('int32')
preds

In [None]:
# Take the first element of each row of `preds` to get a flat list of labels.
predictions = [row[0] for row in preds]

In [None]:
# Number of predicted labels collected in the previous cell.
len(predictions)

In [None]:
# Pair each test id with its predicted label.
submission_columns = {'id': test_id, 'label': predictions}
submission = pd.DataFrame(submission_columns)
submission.shape

In [None]:
# First five rows of the submission frame (5 is the default for head()).
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)