In [33]:
import numpy as np
import pandas as pd
import os
os.environ['KERAS_BACKEND']='tensorflow' 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras import Sequential
from keras.layers import (GRU,LSTM,
                          Embedding, 
                          Dense, 
                          Dropout, 
                          Bidirectional)
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re
from string import punctuation

In [34]:
train_df = pd.read_csv("../input/fake-news/train.csv", index_col = 'id')

print('Shape of dataset ',train_df.shape)
print(train_df.columns)
print('No. of unique classes',len(set(train_df['label'])))
train_df.head()

Shape of dataset  (20800, 4)
Index(['title', 'author', 'text', 'label'], dtype='object')
No. of unique classes 2


Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [35]:
train_df.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [36]:
train_df = train_df.dropna()

In [37]:
print('Shape of dataset ',train_df.shape)

Shape of dataset  (18285, 4)


# Preparing the text data

In [38]:
stop_words = stopwords.words('english')
stem = PorterStemmer()

In [39]:
def cleaning(text): 
    text = re.sub('(@[A-Za-z0-9]+)', ' ', text)
    text = text.lower().split()
    text = [stem.stem(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    text = re.sub(r"\d+",' ', text)
    text = ''.join(p for p in text if p not in punctuation)
    return text

In [40]:
train_df['clean'] = train_df['text'].apply(cleaning)

In [41]:
train_df['clean'].head(10)

id
0     hous dem aide didn’t even see comey’ letter ja...
1     ever get feel life circl roundabout rather hea...
2     truth might get fire octob     tension intelli...
3     video   civilian kill singl us airstrik identi...
4     print iranian woman sentenc six year prison ir...
5     tri times jacki mason voic reason in week’ exc...
7     pari — franc chose idealistic tradit candid su...
9     week michael t flynn resign nation secur advis...
10    organ action activist group morph barack obama...
11    bbc produc spoof “real housewives” tv programm...
Name: clean, dtype: object

In [42]:
texts = train_df['clean']
targets = np.asarray(train_df['label'])

In [43]:
MAX_NB_WORDS = 20000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index  #count unique tokens
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 223439


In [44]:
MAX_SEQUENCE_LENGTH = 1000
text_data = pad_sequences(sequences,maxlen = MAX_SEQUENCE_LENGTH,
                          padding = 'post',
                          truncating = 'post')

In [45]:
EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
cp=ModelCheckpoint('model_Rnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               84480     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,092,801
Trainable params: 2,092,801
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.compile(optimizer='adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [47]:
VALIDATION_SPLIT = 0.2
EPOCHS = 5

X_train, X_test, y_train, y_test = train_test_split(text_data, targets, test_size=0.25, random_state=7, shuffle=True)
history = model.fit(X_train,
                    y_train, 
                    batch_size = 128, 
                    validation_split = VALIDATION_SPLIT,
                    epochs = EPOCHS,
                    callbacks=[cp])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
print("Training Loss: {:.4f}".format(loss))

loss, accuracy = model.evaluate(X_test, y_test, verbose=True)
print("Testing Accuracy: {:.4f}".format(accuracy))
print("Testing Loss: {:.4f}".format(loss))

Training Accuracy: 0.9908
Training Lossy: 0.0341
Testing Accuracy: 0.9617
Testing Loss: 0.1572


In [69]:
from sklearn.metrics import classification_report

y_pred=model.predict(X_test, batch_size=200, verbose=1)
report = classification_report(y_test, y_pred.round())
print(report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2606
           1       0.95      0.96      0.96      1966

    accuracy                           0.96      4572
   macro avg       0.96      0.96      0.96      4572
weighted avg       0.96      0.96      0.96      4572



## Testing

In [70]:
test_df = pd.read_csv("../input/fake-news/test.csv")

In [71]:
print('Shape of dataset ',test_df.shape)
print(test_df.columns)
test_df.head()

Shape of dataset  (5200, 4)
Index(['id', 'title', 'author', 'text'], dtype='object')


Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [72]:
test_df.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [73]:
test_df.fillna(method = 'bfill', inplace = True)

In [74]:
test_df['clean'] = test_df['text'].apply(cleaning)

In [75]:
text_test = test_df['clean']
test_id = test_df['id']

In [76]:
test_sequences = tokenizer.texts_to_sequences(text_test)
test_data = pad_sequences(test_sequences,
                          maxlen = MAX_SEQUENCE_LENGTH,
                          padding = 'post',
                          truncating = 'post') 

In [77]:
preds = model.predict_classes(test_data)
preds

array([[0],
       [1],
       [1],
       ...,
       [0],
       [1],
       [0]], dtype=int32)

In [78]:
predictions =[]
for i in preds:
    predictions.append(i[0])

In [79]:
len(predictions)

5200

In [80]:
submission = pd.DataFrame({'id':test_id, 'label':predictions})
submission.shape

(5200, 2)

In [81]:
submission.head(5)

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1


In [82]:
submission.to_csv('submit.csv',index=False)