In [46]:
import numpy as np
import pandas as pd
import tensorflow as tw
import keras
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from keras.utils import pad_sequences

In [47]:
# Read the labelled training articles and the unlabelled submission articles.
data_set, data_set_sub = (
    pd.read_csv(csv_path)
    for csv_path in ('fake-news/train.csv', 'fake-news/test.csv')
)

In [48]:
# Preview the training frame (columns: id, title, author, text, label).
data_set

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [49]:
# Preview the submission frame (same columns as the training frame, minus label).
data_set_sub

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [50]:
# Missing-value count per column: training frame first, submission frame
# second, separated by a dashed rule.
print(
    data_set.isna().sum(),
    '-----------------',
    data_set_sub.isna().sum(),
    sep='\n',
)

id           0
title      558
author    1957
text        39
label        0
dtype: int64
-----------------
id          0
title     122
author    503
text        7
dtype: int64


In [51]:
# Fill missing headlines so that every row yields a string for the text cleaner.
# The filled column is assigned back instead of calling
# `data_set['title'].ffill(inplace=True)`: an in-place call on a selected
# column is chained assignment, which pandas warns about and which leaves the
# frame untouched once copy-on-write is enabled.
# The trailing fillna('') covers a missing value in the very first row, which
# a forward fill cannot reach.
# NOTE(review): forward fill reuses the previous row's headline for a missing
# one - confirm this is intended rather than filling with an empty string.
data_set['title'] = data_set['title'].ffill().fillna('')
data_set_sub['title'] = data_set_sub['title'].ffill().fillna('')

# Plain Python lists of headlines, in row order.
title_list_dataset = list(data_set['title'])
title_list_dataset_submission = list(data_set_sub['title'])

# Sanity check: one headline per row in each frame.
print(len(title_list_dataset), data_set.shape)
print(len(title_list_dataset_submission), data_set_sub.shape)

20800 (20800, 5)
5200 (5200, 4)


In [52]:
! pip install stopwords



In [53]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
import re

def clean_data(data_list):
    """Normalise raw headlines into stemmed, stopword-free strings.

    Each entry is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the text is split on whitespace, English stopwords
    are dropped, and the remaining tokens are Porter-stemmed and re-joined
    with single spaces.

    Parameters
    ----------
    data_list : iterable
        Raw texts. Entries that are not strings (for example NaN left behind
        by missing values) are treated as empty text instead of raising.

    Returns
    -------
    list of str
        One cleaned string per input entry, in input order.
    """
    # Built once per call: the stemmer and the stopword set do not depend on
    # the sentence, so there is no reason to recreate them per sentence (or,
    # for the set, per token).
    ps = PorterStemmer()
    all_stopwords = set(stopwords.words('english'))

    corpus = []
    for sentence in data_list:
        if not isinstance(sentence, str):
            # Missing values arrive as float NaN; they carry no text.
            sentence = ''

        title = re.sub('[^a-zA-Z]', ' ', sentence.lower())
        title = [ps.stem(word) for word in title.split() if word not in all_stopwords]
        corpus.append(' '.join(title))

    return corpus


# Clean the training headlines first, then the submission headlines.
corpus_train, corpus_submit = map(
    clean_data,
    (title_list_dataset, title_list_dataset_submission),
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kirill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
# Build the word -> integer-id vocabulary from every cleaned title (training
# and submission), so submission titles are encoded with known ids.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus_train + corpus_submit)


from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled titles for validation.
# random_state pins the shuffle so a re-run reproduces the same split, and
# stratify keeps the real/fake ratio equal in both parts.
labels = list(data_set['label'])
train_data_x, validate_x, train_data_y, validate_y = train_test_split(
    corpus_train,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [55]:
# Every encoded title is padded or truncated to this many token ids.
maxlen = 50


def encode_titles(texts):
    """Encode cleaned titles as a fixed-width matrix of token ids.

    Parameters
    ----------
    texts : list of str
        Cleaned titles to encode with the fitted ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        Array with one row per title and ``maxlen`` columns; shorter titles
        are padded at the end (``padding='post'``).
    """
    # Uses the `pad_sequences` imported from keras.utils at the top of the
    # notebook instead of spelling out the tensorflow path three times.
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=maxlen,
        padding='post',
    )


X_train = encode_titles(train_data_x)
y_train = np.array(train_data_y)

X_validate = encode_titles(validate_x)
y_validate = np.array(validate_y)

X_submit = encode_titles(corpus_submit)

# Make the model

In [56]:
# Binary classifier over padded title sequences: a learned 32-dimensional
# embedding, two stacked LSTMs (128 then 64 units) each followed by 50%
# dropout, and a single sigmoid output unit.
vocabulary_size = len(tokenizer.word_index) + 1

nn = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocabulary_size, output_dim=32, input_length=maxlen),
    # return_sequences=True hands the full sequence to the second LSTM.
    keras.layers.LSTM(units=128, return_sequences=True),
    keras.layers.Dropout(rate=0.5),
    keras.layers.LSTM(units=64),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

nn.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [57]:
# Print the layer-by-layer output shapes and parameter counts of the model.
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 32)            525024    
                                                                 
 lstm_4 (LSTM)               (None, 50, 128)           82432     
                                                                 
 dropout_4 (Dropout)         (None, 50, 128)           0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 656,929
Trainable params: 656,929
Non-tr

In [None]:
# Train the compiled model `nn` for 15 epochs, evaluating on the validation
# split after each epoch. The call previously used the name `rnn`, which is
# never defined in this notebook and raises NameError on a fresh kernel; the
# prediction cell also reads from `nn`, so this is the model that must be fit.
trained_obj = nn.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_data=(X_validate, y_validate),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

In [None]:
# Draw the four per-epoch training curves on one axes, in legend order.
curve_names = ['loss', 'val_loss', 'accuracy', 'val_accuracy']
for curve in curve_names:
    plt.plot(trained_obj.history[curve])
plt.legend(curve_names)

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
is_positive = nn.predict(X_submit) >= 0.5
pred_submission = is_positive.astype('int32')

# One row per submission article, indexed by its id, written to Sub.csv.
submit_frame = pd.DataFrame({
    'id': data_set_sub['id'],
    'label': pred_submission.ravel(),
}).set_index('id')
submit_frame.to_csv('Sub.csv')