In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [15]:
import spacy
from spacy.lang.en import English

In [7]:
# Read train.csv from the fake-news input directory into a DataFrame.
df=pd.read_csv('../input/fake-news/train.csv')

In [8]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(20800, 5)

In [10]:
# Count the missing values in each column.
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# Discard every row that has at least one missing value. The surviving rows
# keep their original index labels (the index is not renumbered).
df = df.dropna(axis=0)

In [12]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [14]:
# Show the full text of the row labelled 0 (one label-based lookup instead of
# indexing the column and then the row).
df.loc[0, 'text']

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emai

In [16]:
# `parser` is built from the English language class alone, while `nlp` loads
# the packaged 'en_core_web_sm' pipeline.
# NOTE(review): a bare English() presumably carries no lemmatizer, so
# `lemma_` values should come from `nlp` — confirm for the installed spaCy.
parser=English()
nlp=spacy.load('en_core_web_sm')

In [38]:
# Run the loaded pipeline on the text of row 0 and print each of the first ten
# tokens next to its lemma. The Doc returned by `nlp` is used directly:
# passing it through `parser` as well adds no annotations and only works on
# spaCy versions that accept a Doc as pipeline input.
words = nlp(df['text'][0])
for word in words[:10]:
    print(word, word.lemma_)

House House
Dem Dem
Aide Aide
: :
We we
Did do
n’t not
Even even
See see
Comey Comey


In [39]:
# Keep only content tokens (no punctuation, stop words or whitespace) and
# reduce each one to its lower-cased, stripped lemma.
words = [
    token.lemma_.lower().strip()
    for token in words
    if not (token.is_punct or token.is_stop or token.is_space)
]
words[:10]

['house',
 'dem',
 'aide',
 'comey',
 'letter',
 'jason',
 'chaffetz',
 'tweet',
 'darrell',
 'lucus']

In [41]:
# Glue the cleaned tokens back into one string, each token preceded by a
# single space (so a non-empty result starts with a space).
st = ''.join(' ' + token for token in words)
st

' house dem aide comey letter jason chaffetz tweet darrell lucus october 30 2016 subscribe jason chaffetz stump american fork utah image courtesy michael jolley available creative commons license apology keith olbermann doubt worst person world week fbi director james comey accord house democratic aide look like know second bad person turn comey send infamous letter announce fbi look email relate hillary clinton email server rank democrats relevant committee hear comey find tweet republican committee chairman know comey notify republican chairman democratic rank member house intelligence judiciary oversight committee agency review email recently discover order contain classified information long letter go oversight committee chairman jason chaffetz set political world ablaze tweet fbi dir inform fbi learn existence email appear pertinent investigation case reopen jason chaffetz @jasoninthehouse october 28 2016 course know case comey actually say review email light unrelated case”–which

In [42]:
# Encode every word of the cleaned text as an integer id with Keras' one_hot.
# NOTE(review): with vocab_size = 30 many different words end up sharing an
# id (see the heavily repeated values in the printed list) — presumably this
# small value is only meant to illustrate the call; confirm.
vocab_size = 30
encoded_reviews = one_hot(st, vocab_size)
print(encoded_reviews)

[9, 25, 6, 18, 18, 16, 3, 4, 1, 28, 21, 16, 6, 15, 16, 3, 12, 28, 5, 22, 20, 5, 28, 9, 22, 6, 3, 28, 21, 18, 21, 27, 27, 16, 3, 19, 4, 7, 10, 18, 9, 9, 4, 6, 10, 21, 16, 23, 12, 16, 14, 18, 24, 12, 18, 24, 4, 10, 4, 1, 21, 25, 4, 4, 3, 7, 4, 23, 20, 18, 29, 4, 8, 23, 13, 16, 18, 8, 8, 13, 4, 3, 19, 9, 25, 27, 13, 23, 14, 15, 4, 24, 26, 22, 12, 1, 1, 13, 18, 12, 13, 23, 13, 16, 3, 16, 24, 3, 16, 4, 4, 28, 12, 4, 1, 12, 4, 2, 17, 6, 17, 12, 16, 3, 7, 21, 12, 6, 13, 16, 17, 18, 27, 6, 15, 4, 22, 29, 16, 16, 15, 29, 5, 16, 7, 28, 17, 24, 9, 3, 22, 8, 11, 18, 12, 6, 21, 3, 1, 24, 19, 29, 21, 24, 7, 3, 26, 4, 24, 26, 4, 2, 5, 11, 6, 27, 2, 24, 9, 25, 9, 4, 6, 29, 18, 3, 11, 6, 27, 12, 9, 7, 16, 18, 18, 11, 29, 1, 20, 4, 23, 13, 4, 23, 17, 18, 18, 8, 12, 24, 4, 23, 13, 9, 17, 13, 13, 24, 29, 23, 16, 3, 4, 19, 6, 16, 24, 4, 7, 27, 3, 8, 23, 13, 15, 15, 4, 24, 6, 6, 3, 18, 5, 6, 4, 28, 16, 10, 9, 6, 29, 20, 8, 23, 7, 18, 16, 27, 25, 18, 3, 24, 13, 11, 14, 2, 11, 1, 21, 23, 16, 17, 23, 16, 18, 5

In [45]:
def cleaner(review):
    """Reduce one raw article to a space-separated string of lemmas.

    The text is run through the loaded `nlp` pipeline; punctuation, stop
    words and whitespace tokens are dropped, and every remaining token
    contributes its lower-cased, stripped lemma.

    Parameters
    ----------
    review : str
        Raw text of a single article (anything `nlp` accepts).

    Returns
    -------
    str
        The kept lemmas joined by single spaces, with no leading or trailing
        separator; ``''`` when no token is kept, including for empty input.
    """
    # Lemmas are taken from `nlp` rather than the bare `parser`: the
    # exploration cells obtained their lemmas from `nlp`, and a pipeline built
    # from English() alone has no lemmatizer component to fill `lemma_`.
    # NOTE(review): the full pipeline is slower than tokenizing only; if this
    # becomes a bottleneck, consider nlp.pipe(...) — verify before changing.
    doc = nlp(review)

    # A single comprehension is enough. Wrapping it in a per-token loop would
    # rebuild the whole list once per token and leave `clean_words` undefined
    # for a document without tokens.
    clean_words = [
        token.lemma_.lower().strip()
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    ]
    return ' '.join(clean_words)

In [46]:
def preprocessing(reviews, vocab_size=5000):
    """Clean and integer-encode a collection of raw articles.

    Parameters
    ----------
    reviews : iterable
        Raw article texts; each one is passed to `cleaner`.
    vocab_size : int, optional
        Size of the id space handed to `one_hot`. Defaults to 5000, the
        value that was previously fixed inside the function.

    Returns
    -------
    list
        One `one_hot` result per input text, in input order.
    """
    # NOTE(review): one_hot is documented by Keras as hashing-based, so
    # distinct words may share an id and the ids are presumably only stable
    # within one interpreter session — confirm before persisting encodings.
    return [one_hot(cleaner(review), vocab_size) for review in reviews]
        