In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the labelled training data from the relative input directory and preview it.
df = pd.read_csv(filepath_or_buffer='../input/fake-news/train.csv')
df.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Count the missing values in each column.
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [4]:
# Discard every row that has at least one missing value, then confirm none remain.
df = df.dropna(axis=0, how='any')
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [5]:
# Separate the target column from the remaining feature columns.
y = df['label']
x = df.drop(columns=['label'])

In [6]:
# Show the dimensions of the features and of the target, one per line.
print(x.shape, y.shape, sep='\n')

(18285, 4)
(18285,)


In [7]:
# The LSTM model is built from the title feature only, so work on a separate copy of the feature frame.

title = x.copy(deep=True)

In [8]:
# Show the title stored in the row labelled 1.
title.loc[1, 'title']

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [9]:
# Give the title frame a fresh 0..n-1 index; the previous row labels are kept
# in an 'index' column. The cleaning loop relies on this positional index.
# The frame is rebuilt from `x` instead of being reset in place so the cell is
# idempotent: an in-place reset inserts one more index column on every re-run.
title = x.reset_index()

In [10]:
# Preview the re-indexed title frame.
title.head(n=5)

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [11]:
# Number of rows available for training.
title.shape[0]

18285

# Data Cleaning

In [12]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [13]:
ps = PorterStemmer()

# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so evaluating it inside the comprehension repeated both the call and a
# linear membership scan for every single token.
english_stopwords = set(stopwords.words('english'))

corpus = []
for raw_title in title['title']:
    # Keep letters only, lowercase the text and tokenise on whitespace.
    tokens = re.sub('[^A-Za-z]', ' ', raw_title).lower().split()

    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [14]:
# Inspect the first five cleaned titles.
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

# One-hot representation

In [15]:
from tensorflow.keras.preprocessing.text import one_hot

In [16]:
# Vocabulary size handed to one_hot and used as the Embedding input dimension.
voc_size = 5000

In [17]:
# Convert every cleaned title into a list of integer word indices.
# NOTE(review): Keras documents one_hot as hashing-based, so distinct words may
# share an index — confirm this is acceptable for the model.
one_hot_rep = [
    one_hot(sentence, voc_size)
    for sentence in corpus
]
one_hot_rep[:5]

[[1909, 910, 485, 271, 3898, 4080, 3653, 1425, 4092, 1816],
 [2832, 1531, 2329, 3039, 1466, 1090, 3967],
 [4568, 62, 4994, 3939],
 [2072, 4667, 3481, 231, 2256, 1231],
 [973, 1466, 3566, 3092, 4940, 3852, 1466, 37, 1498, 2232]]

# Embedding representation 

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Bring every index sequence to a fixed length of 20, padding with zeros at the front.
embedded_rep = pad_sequences(one_hot_rep, maxlen=20, padding='pre')
embedded_rep[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1909,
         910,  485,  271, 3898, 4080, 3653, 1425, 4092, 1816],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 2832, 1531, 2329, 3039, 1466, 1090, 3967],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 4568,   62, 4994, 3939],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 2072, 4667, 3481,  231, 2256, 1231],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  973,
        1466, 3566, 3092, 4940, 3852, 1466,   37, 1498, 2232]],
      dtype=int32)

In [20]:
# Materialise the model targets and inputs as NumPy arrays.
y_final = np.array(y)
x_final = np.array(embedded_rep)

In [21]:
# Inspect the first five target labels.
y_final[:5]

array([1, 0, 1, 1, 1])

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 30% of the samples; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    test_size=0.3,
    random_state=42,
)

# Creating Model

In [24]:
# Output dimension of the Embedding layer (length of each word vector).
embedding_vector_size = 40

In [25]:
# Binary classifier: embedding -> dropout -> LSTM(100) -> dropout -> sigmoid unit.
model = Sequential([
    Embedding(voc_size, embedding_vector_size, input_length=20),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Train for 10 epochs in batches of 64, evaluating on (x_test, y_test) after each epoch.
# NOTE(review): the same x_test/y_test split is reused for the final metrics
# further down — confirm that using it as validation data is intended.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f44680c4650>

# Performance metrics and accuracy

In [28]:
# Sequential.predict_classes was deprecated and later removed from tf.keras.
# For a single sigmoid output the documented replacement is to threshold the
# predicted probability at 0.5. ravel() flattens the (n, 1) result into the
# 1-D label vector that the sklearn metrics expect.
y_pred = (model.predict(x_test) > 0.5).astype('int32').ravel()



In [29]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [30]:
# Fraction of hold-out samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9050309879693766

In [31]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2830,  277],
       [ 244, 2135]])

In [32]:
# Read the unlabelled test data and preview it.
test = pd.read_csv(filepath_or_buffer='../input/fake-news/test.csv')
test.head(n=5)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [33]:
# Drop incomplete rows and take an explicit copy, so that the 'label' column
# added further down is written to an independent frame instead of raising
# pandas' SettingWithCopyWarning.
test_new = test.dropna().copy()
test_new.isnull().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [34]:
# Renumber the surviving rows 0..n-1; the previous row labels are kept in an
# 'index' column. The cleaning loop relies on this positional index.
# The frame is rebuilt from `test` instead of being reset in place so the cell
# is idempotent: an in-place reset inserts one more index column on every re-run.
test_new = test.dropna().reset_index()

In [35]:
# Preview the cleaned, re-indexed test frame.
test_new.head(n=5)

Unnamed: 0,index,id,title,author,text
0,0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
2,3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
3,4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
4,6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."


In [36]:
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so evaluating it inside the comprehension repeated both the call and a
# linear membership scan for every single token.
english_stopwords = set(stopwords.words('english'))

test_corpus = []
for raw_title in test_new['title']:
    # Same cleaning as the training titles: letters only, lowercase, whitespace tokens.
    tokens = re.sub('[^A-Za-z]', ' ', raw_title).lower().split()

    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    test_corpus.append(' '.join(stems))

In [37]:
# Convert every cleaned test title into a list of integer word indices.
test_onehot = [
    one_hot(sentence, voc_size)
    for sentence in test_corpus
]
test_onehot[0]

[2035, 54, 700, 2447, 2542, 4600, 1146, 4976, 4990, 3519, 1223]

In [38]:
# Bring the test sequences to the same fixed length of 20, padding with zeros at the front.
test_embedded = pad_sequences(test_onehot, padding='pre', maxlen=20)
test_embedded[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0, 2035,   54,
        700, 2447, 2542, 4600, 1146, 4976, 4990, 3519, 1223], dtype=int32)

In [39]:
# Sequential.predict_classes was deprecated and later removed from tf.keras.
# For a single sigmoid output the documented replacement is to threshold the
# predicted probability at 0.5. ravel() gives a flat vector that can be
# assigned directly as a DataFrame column.
# NOTE(review): this rebinds y_test, which previously held the hold-out labels
# from train_test_split; the name is kept because the next cell reads it.
y_test = (model.predict(test_embedded) > 0.5).astype('int32').ravel()



In [40]:
# Attach the predicted labels to the cleaned test frame as a new 'label' column.
# NOTE(review): y_test here holds the predictions for test.csv assigned in the
# previous cell, not the hold-out labels produced by train_test_split.
test_new['label']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [98]:
# Preview the test frame together with its predicted labels.
test_new.head(n=5)

Unnamed: 0,index,id,title,author,text,label
0,0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
2,3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",0
3,4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
4,6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",0


In [42]:
# Read the submission template.
submit = pd.read_csv(filepath_or_buffer='../input/fake-news/submit.csv')


In [136]:
# Left-join the predictions onto the submission template by id.
# Template ids with no matching prediction end up with NaN in label_y.
c = submit.merge(test_new[['id', 'label']], how='left', on='id')
c.head(n=5)

Unnamed: 0,id,label_x,label_y
0,20800,0,0.0
1,20801,1,
2,20802,0,0.0
3,20803,1,0.0
4,20804,1,1.0


In [137]:
# Show the first rows that did not receive a prediction from the merge.
c.loc[c['label_y'].isna()].head(n=5)

Unnamed: 0,id,label_x,label_y
1,20801,1,
5,20805,1,
8,20808,0,
9,20809,1,
22,20822,0,


In [139]:
# Where no prediction exists, fall back to the label that came with submit.csv.
# NOTE(review): those fallback labels are not model output — confirm this is intended.
c['label_y'] = c['label_y'].fillna(value=c['label_x'])

In [140]:
# Preview the frame after the gaps in label_y were filled.
c.head(n=5)

Unnamed: 0,id,label_x,label_y
0,20800,0,0.0
1,20801,1,1.0
2,20802,0,0.0
3,20803,1,0.0
4,20804,1,1.0


In [142]:
# The merge left label_y as floats (it contained NaN); convert it back to integers.
c['label_y'] = c['label_y'].astype(int)

In [144]:
# Expose the completed labels under the column name 'label'.
c = c.assign(label=c['label_y'])
c.head(n=5)

Unnamed: 0,id,label_x,label_y,label
0,20800,0,0,0
1,20801,1,1,1
2,20802,0,0,0
3,20803,1,0,0
4,20804,1,1,1


In [145]:
# Remove the two helper columns produced by the merge, keeping id and label.
c = c.drop(columns=['label_x', 'label_y'])
c.head(n=5)

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,0
4,20804,1


In [146]:
# Check the (rows, columns) dimensions of the frame before it is written out.
c.shape

(5200, 2)

In [147]:
# Write the submission file without the DataFrame index column.
c.to_csv(path_or_buf='my_submision.csv', index=False)

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Demo: collapse every run of whitespace into a single space.
doc = "NLP  is an interesting     field.  "
# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence, which Python reports with a warning.
new_doc = re.sub(r"\s+", " ", doc)
print(new_doc)

# Demo: split() with no arguments tokenises on any run of whitespace.
doc.split()

# Demo: replace every non-letter with a space, then tokenise on whitespace.
a = re.sub("[^A-Za-z]", " ", doc).split()
a

# Demo: stem the tokens that are not English stop words.
b = PorterStemmer()
# Build the stop-word set once instead of calling stopwords.words('english')
# (which returns a list) again for every token.
english_stopwords = set(stopwords.words('english'))
[b.stem(word) for word in a if word not in english_stopwords]