## Imports

In [35]:
import pandas as pd
import tensorflow as tf
import nltk
import re
import numpy as np

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dropout

## paths

In [3]:
df = pd.read_csv('/kaggle/input/fake-news-classification/WELFake_Dataset.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
df = df.dropna() # Dropping NaN values

In [6]:
X = df.drop('label', axis = 1)
len(X)

71537

In [7]:
y = df['label']
len(y)

71537

In [8]:
X.shape

(71537, 3)

In [9]:
# vocabulary size
vocab_size = 5000
messages = X.copy()
messages.reset_index(inplace=True)

In [10]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import PorterStemmer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Pre-Processing 

In [11]:
ps = PorterStemmer()
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


In [12]:
corpus

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video',
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video',
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid',
 'satan russia unv imag terrifi new supernuk western world take notic',
 'time christian group sue amazon splc design hate group',
 'dr ben carson target ir never audit spoke nation prayer breakfast',
 'hous intel chair trump russia fake stori evid anyth video',
 'sport bar owner ban nfl game show true american sport like speak rural america video',
 'latest pipelin leak underscor danger dakota access pipelin',
 'gop senat smack punchabl alt right nazi internet',
 'may brexit offer would hurt cost eu citizen eu parliament',
 'schumer call trump appoint offici overse puerto rico relief',
 'watch hilari ad call question health age clinton crime famili boss',
 'chang expect espn polit agenda despit huge subscrib declin breitbart'

In [13]:
# taking every word by indexes
one_hot_repr = [one_hot(words, vocab_size)for words in corpus]
one_hot_repr

[[4724, 1785, 2095, 2804, 1161, 446, 1186, 3873, 2559, 663, 1521, 3429],
 [1898,
  3019,
  1874,
  3371,
  3531,
  121,
  1168,
  3210,
  1808,
  1562,
  3950,
  3714,
  1822,
  3429],
 [1282, 2664, 394, 738, 2641, 4971, 4692, 4348, 1323, 1052, 1632, 3983],
 [4267, 4062, 1089, 1983, 2128, 1177, 2810, 3750, 1072, 1656, 3319],
 [4386, 4692, 1752, 3638, 2272, 3903, 1251, 1609, 1752],
 [2939, 875, 4938, 3940, 2538, 476, 2142, 4163, 2029, 286, 4908],
 [1508, 4419, 1541, 914, 4062, 4846, 4971, 3016, 1569, 3429],
 [54,
  4685,
  1911,
  4557,
  3144,
  3334,
  4244,
  3957,
  4340,
  54,
  3374,
  1650,
  211,
  4109,
  3429],
 [4897, 4917, 2175, 3687, 1463, 3909, 4144, 4917],
 [550, 4305, 2150, 2015, 1545, 249, 3386, 3421],
 [1521, 3148, 2974, 806, 3857, 4537, 1030, 4613, 1030, 38],
 [4566, 2808, 914, 2424, 1287, 3780, 3326, 2773, 1885],
 [1607, 4561, 1671, 2808, 490, 3720, 4755, 3444, 3674, 1386, 2635],
 [196, 2869, 112, 712, 754, 4733, 1669, 2764, 4056, 3552],
 [4914, 4655, 218, 2309, 3184

In [14]:
sentence_length = 30
embedded_docs = pad_sequences(one_hot_repr, padding = 'pre', maxlen= sentence_length)
print(embedded_docs)

[[   0    0    0 ...  663 1521 3429]
 [   0    0    0 ... 3714 1822 3429]
 [   0    0    0 ... 1052 1632 3983]
 ...
 [   0    0    0 ... 2739 3903 4354]
 [   0    0    0 ... 1371 4711 3043]
 [   0    0    0 ... 2681 3444 1039]]


In [15]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0, 4724, 1785, 2095, 2804,
       1161,  446, 1186, 3873, 2559,  663, 1521, 3429], dtype=int32)

In [36]:
#creating the model
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length= sentence_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation= 'sigmoid'))
model.compile(loss= 'binary_crossentropy', optimizer= 'adam', metrics= ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 30, 40)            0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
len(embedded_docs),y.shape

(71537, (71537,))

In [38]:
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [39]:
X_final.shape, y_final.shape

((71537, 30), (71537,))

In [40]:
from sklearn.model_selection import train_test_split
import math
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size= 0.25, random_state = 42)

In [41]:
model.fit(X_train, y_train, validation_data= (X_test, y_test), epochs = 20, batch_size = 32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7c8d4be49510>

In [45]:
# predict_x=model.predict(X_test) 
# y_pred=np.argmax(predict_x,axis=1)
# y_pred=model.predict_classes(X_test)
y_pred = (model.predict(X_test) > 0.5).astype("int32")



In [46]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[7812, 1048],
       [ 804, 8221]])

In [47]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8964495387195974