# <b><center>Fake news Classifier</center></b>

In [1]:
#Read Dataset

import pandas as pd
import numpy as np

df = pd.read_csv('train.csv',index_col='id')
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [2]:
#Cleaning Dataset
x = df['title']
y = df['label']

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
corpus = []

for i in range(len(x)):
    text = x[i]
    text = re.sub('[^a-zA-Z]',' ',text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(t) for t in text if t not in stopwords.words('english')]
    corpus.append(' '.join(text))

- Here, we are navigating through X before cleaning up our data with regex and storing it in a corpus list.

- First of all, a space will be added in place of everything that is not an alphabet.

- Then it will be separated and lowercased.

- Then we check to see if the words are stopwords before stemming them.

- Join these results together to form a sentence, then add it to the corpus list.

In [4]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 5000
sent_len = 20

one_hot_encoded = [one_hot(x,vocab_size) for x in corpus]
one_hot_encoded = pad_sequences(one_hot_encoded,maxlen=sent_len)
one_hot_encoded[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1028,
        804, 4382, 4279, 3109, 3693, 1338,  178, 1432, 1373])

- Here, we are utilising one hot to encode our text input to numerical data.
- Keep in mind that this heat is not all 0s and 1s. In this one-hot encoding, the word is given a random number by means of hashing. The range 0-vocab size is used to select the random word.
- The sequences are then being padded with 0s to equalise the length of each line.

In [5]:
from sklearn.model_selection import train_test_split
x = np.array(one_hot_encoded)
y = np.array(y)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)


In [6]:
#Creating the model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.metrics import confusion_matrix,accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout


no_of_output_features = 40

model = Sequential()
model.add(Embedding(vocab_size,no_of_output_features,input_length=sent_len))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 20, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=64,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1dfb4c58fd0>

In [19]:
#Checking metrics of the model

predictions = (model.predict(x_test) > 0.5).astype("int32")
accuracy_score(y_test,predictions)

0.9068765534382767