In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the raw training split; the location is kept in one named constant.
TRAIN_CSV_PATH = '/content/drive/MyDrive/data/fake news classifier/train.csv/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Display the frame that was just loaded from the CSV for a quick visual check.
df

In [4]:
## Discard every row that has a missing (NaN) value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
## Split the frame into the independent features (X: every column except
## 'label') and the dependent target (y: the 'label' column).
y = df.loc[:, 'label']
X = df.drop(columns='label')


In [6]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

((18285, 4), (18285,))

In [7]:
## check tensorflow version
import tensorflow as tf
# Print the installed TensorFlow version so the environment used for this run
# is recorded next to the results.
print(tf.__version__)

2.5.0


In [8]:
## importing all the library
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [9]:
## Vocabulary size: upper bound for the integer word indices produced by
## one_hot(), and the input dimension of the Embedding layer.
voc_size = 10_000

In [10]:
## Work on an independent copy of the features. dropna() left gaps in the
## index, so rebuild it as 0..n-1 (the old index is kept as an 'index' column)
## to allow positional-style label lookups such as message['title'][i].
message = X.reset_index()

In [None]:
## import nltk library for text preprocessing
import nltk
import re
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# read during preprocessing.
nltk.download('stopwords')

In [None]:
## Data preprocessing
## For every title: keep letters only, lowercase, tokenize on whitespace,
## drop English stopwords, stem the remaining tokens and re-join them.
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Read the stopword list ONCE and keep it as a set: the list is loop-invariant,
# and a set gives constant-time membership tests instead of re-loading and
# scanning the whole list for every single token.
english_stopwords = set(stopwords.words('english'))
corpus = []
for title in message['title']:
  # Replace every non-letter character by a space, then normalise the case.
  review = re.sub('[^a-zA-Z]', ' ', title).lower()
  # Tokenize, filter stopwords and reduce each remaining token to its stem.
  stemmed_tokens = [ps.stem(word) for word in review.split() if word not in english_stopwords]
  corpus.append(' '.join(stemmed_tokens))

In [None]:
# Preview only the first few cleaned titles; dumping the whole corpus list
# (one entry per row of the dataset) floods the notebook output.
corpus[:5]

In [14]:
## Encode every cleaned title as a list of integer word indices bounded by
## voc_size (Keras `one_hot`).
# NOTE(review): `one_hot` is hashing-based; its indices may not be stable across
# interpreter sessions — confirm before persisting or reusing the model.
one_hot_repr = [one_hot(sentence, voc_size) for sentence in corpus]

In [None]:
# Preview only the first few encoded titles instead of dumping the full list.
one_hot_repr[:5]

In [16]:
## Bring every encoded title to the same fixed length so the batch can be fed
## to the Embedding layer: shorter sequences are zero-padded at the front
## ('pre'), using sent_length as maxlen.
sent_length = 20
embedded_doc = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',
)

In [17]:
# Inspect the first padded sequence: leading zeros are the 'pre' padding,
# followed by the word indices of the first title.
embedded_doc[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 8041,
       7442, 8268, 8988, 2285, 7389, 9126,  544, 5243, 5145], dtype=int32)

In [None]:
## Creating the model: Embedding -> LSTM -> single sigmoid unit.
embedding_feature = 40  # size of the dense vector learned for each word index
model = Sequential()
# Map each of the voc_size word indices to an embedding_feature-dimensional
# vector; every input sequence has sent_length entries.
model.add(Embedding(voc_size, embedding_feature, input_length=sent_length))
# Recurrent layer with 100 units that consumes the embedded sequence.
model.add(LSTM(100))
# One sigmoid output: the predicted probability for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit an extra stray "None" line.
model.summary()

In [19]:
## Materialise the padded sequences and the labels as NumPy arrays so they can
## be split and passed to the model.
X_final, y_final = np.array(embedded_doc), np.array(y)

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples for evaluation; the fixed random_state makes the
# split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [None]:
## Model training: 20 epochs with mini-batches of 50 samples.
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation curves and the final test metrics come from the same samples.
model_fitting = model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Show the metrics recorded during training; the 'loss' and 'val_loss' entries
# are plotted further below.
model_fitting.history

In [23]:
import matplotlib.pyplot as plt


In [None]:
# Draw the training and held-out loss curves on the same axes.
for history_key, legend_label in (('loss', 'training loss'), ('val_loss', 'test loss')):
    plt.plot(model_fitting.history[history_key], label=legend_label)
plt.legend()
plt.show()

In [None]:
## Performance metrics and accuracy
# `Sequential.predict_classes` is deprecated in TF 2.5 and removed in later
# releases. For a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5, which yields the same int32
# class labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
from sklearn.metrics import confusion_matrix
# Count correct and incorrect predictions per class on the held-out split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)