**Importing Libraries**

In [35]:
import numpy as np # for numpy arrays
import pandas as pd # to load dataset and data analysis
import re # regular expression library for fetching and patterns and expressions in data
from nltk.corpus import stopwords  # for comparing and removing less important,common words
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential # to create sequential models in which layers can be linearly stacked
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os  # for accessing operating system environment variables to create folders,setting paths
import json # read and write json format file
from zipfile import ZipFile # to unzip compressed downloaded dataset

**Data Collection through Kaggle API**

In [36]:
!pip install kaggle # installing kaggle to access datasets
kaggle_dictionary= json.load(open('/content/kaggle.json')) #loading kaggle. json format file in kaggle dictionary variable



**Setting up Kaggle credentials**

In [37]:
kaggle_dictionary.keys() # dictionary has key value pairs
os.environ['KAGGLE_USERNAME']= kaggle_dictionary['username'] # accessing environment variables and saving in dictionary
os.environ['KAGGLE_KEY']= kaggle_dictionary['key']

**Downloading kaggle dataset using API Command**

In [38]:
#!/bin/bash
!kaggle datasets download jruvika/fake-news-detection # downloading dataset in compressed zip form

Dataset URL: https://www.kaggle.com/datasets/jruvika/fake-news-detection
License(s): ODbL-1.0
fake-news-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


**Unziping the compressed kaggle dataset**

In [39]:
with ZipFile('/content/fake-news-detection.zip') as unzipped: # unzip the compressed folder
   unzipped.extractall()

**Dataset Loading and Preprocessing**

In [40]:
df=pd.read_csv('/content/data.csv') # creating dataframe and loading dataset
df.head() # first 5 rows

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [41]:
df.tail() # last 5 rows

Unnamed: 0,URLs,Headline,Body,Label
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1
4008,http://beforeitsnews.com/u-s-politics/2017/10/...,Vice President Mike Pence Leaves NFL Game Beca...,Vice President Mike Pence Leaves NFL Game Beca...,0


In [42]:
df.drop(columns='URLs',axis=1,inplace=True) # droping urls

In [43]:
df['Text']= df['Headline']+df['Body'] # merging headline and body

In [44]:
df.drop(columns=['Headline','Body'],axis=1,inplace=True)

In [45]:
df.head()

Unnamed: 0,Label,Text
0,1,Four ways Bob Corker skewered Donald TrumpImag...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [46]:
df.isnull().sum() # checking null values

Unnamed: 0,0
Label,0
Text,21


In [47]:
df=df.dropna() #dropping null values

**Downloading stopwords**

In [48]:
import nltk # importing nltk library for text preprocessing
nltk.download('stopwords') # importing stop words from library
print(stopwords.words('english')) # printing stopwords in imported from library to compare with data

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
stemmed= PorterStemmer() # storing stemming function in variable

In [50]:
def cleaned(Text): #creating a function named as cleaned and providing argument as Text
  cleaned_text= re.sub('[^a-z A-Z\s]','',Text) # re function will substitute every thing except upper,lower casealphabets and space
  cleaned_text= cleaned_text.lower() # will lower case all alphabets to maintain same format
  cleaned_text= cleaned_text.split() # splits the text
  cleaned_text= [stemmed.stem(word) for word in cleaned_text if not word in stopwords.words('english')] #applying for loop for stemming the words to their root form and removing stopwords
  cleaned_text= ' '.join(cleaned_text) # again joining splited words into single string
  return cleaned_text


In [None]:
df['Text']= df['Text'].apply(cleaned) # applying function to dataframe Text feature

**Separating train and test data**

In [None]:
train_data,test_data=train_test_split(df,test_size=0.2,random_state=40)

In [None]:
print(train_data.shape) # checking rows and columns
print(test_data.shape)

**Tokenization**

In [None]:
tokenizer= Tokenizer(num_words=6000) # most frequent common occuring 6000 words while ignoring all other data text
tokenized= tokenizer.fit_on_texts(train_data['Text']) # training tokenizer
paded_sequence_trained=pad_sequences(tokenizer.texts_to_sequences(train_data['Text']),maxlen=200) # converting text to tokens by tokenization and ensuring same length through padding
X_train=paded_sequence_trained
paded_sequence_test= pad_sequences(tokenizer.texts_to_sequences(test_data['Text']),maxlen=200) # same on test data
X_test= paded_sequence_test

In [None]:
Y_train= train_data['Label'] # creatin target features from splited data
Y_test=test_data['Label']

In [None]:
print(X_train)
print(X_test)

**Building LSTM Model**

In [None]:
model= Sequential()
model.add(Embedding(input_dim=6000,output_dim=256,input_length=200,input_shape=(200,)))
model.add(LSTM(256,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))

In [None]:
model.summary()

**Model compiling**

In [None]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

**Model Training**

In [None]:
model.fit(X_train,Y_train,epochs=5,batch_size=64,validation_split=0.2)

**Model Evaluation**

In [None]:
loss,accuracy=model.evaluate(X_test,Y_test)
print(f'The accuracy of model is:{accuracy}')
print(f'The loss value is:{loss}')

**As we have no information about labels being Fake  and authentic because the dataset is itself encoded already,so I am assuming 0 as authentic and 1 as fake news label, You can change it as per your data set **

**Building a predictive Syestem**

In [None]:
def predict_news(Text):
  tokenized_text= tokenizer.texts_to_sequences(Text)
  paded_sequences=pad_sequences(tokenized_text,maxlen=200)
  prediction= model.predict(paded_sequences)
  news= 'Authentic news' if prediction[0][0] > 0.5 else 'Fake News'
  return news



In [None]:
# sample news
sample_news= 'imran khan won election'
news= predict_news([sample_news])
print(f'{news}')