<a href="https://colab.research.google.com/github/Jagdish05/Fake-News-Detection/blob/main/Final_CNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate this Colab session, then build the PyDrive client `drive`.
# Later cells call `drive.CreateFile(...)` to download the CSV inputs by file id.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials instead of running its own
# auth flow (presumably the ones made available by authenticate_user() above — verify).
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Data

In [None]:
# Load the training CSV from Google Drive, merge title + text into one
# `article` column, normalise the text, and split into train / test sets.
downloaded = drive.CreateFile({'id':"1lXpXmq7xjNxNptIY8dR0yCV6egkYCHVT"})
downloaded.GetContentFile('train.csv')

import pandas as pd
data = pd.read_csv('train.csv')
# str() turns missing values (NaN) into the text 'nan' so concatenation never fails.
data['title'] = data['title'].apply(str)
data['text'] = data['text'].apply(str)
data['article'] = data['title'] + ': ' + data['text']
# Only `article` and `label` are used from here on.
data = data.drop(columns=['title', 'text', 'id', 'author'])

# Removing punctuation
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave all punctuation in place. The raw string
# avoids the invalid-escape warning for "\w".
data['article'] = data['article'].str.replace(r'[^\w\s]', '', regex=True)
# Lower Casing (also collapses runs of whitespace to single spaces)
data['article'] = data['article'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Removing stopwords.
from nltk.corpus import stopwords
import nltk
# A fresh runtime has no NLTK corpora; fetch what the steps below look up.
# 'wordnet' / 'omw-1.4' are presumably what TextBlob's lemmatizer needs — verify
# against the installed NLTK/TextBlob versions.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)
stop = stopwords.words('english')  # kept as a list: later cells reference `stop`
stop_set = set(stop)               # set membership is O(1) per token
data['article'] = data['article'].apply(lambda x: " ".join(w for w in x.split() if w not in stop_set))
# Lemmatization
from textblob import Word
data['article'] = data['article'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
# Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
data['article'] = data['article'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

articles = data['article'].values
y = data['label'].values

# Hold out 20% of the articles for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
articles_train, articles_test, y_train, y_test = train_test_split(articles, y, test_size=0.20, random_state=1000)

Tokenization

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Every article is padded (at the end) or cut to this many tokens.
maxlen = 5000

# Fit the vocabulary on the training articles only, so the test split stays unseen.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(articles_train)

# Adding 1 because of reserved 0 index.
# NOTE(review): word_index holds every word seen during fitting, not just the
# num_words that are kept, so this is larger than the largest id the tokenizer
# can emit — the Embedding built from it is likely oversized. Confirm before changing.
vocab_size = len(tokenizer.word_index) + 1

# Text -> integer id sequences -> fixed-width integer matrices.
train_sequences = tokenizer.texts_to_sequences(articles_train)
test_sequences = tokenizer.texts_to_sequences(articles_test)

X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

CNN model



In [None]:
from keras import layers
from keras.models import Sequential

# Width of each learned word vector.
embedding_dim = 50

# Embedding -> 1D convolution -> global max pool -> small dense head with a
# single sigmoid output for the binary label.
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5000, 50)          7344100   
_________________________________________________________________
conv1d (Conv1D)              (None, 4996, 128)         32128     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 7,377,529
Trainable params: 7,377,529
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train silently, then report accuracy on the data the model was fitted on and
# on the held-out split.
history = model.fit(X_train, y_train, batch_size=10, epochs=20, verbose=False)

for report_line, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    # `loss` / `accuracy` end up holding the test-split values, as before.
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_line.format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.9803


Saving the Model to and Loading It from JSON (Architecture) and HDF5 (Weights)

In [None]:
# Persist the trained model to Google Drive: architecture as JSON, weights as HDF5.
import os

# The target paths only exist once Google Drive is mounted into the Colab
# filesystem; no earlier cell does that, so a fresh runtime would fail here.
# The import is aliased because `drive` already names the PyDrive client.
if not os.path.isdir("/content/drive/My Drive"):
    from google.colab import drive as colab_drive
    colab_drive.mount("/content/drive")

# serialize model to JSON
model_json = model.to_json()
with open("/content/drive/My Drive/final_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("/content/drive/My Drive/final_model.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
from keras.models import model_from_json

# Rebuild the model from the files written by the save cell: the JSON file
# restores the architecture, the HDF5 file restores the weights.
# load json and create model
# `with` guarantees the file handle is closed even if read() raises.
with open("/content/drive/My Drive/final_model.json", 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/content/drive/My Drive/final_model.h5")
print("Loaded model from disk")

Loaded model from disk


Testing With Data Collected From Google

In [54]:
# Load the hand-collected Google News articles and run them through the same
# text normalisation as the training data (reuses `stop`, `Word` and `st`
# defined in the training-data cell).
downloaded1 = drive.CreateFile({'id':"1-yz6aai5IBl7F0WD5C6_OSAywlYUsBFQ"})
downloaded1.GetContentFile('google news.csv')

import pandas as pd
google_data = pd.read_csv('google news.csv', sep=";", encoding='cp1252')
# str() turns missing values (NaN) into the text 'nan' so concatenation never fails.
google_data['title'] = google_data['title'].apply(str)
google_data['text'] = google_data['text'].apply(str)
google_data['article'] = google_data['title'] + ': ' + google_data['text']
# Drop the source columns from *this* frame. The original deleted them from the
# training frame `data`, where they no longer exist, which raises KeyError.
google_data = google_data.drop(columns=['title', 'text'])

# Removing punctuation
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
google_data['article'] = google_data['article'].str.replace(r'[^\w\s]', '', regex=True)
# Lower Casing (also collapses runs of whitespace to single spaces)
google_data['article'] = google_data['article'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Removing stopwords.
google_data['article'] = google_data['article'].apply(lambda x: " ".join(w for w in x.split() if w not in stop))
# Lemmatization
google_data['article'] = google_data['article'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
# Stemming
google_data['article'] = google_data['article'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

# Every collected article gets label 0. A scalar broadcasts to any number of
# rows, unlike the previous hard-coded list of exactly 14 zeros.
google_data['label'] = 0

google_articles = google_data['article'].values
google_y = google_data['label'].values

In [55]:
# Encode the Google News articles with the tokenizer fitted on the training data
# and pad them to the model's input width. New names are used instead of
# rebinding `google_articles`, so the cell can be re-run safely.
google_sequences = tokenizer.texts_to_sequences(google_articles)
google_X = pad_sequences(google_sequences, padding='post', maxlen=maxlen)
# `Sequential.predict_classes` was removed from Keras; for a single sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
(loaded_model.predict(google_X) > 0.5).astype("int32")



array([[0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0]], dtype=int32)