In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [3]:
data = pd.read_csv("news.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
data = data.drop(["Unnamed: 0"], axis=1)
data.head(5)

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])

In [6]:
embedding_dim = 50
max_length = 54
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
training_size = 3000
test_portion = 0.1

In [7]:
title = []
text = []
labels = []
for x in range(training_size):
    title.append(data['title'][x])
    text.append(data['text'][x])
    labels.append(data['label'][x])

tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index
vocab_size1 = len(word_index1)
sequences1 = tokenizer1.texts_to_sequences(title)
padded1 = pad_sequences(sequences1, padding=padding_type, truncating=trunc_type)

In [8]:
split = int(test_portion * training_size)
training_sequences1 = padded1[split:training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

In [9]:
training_sequences1 = np.array(training_sequences1)
test_sequences1 = np.array(test_sequences1)

In [10]:
import requests, zipfile, os

# Define GloVe download URL and filename
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"

# Download GloVe zip if not already downloaded
if not os.path.exists(glove_zip_path):
    print("Downloading GloVe embeddings...")
    r = requests.get(glove_url)
    with open(glove_zip_path, "wb") as f:
        f.write(r.content)

# Extract the zip file
if not os.path.exists("glove.6B.50d.txt"):
    print("Extracting...")
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extractall()
    print("Done extracting.")
else:
    print("GloVe file already extracted.")


GloVe file already extracted.


In [11]:
embedding_index = {}
with open("glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs


In [12]:
embedding_index = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
        
embedding_matrix = np.zeros((vocab_size1 + 1, embedding_dim))

for word, i in word_index1.items():
    if i < vocab_size1:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1 + 1, embedding_dim, input_length=max_length, 
                              weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 50)            377600    
                                                                 
 dropout (Dropout)           (None, 54, 50)            0         
                                                                 
 conv1d (Conv1D)             (None, 50, 64)            16064     
                                                                 
 max_pooling1d (MaxPooling1  (None, 12, 64)            0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                     

In [14]:
history = model.fit(
    training_sequences1, 
    np.array(training_labels), 
    epochs=50, 
    validation_data=(test_sequences1, np.array(test_labels)), 
    verbose=2
)

Epoch 1/50


85/85 - 7s - loss: 0.6418 - accuracy: 0.6189 - val_loss: 0.5758 - val_accuracy: 0.6733 - 7s/epoch - 80ms/step
Epoch 2/50
85/85 - 1s - loss: 0.5797 - accuracy: 0.6937 - val_loss: 0.5413 - val_accuracy: 0.7000 - 823ms/epoch - 10ms/step
Epoch 3/50
85/85 - 1s - loss: 0.5371 - accuracy: 0.7307 - val_loss: 0.5158 - val_accuracy: 0.7133 - 830ms/epoch - 10ms/step
Epoch 4/50
85/85 - 1s - loss: 0.4995 - accuracy: 0.7604 - val_loss: 0.5247 - val_accuracy: 0.7300 - 808ms/epoch - 10ms/step
Epoch 5/50
85/85 - 1s - loss: 0.4341 - accuracy: 0.8015 - val_loss: 0.4974 - val_accuracy: 0.7800 - 870ms/epoch - 10ms/step
Epoch 6/50
85/85 - 1s - loss: 0.3874 - accuracy: 0.8237 - val_loss: 0.5144 - val_accuracy: 0.7733 - 835ms/epoch - 10ms/step
Epoch 7/50
85/85 - 1s - loss: 0.3576 - accuracy: 0.8452 - val_loss: 0.5192 - val_accuracy: 0.7800 - 846ms/epoch - 10ms/step
Epoch 8/50
85/85 - 1s - loss: 0.3152 - accuracy: 0.8689 - val_loss: 0.4763 - val_accuracy: 0.7700 - 838ms/epoch - 10ms/step
Epoch 9/5

In [21]:
X = "Revanth Reddy is the current CM of Telangana"

sequences = tokenizer1.texts_to_sequences([X])
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")

This news is False
