In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-classification/WELFake_Dataset.csv


In [None]:
import pandas as pd
import numpy as np

import nltk
import string

from nltk.corpus import stopwords
from nltk.stem   import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the WELFake CSV into a DataFrame and preview the first rows.
DATA_PATH = "../input/fake-news-classification/WELFake_Dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
# Remove the "Unnamed: 0" column (it looks like a row index that was saved
# into the CSV). Rebinding `df` with errors="ignore" keeps this cell
# idempotent: the original in-place drop raised KeyError when the cell was
# run a second time because the column was already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Report the dimensions of the frame.
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}\nNumber of columns: {n_cols}")

Number of rows: 72134
Number of columns: 3


In [None]:
# Percentage of missing values in each column.
missing_pct = df.isnull().sum() / len(df) * 100

fig = px.bar(
    missing_pct,
    color_discrete_sequence=["deeppink"],
    pattern_shape_sequence=["."],
)

# Centered, enlarged title; the extra top margin leaves room for it.
title_style = dict(
    text="Percentage of Missing Values for each Column",
    x=0.50,
    xanchor="center",
    yanchor="top",
    font=dict(size=22),
)
fig.update_layout(
    title=title_style,
    margin=dict(t=100),
    xaxis_title="Name of Column",
    yaxis_title="Percentage",
    showlegend=False,
)
fig.show()

In [None]:
# Replace every missing value with a single space so the later string
# concatenation never meets a NaN.
df = df.fillna(" ")

In [None]:
# Merge headline and body into one text field. The explicit " " separator
# stops the last word of the title from fusing with the first word of the
# article (e.g. "...ThreatNo comment..."), which would otherwise hand a bogus
# merged token to the tokenizer.
df["title_text"] = df["title"] + " " + df["text"]
df.head()

Unnamed: 0,title,text,label,title_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...


In [None]:

# Number of characters in each combined title+text string, not counting spaces.
combined_text = df["title_text"]
df["body_length"] = combined_text.str.len() - combined_text.str.count(" ")
df.head()

Unnamed: 0,title,text,label,title_text,body_length
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,4222
1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?,39
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,299
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",6811
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,1668


> The goal here is to generate new features that help a model distinguish fake from real news, so it is always useful to find some way to check whether our new features appear to be predictive.

In [None]:
# NOTE: `bins` is kept because the original cell defines it, but the traces
# below set their bin edges through `xbins` and never read it.
bins = np.linspace(0, 200, 40)

# One histogram trace per class: (label value, legend name, colour, pattern shape).
trace_specs = [
    (0, 'Real', 'deeppink', '.'),
    (1, 'Fake', 'violet', '-'),
]

fig = go.Figure()

for label_value, legend_name, colour, pattern_shape in trace_specs:
    class_lengths = df[df["label"] == label_value]["body_length"]
    fig.add_trace(
        go.Histogram(
            x=class_lengths,
            name=legend_name,
            # Only body lengths in [0, 200) are binned, in steps of 2.
            xbins=dict(start=0, end=200, size=2),
            marker=dict(color=colour, pattern=dict(shape=pattern_shape, size=5)),
        )
    )

fig.update_layout(
    title="Body Length of Real News Vs Body Length of Fake News",
    barmode='stack',
)
fig.show()

> So the body length is very different for real news and fake news: fake news seems to be quite a bit longer than real news. It appears that this extra feature could be really helpful for the model to distinguish real from fake. If we hadn't created this feature, the model might not necessarily pick up on this difference.

In [None]:
# The separate "title" and "text" columns are dropped now that "title_text"
# holds both. Rebinding `df` with errors="ignore" keeps this cell idempotent:
# the original in-place drop raised KeyError when the cell was run a second
# time because the columns were already gone.
df = df.drop(columns=["title", "text"], errors="ignore")

In [None]:
# Pull the model input texts and their labels out of the frame as plain lists.
title_text, labels = (df[column].tolist() for column in ("title_text", "label"))

In [None]:
# The first 80% of the rows (in their existing order) form the training set;
# the remainder is held out as the test set.
train_size = int(len(title_text) * 0.8)

x_train, x_test = title_text[:train_size], title_text[train_size:]
y_train, y_test = labels[:train_size], labels[train_size:]

# numpy versions of the label lists, used when fitting the network later
y_train_np, y_test_np = np.array(y_train), np.array(y_test)

print(f"Train size: {len(x_train)}\nTest size:  {len(x_test)}")

Train size: 57707
Test size:  14427


In [None]:
# Fit the tokenizer on the training texts only, with "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

# Padding options shared by both splits: length 50, pad and truncate at the end.
pad_options = dict(maxlen=50, padding="post", truncating="post")

train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# First 20 (word, index) pairs of the fitted vocabulary, taken without
# materialising the whole item list.
[pair for _, pair in zip(range(20), word_index.items())]

[('<OOV>', 1),
 ('the', 2),
 ('to', 3),
 ('of', 4),
 ('and', 5),
 ('a', 6),
 ('in', 7),
 ('that', 8),
 ('is', 9),
 ('for', 10),
 ('on', 11),
 ('it', 12),
 ('he', 13),
 ('with', 14),
 ('s', 15),
 ('was', 16),
 ('as', 17),
 ('said', 18),
 ('by', 19),
 ('trump', 20)]

In [None]:
# The same vocabulary with every pair flipped to (index, word).
reverse_index = list(zip(word_index.values(), word_index.keys()))
reverse_index[:20]

[(1, '<OOV>'),
 (2, 'the'),
 (3, 'to'),
 (4, 'of'),
 (5, 'and'),
 (6, 'a'),
 (7, 'in'),
 (8, 'that'),
 (9, 'is'),
 (10, 'for'),
 (11, 'on'),
 (12, 'it'),
 (13, 'he'),
 (14, 'with'),
 (15, 's'),
 (16, 'was'),
 (17, 'as'),
 (18, 'said'),
 (19, 'by'),
 (20, 'trump')]

In [None]:
# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation is repeatable from run to run (best-effort reproducibility;
# the original cell left the run unseeded).
tf.random.set_seed(42)

model = tf.keras.Sequential([
    # 500 embedding rows (the tokenizer is created with num_words=500),
    # 16 dimensions per token, for padded sequences of length 50.
    tf.keras.layers.Embedding(500, 16, input_length=50),
    # Average the token embeddings over the sequence axis.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: one score in [0, 1] per example.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 16)            8000      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 8,109
Trainable params: 8,109
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 30 epochs; the held-out split is passed as validation data so its
# loss and accuracy are recorded in `history` as well.
history = model.fit(
    x=train_padded,
    y=y_train_np,
    epochs=30,
    validation_data=(test_padded, y_test_np),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
import plotly.subplots as sp

# Per-epoch training and validation accuracy recorded by model.fit.
accuracy, val_accuracy = (
    history.history[key] for key in ("accuracy", "val_accuracy")
)

accuracy_df = pd.DataFrame({'accuracy': accuracy, 'val_accuracy': val_accuracy})

# One line per metric, with the epoch number (the frame's index) on the x axis.
fig = px.line(
    accuracy_df,
    x=accuracy_df.index,
    y=list(accuracy_df.columns),
    title='Accuracy and Validation Accuracy',
    color_discrete_sequence=["violet", "purple"],
)
fig.update_layout(xaxis_title="Epochs", yaxis_title="Accuracy")
fig.show()

In [None]:
# Per-epoch training and validation loss recorded by model.fit.
loss, val_loss = (history.history[key] for key in ("loss", "val_loss"))

loss_df = pd.DataFrame({'loss': loss, 'val_loss': val_loss})

# One line per metric, with the epoch number (the frame's index) on the x axis.
fig = px.line(
    loss_df,
    x=loss_df.index,
    y=list(loss_df.columns),
    title='Loss and Validation Loss',
    color_discrete_sequence=["violet", "purple"],
)
fig.update_layout(xaxis_title="Epochs", yaxis_title="Loss")
fig.show()




In [None]:
# Score every padded test sequence with the trained model.
preds = model.predict(x=test_padded)



In [None]:
def round(num, threshold=0.5):
    """Turn a score into a hard 0/1 class label.

    NOTE(review): this name shadows Python's built-in ``round`` for the rest
    of the notebook. It is kept because other cells call it by this name;
    consider renaming it (and its callers) together.

    Args:
        num: Score to threshold, e.g. a sigmoid output.
        threshold: Cut-off above which the score counts as class 1.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        1 if ``num`` is strictly greater than ``threshold``, otherwise 0.
    """
    # Strict comparison: a score exactly equal to the threshold maps to 0.
    return 1 if num > threshold else 0

# Collapse the prediction array to one dimension, then convert every score
# to a 0/1 label with the notebook's own `round` helper.
preds = preds.flatten()
test_result = [round(score) for score in preds]
test_result[:10]

[1, 0, 0, 0, 0, 0, 1, 1, 1, 1]

In [None]:
# Count the test examples whose predicted label differs from the true label.
error = sum(
    1 for predicted, actual in zip(test_result, y_test) if predicted != actual
)
total = len(test_result)

print(error)
print(total)
# Accuracy on the test set, as a percentage.
print((total - error) / total * 100)

1066
14427
92.6110764538712


# **≈93% accuracy on the test set (92.61%)**