# The Libraries

For the base and ensemble models.

In [1]:
# import libraries
import os
import string
import warnings
import re
import time
import json
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 50)

import nltk
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.externals import joblib

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


For Neural Network

In [2]:
import spacy
from time import time

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import multiprocessing

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

import tensorflow.keras 
from tensorflow.keras.models import Sequential, Model 
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate

from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


For web-scraping

In [3]:
from collections import Counter

import feedparser as fp
import newspaper
from newspaper import Article
import spacy

In [4]:
import en_core_web_sm
nlp = en_core_web_sm.load()

-----------------------------------------------------------------------------------------------------------------------------

# The Data loading

In [5]:
with open('scraped_articles_true.json') as data_file:
    dtrue = json.load(data_file)

In [6]:
for i, site in enumerate((list(dtrue['newspapers']))):
    articles = list(dtrue['newspapers'][site]['articles'])
    if i == 0:
        X_true = pd.DataFrame.from_dict(articles)
    else:
        new_df = pd.DataFrame.from_dict(articles)
        X_true = pd.concat([X_true, new_df], ignore_index = True, sort = True)

In [7]:
with open('scraped_articles_false.json') as data_file:
    dfalse = json.load(data_file)

In [8]:
for i, site in enumerate((list(dfalse['newspapers']))):
    articles = list(dfalse['newspapers'][site]['articles'])
    if i == 0:
        X_false = pd.DataFrame.from_dict(articles)
    else:
        new_df = pd.DataFrame.from_dict(articles)
        X_false = pd.concat([X_false, new_df], ignore_index = True, sort = True)

In [9]:
# a label of 0 is real, a label of 1 is fake
X_true['label'] = 1
X_false['label'] = 0

In [10]:
X_true["author"] = X_true["author"].apply(lambda x: x[0] if len(x) > 0 else np.NaN)
X_false["author"] = X_false["author"].apply(lambda x: x[0] if len(x) > 0 else np.NaN)

X_true.dropna(axis = 0, inplace = True)
X_false.dropna(axis = 0, inplace = True)

In [11]:
if len(X_true) > len(X_false):
    X = pd.concat([X_true.sample(n = len(X_false)), X_false], ignore_index = True, sort = True)
else:
    X = pd.concat([X_true, X_false.sample(n = len(X_true))], ignore_index = True, sort = True)

In [12]:
X.head()

Unnamed: 0,author,label,link,published,text,title
0,Helen Sullivan,1,https://www.theguardian.com/world/2020/may/25/...,2020-05-25T00:00:00,Key developments in the global coronavirus out...,Coronavirus: at a glance
1,Nicola Davis,1,https://www.theguardian.com/world/2020/may/24/...,2020-05-24T00:00:00,Explainer: what do we now know about Covid-19 ...,Explainer: what do we now know about Covid-19 ...
2,Ian Sample,1,https://www.theguardian.com/world/2020/may/22/...,2020-05-22T00:00:00,Politicians have become more cautious about im...,Why we might not get a coronavirus vaccine
3,Sarah Boseley,1,https://www.theguardian.com/science/2020/may/2...,2020-05-22T00:00:00,"Hydroxychloroquine, the anti-malarial drug Don...",Hydroxychloroquine: Trump's Covid-19 'cure' in...
4,Ed Aarons,1,https://www.theguardian.com/sport/2020/mar/13/...,2020-03-13T00:00:00,From major club and international football to ...,Coronavirus and sport – a list of the major ca...


-------------------------------------------------------------------------------------------------------------------------------

# Cleaning part

In [13]:
X['label'].value_counts()

1    266
0    266
Name: label, dtype: int64

In [14]:
df_clean = X
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
df_clean['clean'] = df_clean['text'].astype('str') 
df_clean.dtypes

df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)
# delete Stop Words

df_clean.head()

Unnamed: 0,author,label,link,published,text,title,clean,tokens
0,Helen Sullivan,1,https://www.theguardian.com/world/2020/may/25/...,2020-05-25T00:00:00,Key developments in the global coronavirus out...,Coronavirus: at a glance,Key developments in the global coronavirus out...,"[Key, developments, in, the, global, coronavir..."
1,Nicola Davis,1,https://www.theguardian.com/world/2020/may/24/...,2020-05-24T00:00:00,Explainer: what do we now know about Covid-19 ...,Explainer: what do we now know about Covid-19 ...,Explainer: what do we now know about Covid-19 ...,"[Explainer, what, do, we, now, know, about, Co..."
2,Ian Sample,1,https://www.theguardian.com/world/2020/may/22/...,2020-05-22T00:00:00,Politicians have become more cautious about im...,Why we might not get a coronavirus vaccine,Politicians have become more cautious about im...,"[Politicians, have, become, more, cautious, ab..."
3,Sarah Boseley,1,https://www.theguardian.com/science/2020/may/2...,2020-05-22T00:00:00,"Hydroxychloroquine, the anti-malarial drug Don...",Hydroxychloroquine: Trump's Covid-19 'cure' in...,"Hydroxychloroquine, the anti-malarial drug Don...","[Hydroxychloroquine, the, anti, malarial, drug..."
4,Ed Aarons,1,https://www.theguardian.com/sport/2020/mar/13/...,2020-03-13T00:00:00,From major club and international football to ...,Coronavirus and sport – a list of the major ca...,From major club and international football to ...,"[From, major, club, and, international, footba..."


In [15]:
x = df_clean['clean']

Y = df_clean['label']

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(x, Y, test_size = 0.2, random_state = 1)

-------------------------------------------------------------------------------------------------------------------------------

# Turning the texts into vectors, TFIDF, and N-grams

## Count Vector

In [17]:
count_vect = CountVectorizer(token_pattern = r'\w{1,}')

# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(df_clean['clean'])

# Transform documents to document-term matrix.
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

## N-gram with TFIDF

In [18]:
tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word',
                                   token_pattern = r'\w{1,}',
                                   ngram_range = (2, 3),
                                   max_features = 5000)
print(tfidf_vect_ngram)

tfidf_vect_ngram.fit(df_clean['clean'])
X_train_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram  = tfidf_vect_ngram.transform(X_test)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)


## N-gram Characters

In [19]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer = 'char',
                                         token_pattern = r'\w{1,}',
                                         ngram_range = (2, 3),
                                         max_features = 5000)
print(tfidf_vect_ngram_chars)

tfidf_vect_ngram_chars.fit(df_clean['clean'])
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
X_test_tfidf_ngram_chars  = tfidf_vect_ngram_chars.transform(X_test)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)


-------------------------------------------------------------------------------------------------------------------------------

# 1. Model Testing

In [20]:
row_list = []

In [21]:
def classifier_runner(classifier, X_train, X_test):
    classifier.fit(X_train, Y_train)
    train_accuracy = accuracy_score(Y_train, classifier.predict(X_train))
    test_accuracy = accuracy_score(Y_test, classifier.predict(X_test))
    return train_accuracy, test_accuracy

## Logistic Regression

Setting default iteration to 350.

In [None]:
classifier1 = LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier1, X_train_count, X_test_count)

row_list.append(['1', 'Base', 'Logistic Regression', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier2 = LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier2, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['2', 'Base', 'Logistic Regression', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier3 = LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier3, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['3', 'Base', 'Logistic Regression', 'N_grams', train_accuracy, test_accuracy])

## Linear SVC

In [None]:
classifier4 = LinearSVC(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier4, X_train_count, X_test_count)

row_list.append(['4', 'Base', 'Linear SVC', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier5 = LinearSVC(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier5, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['5', 'Base', 'Linear SVC', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier6 = LinearSVC(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier6, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['6', 'Base', 'Linear SVC', 'N_grams', train_accuracy, test_accuracy])

## K-Neighbors

In [None]:
classifier7 = KNeighborsClassifier()

train_accuracy, test_accuracy = classifier_runner(classifier7, X_train_count, X_test_count)

row_list.append(['7', 'Base', 'K-Neighbors', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier8 = KNeighborsClassifier()

train_accuracy, test_accuracy = classifier_runner(classifier8, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['8', 'Base', 'K-Neighbors', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier9 = KNeighborsClassifier()

train_accuracy, test_accuracy = classifier_runner(classifier9, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['9', 'Base', 'K-Neighbors', 'N_grams', train_accuracy, test_accuracy])

## Decision Tree

In [None]:
classifier10 = DecisionTreeClassifier(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier10, X_train_count, X_test_count)

row_list.append(['10', 'Base', 'Decision Tree', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier11 = DecisionTreeClassifier(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier11, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['11', 'Base', 'Decision Tree', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier12 = DecisionTreeClassifier(random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier12, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['12', 'Base', 'Decision Tree', 'N_grams', train_accuracy, test_accuracy])

## Random Forest - Parameter tuned

Setting assumption
- N estimator = 500
    - This doesn't affect the movement of accuracy
- Max Depth = 3
- Sample Split = 40
- Sample Leaf = 44
- Leaf Nodes = 30
- Weight Fraction = 0.2

In [None]:
classifier13 = RandomForestClassifier(n_estimators = 500, \
                                      max_depth = 3,\
                                      min_samples_split = 40, \
                                      min_samples_leaf = 44, \
                                      max_leaf_nodes = 30, \
                                      min_weight_fraction_leaf = 0.2, \
                                      random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier13, X_train_count, X_test_count)

row_list.append(['13', 'Random Forest', 'Random Forest Classifier', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier14 = RandomForestClassifier(n_estimators = 500, \
                                      max_depth = 3,\
                                      min_samples_split = 40, \
                                      min_samples_leaf = 44, \
                                      max_leaf_nodes = 30, \
                                      min_weight_fraction_leaf = 0.2, \
                                      random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier14, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['14', 'Random Forest', 'Random Forest Classifier', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier15 = RandomForestClassifier(n_estimators = 500, \
                                      max_depth = 3,\
                                      min_samples_split = 40, \
                                      min_samples_leaf = 44, \
                                      max_leaf_nodes = 30, \
                                      min_weight_fraction_leaf = 0.2, \
                                      random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier15, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['15', 'Random Forest', 'Random Forest Classifier', 'N_grams', train_accuracy, test_accuracy])

## Bagging Classifier

Using Logistic regression, KNN & Naive Bayes for classifier

In [None]:
classifier16 = BaggingClassifier(LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0), \
                                 n_estimators = 500, \
                                 max_samples = 44, \
                                 max_features = 40, \
                                 random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier16, X_train_count, X_test_count)

row_list.append(['16', 'Bagging', 'Logistic Regression', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier17 = BaggingClassifier(LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0), \
                                 n_estimators = 500, \
                                 max_samples = 44, \
                                 max_features = 40, \
                                 random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier17, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['17', 'Bagging', 'Logistic Regression', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier18 = BaggingClassifier(LogisticRegression(solver = 'lbfgs', max_iter = 350, random_state = 0), \
                                 n_estimators = 500, \
                                 max_samples = 44, \
                                 max_features = 40, \
                                 random_state = 0)

train_accuracy, test_accuracy = classifier_runner(classifier18, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['18', 'Bagging', 'Logistic Regression', 'N_grams', train_accuracy, test_accuracy])

-----------------------------------------

In [None]:
classifier19 = BaggingClassifier(KNeighborsClassifier())

train_accuracy, test_accuracy = classifier_runner(classifier19, X_train_count, X_test_count)

row_list.append(['19', 'Bagging', 'K-Neighbors', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier20 = BaggingClassifier(KNeighborsClassifier())

train_accuracy, test_accuracy = classifier_runner(classifier20, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['20', 'Bagging', 'K-Neighbors', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier21 = BaggingClassifier(KNeighborsClassifier())

train_accuracy, test_accuracy = classifier_runner(classifier21, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['21', 'Bagging', 'K-Neighbors', 'N_grams', train_accuracy, test_accuracy])

---------------------------

In [None]:
classifier25 = BaggingClassifier(MultinomialNB())

train_accuracy, test_accuracy = classifier_runner(classifier25, X_train_count, X_test_count)

row_list.append(['25', 'Bagging', 'Naive Bayes', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier26 = BaggingClassifier(MultinomialNB())

train_accuracy, test_accuracy = classifier_runner(classifier26, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['26', 'Bagging', 'Naive Bayes', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier27 = BaggingClassifier(MultinomialNB())

train_accuracy, test_accuracy = classifier_runner(classifier27, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['27', 'Bagging', 'Naive Bayes', 'N_grams', train_accuracy, test_accuracy])

## Naive Bayes

In [None]:
classifier22 = MultinomialNB()

train_accuracy, test_accuracy = classifier_runner(classifier22, X_train_count, X_test_count)

row_list.append(['22', 'Base', 'Naive Bayes', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier23 = MultinomialNB()

train_accuracy, test_accuracy = classifier_runner(classifier23, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['23', 'Base', 'Naive Bayes', 'TFIDF', train_accuracy, test_accuracy])

In [22]:
classifier24 = MultinomialNB()

train_accuracy, test_accuracy = classifier_runner(classifier24, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['24', 'Base', 'Naive Bayes', 'N_grams', train_accuracy, test_accuracy])

## Perceptron

In [None]:
classifier28 = Perceptron()

train_accuracy, test_accuracy = classifier_runner(classifier28, X_train_count, X_test_count)

row_list.append(['28', 'Base', 'Perceptron', 'Count_Vector', train_accuracy, test_accuracy])

In [None]:
classifier29 = Perceptron()

train_accuracy, test_accuracy = classifier_runner(classifier29, X_train_tfidf_ngram, X_test_tfidf_ngram)

row_list.append(['29', 'Base', 'Perceptron', 'TFIDF', train_accuracy, test_accuracy])

In [None]:
classifier30 = Perceptron()

train_accuracy, test_accuracy = classifier_runner(classifier30, X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars)

row_list.append(['30', 'Base', 'Perceptron', 'N_grams', train_accuracy, test_accuracy])

# 1.2 Neural Network - Attempt

I borrowed the codes to build the neural network as I'm still an amateur at it. 

## Model 1

In [None]:
df_clean = X
from nltk.tokenize import RegexpTokenizer
t = time()

tokenizer = RegexpTokenizer(r'\w+')
df_clean['clean'] = df_clean['text'].astype('str') 
df_clean.dtypes

df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)
# delete Stop Words

print('Time to tokenize everything: {} mins'.format(round((time() - t) / 60, 2)))
df_clean.head()

In [None]:
import gensim

w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

t = time()
w2v_model.build_vocab(df_clean["tokens"], progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

w2v_model.train(df_clean["tokens"], total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
y = X['label'].values
x = np.array(X["clean"])

#And here is the train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in X_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

In [None]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
train_vecs_w2v = np.concatenate([buildWordVector(z, 300) for z in map(lambda x: x, X_train)])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 300) for z in map(lambda x: x, X_test)])
test_vecs_w2v = scale(test_vecs_w2v)

print ('shape for training set : ',train_vecs_w2v.shape,
      '\nshape for test set : ', test_vecs_w2v.shape)

In [None]:
model1 = Sequential()

model1.add(Dense(128, activation='relu', input_dim=300))
model1.add(Dropout(0.7))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model1.summary()

In [None]:
history = model1.fit(train_vecs_w2v, y_train, epochs = 300, batch_size = 50, validation_data = (test_vecs_w2v, y_test))
loss, train_accuracy = model1.evaluate(train_vecs_w2v, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(train_accuracy))
loss, test_accuracy = model1.evaluate(test_vecs_w2v, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

row_list.append(['31', 'Neural Network', 'ReLU', 'Word_Vector', train_accuracy, test_accuracy])

In [None]:
model2 = Sequential()

model2.add(Dense(128, activation='softmax', input_dim = 300))
model2.add(Dropout(0.7))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model2.summary()

In [None]:
history = model2.fit(train_vecs_w2v, y_train, epochs = 300, batch_size=50, validation_data=(test_vecs_w2v, y_test))
loss, train_accuracy = model2.evaluate(train_vecs_w2v, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(train_accuracy))
loss, test_accuracy = model2.evaluate(test_vecs_w2v, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

row_list.append(['32', 'Neural Network', 'SoftMax', 'Word_Vector', train_accuracy, test_accuracy])

In [None]:
model3 = Sequential()

model3.add(Dense(128, activation='tanh', input_dim=300))
model3.add(Dropout(0.7))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model3.summary()

In [None]:
history = model3.fit(train_vecs_w2v, y_train, epochs = 300, batch_size = 50, validation_data = (test_vecs_w2v, y_test))
loss, train_accuracy = model3.evaluate(train_vecs_w2v, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(train_accuracy))
loss, test_accuracy = model3.evaluate(test_vecs_w2v, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

row_list.append(['33', 'Neural Network', 'TanH', 'Word_Vector', train_accuracy, test_accuracy])

-----------------

## Model 2 - Complex

In [None]:
all_words = [word for tokens in X for word in tokens]
all_sentence_lengths = [len(tokens) for tokens in X]
ALL_VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(ALL_VOCAB)))
print("Max sentence length is %s" % max(all_sentence_lengths))


####################### CHANGE THE PARAMETERS HERE #####################################
EMBEDDING_DIM = 300 # how big is each word vector
MAX_VOCAB_SIZE = 30# how many unique words to use
MAX_SEQUENCE_LENGTH = 9 # max number of words

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE, lower = True, char_level = False)
tokenizer.fit_on_texts(X["text"].tolist())
training_sequences = tokenizer.texts_to_sequences(X_train.tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_embedding_weights = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = w2v_model[word] if word in w2v_model else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)


######################## TRAIN AND TEST SET #################################
train_cnn_data = pad_sequences(training_sequences, maxlen = MAX_SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(X_test.tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [None]:
from tensorflow.keras.layers import concatenate

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable = False, extra_conv = True):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights = [embeddings],
                            input_length = max_sequence_length,
                            trainable = trainable)

    sequence_input = Input(shape = (max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    convs = []
    filter_sizes = [3,4,5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters = 128, kernel_size = filter_size, activation = 'relu')(embedded_sequences)
        l_pool = MaxPooling1D(pool_size = 3)(l_conv)
        convs.append(l_pool)

    l_merge = concatenate([convs[0],convs[1],convs[2]],axis=1)
    
    conv = Conv1D(filters = 128, kernel_size = 3, activation = 'sigmoid')(embedded_sequences)
    pool = MaxPooling1D(pool_size = 3)(conv)
    
    if extra_conv == True:
        x = Dropout(0.5)(l_merge)
    else:
        x = Dropout(0.5)(pool)
    x = Flatten()(x)
    x = Dense(128, activation = 'sigmoid')(x)
    preds = Dense(1, activation='relu')(x)

    model = Model(sequence_input, preds)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adadelta', metrics = ['acc'])
    model.summary()
    
    return model

In [None]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index) + 1, EMBEDDING_DIM, False)

In [None]:
train_cnn_data.shape

In [None]:
history = model.fit(train_cnn_data, y_train, epochs = 10, batch_size = 50, validation_data=(test_cnn_data, y_test))

loss, train_accuracy = model.evaluate(train_cnn_data, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(train_accuracy))
loss, test_accuracy = model.evaluate(test_cnn_data, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

row_list.append(['34', 'Neural Network', 'Multi-layer', 'Word_Vector', train_accuracy, test_accuracy])

In [None]:
from tensorflow.keras.layers import concatenate
from keras.initializers import normal

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, initializer, trainable = False, extra_conv = True):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights = [embeddings],
                            input_length = max_sequence_length,
                            trainable = trainable)

    sequence_input = Input(shape = (max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    convs = []
    filter_sizes = [3, 4, 5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters = 128, kernel_size = filter_size, activation = 'sigmoid')(embedded_sequences)
        l_pool = MaxPooling1D(pool_size = 3)(l_conv)
        convs.append(l_pool)

    l_merge = concatenate([convs[0],convs[1],convs[2]],axis=1)
    
    conv = Conv1D(filters = 128, kernel_size = 3, activation = 'sigmoid')(embedded_sequences)
    pool = MaxPooling1D(pool_size = 3)(conv)
    
    if extra_conv == True:
        x = Dropout(0.5)(l_merge)
    else:
        x = Dropout(0.5)(pool)
    
    x = Flatten()(x)
    x = Dense(128, activation = 'sigmoid', kernel_initializer = initializer)(x)
    preds = Dense(1, activation = 'tanh')(x)

    model = Model(sequence_input, preds)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adadelta', metrics = ['acc'])
    model.summary()
    
    return model

In [None]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index) + 1, EMBEDDING_DIM, 'he_normal', False)

In [None]:
history = model.fit(train_cnn_data, y_train, epochs = 20, batch_size = 50, validation_data = (test_cnn_data, y_test))

loss, train_accuracy = model.evaluate(train_cnn_data, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(train_accuracy))
loss, test_accuracy = model.evaluate(test_cnn_data, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

row_list.append(['34', 'Neural Network', 'Multi-layer', 'Word_Vector', train_accuracy, test_accuracy])

-----

# 2. Result

In [None]:
accuracy_dataframe = pd.DataFrame(row_list, columns = ['Classifier No.', 'Type', 'Classifier', 'Method', 'train_accuracy', 'test_accuracy'])
accuracy_dataframe.set_index('Classifier No.', inplace = True)
accuracy_dataframe.drop_duplicates(inplace = True)
accuracy_dataframe.sort_values('test_accuracy', axis = 0, ascending = False, inplace = True)

accuracy_dataframe['Robust'] = (accuracy_dataframe['train_accuracy'] - accuracy_dataframe['test_accuracy'])/accuracy_dataframe['train_accuracy']

In [None]:
accuracy_dataframe[accuracy_dataframe['Robust'] <= 0.2]

# 3. Predictor

In [23]:
# input the url only here

url_input = "https://www.wsj.com/articles/germany-sees-largest-local-covid-19-outbreak-since-lifting-lockdown-11592415003"

In [24]:
paper = Article(url_input)
newsPaper = {}

paper.download()
paper.parse()
newsPaper['title'] = paper.title
newsPaper['text'] = paper.text
newsPaper['link'] = paper.url
newsPaper['author'] = paper.authors
print("articles downloaded from newspaper url: ", paper.url)

X_input = pd.DataFrame(newsPaper)

articles downloaded from newspaper url:  https://www.wsj.com/articles/germany-sees-largest-local-covid-19-outbreak-since-lifting-lockdown-11592415003


In [25]:
df_clean = X_input
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
df_clean['clean'] = df_clean['text'].astype('str') 
df_clean.dtypes

df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)
# delete Stop Words

df_clean.head()

Unnamed: 0,title,text,link,author,clean,tokens
0,Germany Sees Largest Local Covid-19 Outbreak S...,BERLIN—Germany has recorded its largest local ...,https://www.wsj.com/articles/germany-sees-larg...,Bojan Pancevski,BERLIN—Germany has recorded its largest local ...,"[BERLIN, Germany, has, recorded, its, largest,..."


In [26]:
X_predict = df_clean['clean']

In [27]:
X_predict_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_predict)

In [28]:
output = classifier24.predict(X_predict_tfidf_ngram_chars)

In [34]:
probability = classifier24.predict_proba(X_predict_tfidf_ngram_chars)[:,1]

In [37]:
df_output = df_clean[['link', 'author', 'title', 'text']]

In [38]:
df_output['predict'] = output

In [39]:
df_output['probability'] = probability

In [40]:
df_output

Unnamed: 0,link,author,title,text,predict,probability
0,https://www.wsj.com/articles/germany-sees-larg...,Bojan Pancevski,Germany Sees Largest Local Covid-19 Outbreak S...,BERLIN—Germany has recorded its largest local ...,1,0.532927


Feel free to add output export codes after this line here if needed.

-------------