In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn as sk
import scipy as sp
import matplotlib.pyplot as plt
import nltk
import gensim
import string, time
from math import floor, ceil

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score

from nltk import tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import model_from_json
from tensorflow.keras import metrics

from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import gensim.downloader as api

from scipy.sparse.csr import csr_matrix

# path prefix for saved models -- presumably used by cells outside this part of the notebook (TODO confirm)
MODEL_PATH = '/'
# the NLTK English stop word list, stored as a set
STOP_WORDS = set(stopwords.words('english'))
# integer class labels assigned to the fake and real articles
FAKE, REAL = 0, 1
# default random_state used by split_dataset
SEED = 145
# token handed to the Keras Tokenizer for out-of-vocabulary words
OOV = '<OOV>'
# padding token (a string); truncate_vocabulary stores it at index 0 of the vocabulary
PAD_VALUE = '0'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Parameters

In [None]:
# fractions of the complete dataset assigned to each split
TRAINING_SIZE = .70
VALIDATION_SIZE = .15
TEST_SIZE = .15
# compare with a tolerance: fractions that sum to 1 on paper do not always do so in floating point
assert abs(TRAINING_SIZE + VALIDATION_SIZE + TEST_SIZE - 1) < 1e-9

# number of entries kept in the vocabulary
VOCAB_SIZE = 10000
# number of tokens every article is truncated or padded to
TEXT_MAX_LENGTH = 300
# where tokens are dropped from / added to a sequence ('pre' or 'post')
TRUNCATING = 'post'
PADDING = 'post'

# dimensionality of the word2vec and doc2vec vectors
WORD_SIZE = 300
DOC_SIZE = 300

# named verbosity levels
SILENT, PROGRESS_BAR, ONE_LINEPER_EPOCH = 0, 1, 2

# Dataset Formation Functions

In [None]:
def combine_title_and_text(dataset) -> pd.DataFrame:
    """
    Merge the headline of every article into its body.

    parameters:
      - dataset: a Pandas DataFrame that has at least the columns 'title', 'text' and 'label'

    usage:
      - dataset = combine_title_and_text(dataset)

    returns:
      - a new Pandas DataFrame with exactly two columns:
          > 'text': "<title>. <text>"
          > 'label': copied from the input
    """

    required_columns = ('title', 'text', 'label')
    assert all(name in dataset.columns for name in required_columns)

    # element-wise concatenation; a missing title or text yields a missing result
    merged_text = dataset['title'] + ". " + dataset['text']

    return pd.DataFrame({'text': merged_text, 'label': dataset['label']})

In [None]:
def _load_labelled_articles(path, label, include_title, english_only=False) -> pd.DataFrame:
    """
    Read one CSV file of articles, label every row and discard the columns that are not needed.

    parameters:
      - path: location of the CSV file to read
      - label: the class label assigned to every row of the file
      - include_title: boolean specifying whether the title should be merged into the text
      - english_only: boolean specifying whether only rows whose 'language' column equals 'english' are kept

    returns:
      - a Pandas DataFrame reduced to the 'text' and 'label' columns
        (with the title merged into 'text' when include_title is set)
    """

    articles = pd.read_csv(path)
    if english_only:
        # copy the filtered rows so the label assignment below writes to an independent frame
        articles = articles.loc[articles.language == 'english'].copy()
    articles['label'] = label

    wanted = ['title', 'text', 'label'] if include_title else ['text', 'label']
    articles = articles.drop(columns=[col for col in articles.columns if col not in wanted])

    if include_title:
        # fold the 'title' and 'text' columns into one column named 'text'
        articles = combine_title_and_text(articles)

    return articles


def download_dataset(include_title = True) -> pd.DataFrame:
    """
    parameters:
      - include_title: boolean specifying whether the title should be included in the text

    usage:
      - dataset = download_dataset()
      - dataset = download_dataset(include_title = False)

    returns:
      - a Pandas Dataframe containing the complete set of data
          - the columns are:
            > text
            > label
    """

    # (file, class label, keep only English rows?)
    sources = [
        ('drive/My Drive/Colab Notebooks/Data/True.csv', REAL, False),
        ('drive/My Drive/Colab Notebooks/Data/Fake.csv', FAKE, False),
        ('drive/My Drive/Colab Notebooks/Data/FakeNews.csv', FAKE, True),
    ]
    frames = [_load_labelled_articles(path, label, include_title, english_only)
              for path, label, english_only in sources]

    # combine the three datasets and remove all rows with nan values
    return pd.concat(frames).dropna().reset_index(drop=True)

In [None]:
def split_dataset(data, training_size=TRAINING_SIZE, validation_size=VALIDATION_SIZE, test_size=TEST_SIZE, stratify=True, random_state=SEED) -> tuple:
    """
    parameters:
      - data: the complete dataset (columns 'text' and 'label') to split into training, validation, and test sets
      - training_size: the fraction of the data to use for training
      - validation_size: the fraction of the data to use for the validation set
      - test_size: the fraction of the data to use for testing
      - stratify: boolean value representing if the class proportions of data.label should be preserved in every set
      - random_state: the seed passed to train_test_split for consistency

    usage:
      - train_x, train_y, validation_x, validation_y, test_x, test_y = split_dataset(dataset)

    returns:
      - a tuple of six Pandas Series, in this order:
          > training text, training labels
          > validation text, validation labels
          > test text, test labels
    """

    # check whether the dataset contains the columns text and label
    assert 'text' in data.columns and 'label' in data.columns
    # check whether the fractions add up to one (with a tolerance, since they are floats)
    assert abs(training_size + validation_size + test_size - 1) < 1e-9

    # absolute set sizes, both measured against the complete dataset
    num_test = int(len(data) * test_size)
    num_validation = int(len(data) * validation_size)

    # carve off the test set; stratify on the real class labels of the data that was passed in
    remaining_data, test_data, remaining_label, test_label = train_test_split(
        data.text, data.label,
        test_size=num_test,
        random_state=random_state,
        stratify=data.label if stratify else None)

    # split what is left into the training and validation sets
    train_data, validation_data, train_label, validation_label = train_test_split(
        remaining_data, remaining_label,
        test_size=num_validation,
        random_state=random_state,
        stratify=remaining_label if stratify else None)

    return train_data, train_label, validation_data, validation_label, test_data, test_label

# Data Cleaning and Preprocessing

In [None]:
def remove_punctuation(data, print_time=False) -> pd.Series:
    """
    parameters:
      - data: a Pandas Series containing all of the text
      - print_time: boolean specifying whether or not the amount of time the function took should be printed

    usage:
      - dataset = remove_punctuation(data)

    returns:
      - a Pandas Series with all punctuation removed
        (every character of string.punctuation plus a few typographic quotes, dashes, the ellipsis and the bullet)
    """

    assert isinstance(data, pd.Series)

    # obtain the start time
    if print_time:
        start_time = time.time()

    # delete the characters with a translation table instead of a regex character class:
    # nothing has to be escaped (so the backslash is removed too) and the result does not
    # depend on the regex default of Series.str.replace
    punctuation_table = str.maketrans('', '', string.punctuation + '’‘“”…—–•')
    data = data.str.translate(punctuation_table)

    # print the amount of time it took
    if print_time:
        print(f'it took {round(time.time() - start_time, 2)} seconds to remove puctuation')

    return data

In [None]:
def remove_stopwords(data, print_time=False) -> pd.Series:
    """
    parameters:
      - data: a Pandas Series containing all of the text
      - print_time: boolean specifying whether or not the amount of time the function took should be printed

    usage:
      - dataset = remove_stopwords(data)

    returns:
      - a Pandas Series with every whole-word occurrence of a word in STOP_WORDS deleted
        (matching is case sensitive; the surrounding whitespace is left in place)
    """

    assert isinstance(data, pd.Series)

    # obtain the start time
    if print_time:
        start_time = time.time()

    # try longer stop words first so "don't" is removed as a whole instead of "don" followed by "t";
    # the explicit ordering also makes the pattern independent of set iteration order
    # (assumes the stop words contain no regex metacharacters -- TODO confirm for custom lists)
    ordered_stop_words = sorted(STOP_WORDS, key=lambda word: (-len(word), word))
    pattern = r'\b(?:{})\b'.format('|'.join(ordered_stop_words))

    # removes all english stop words from the dataset; regex=True is spelled out because
    # the default of Series.str.replace differs between pandas versions
    data = data.str.replace(pattern, '', regex=True)

    # print the amount of time it took
    if print_time:
        print(f'it took {round(time.time() - start_time, 2)} seconds to remove stopwords')

    return data

In [None]:
def remove_uppercase(data, print_time=False) -> pd.Series:
    """
    parameters:
      - data: a Pandas Series containing all of the text
      - print_time: boolean specifying whether or not the amount of time the function took should be printed

    usage:
      - dataset = remove_uppercase(data)

    returns:
      - a Pandas Series with all text being lowercase; missing values stay missing
    """

    assert isinstance(data, pd.Series)

    # obtain the start time
    if print_time:
        start_time = time.time()

    # converts all uppercased letters/words to lowercase with the vectorised string accessor,
    # which passes missing values through instead of raising on them
    data = data.str.lower()

    # print the amount of time it took
    if print_time:
        print(f'it took {round(time.time() - start_time, 2)} seconds to remove uppercases')

    return data

In [None]:
def tokenize_text(data, token_length='word') -> pd.DataFrame:
    """
    Split every article into tokens with NLTK.

    parameters:
      - data: a Pandas Series containing all of the text
      - token_length: string value selecting the granularity of the tokens
          - possible values:
            > 'word'     - each article becomes a list of its words
            > 'sentence' - each article becomes a list of its sentences

    usage:
      - dataset = tokenize_text(dataset)

    returns:
      - a Pandas Series in which every article is the list of tokens produced for it
    """

    assert isinstance(data, pd.Series)
    assert token_length in ['word', 'sentence']

    # pick the NLTK splitter that matches the requested granularity
    splitters = {'word': word_tokenize, 'sentence': sent_tokenize}

    return data.apply(splitters[token_length])

In [None]:
def join_text(data) -> pd.DataFrame:
    """
    Undo a tokenization: glue the tokens of every article back together.

    parameters:
      - data: a Pandas Series in which every element is a list of strings

    usage:
      - dataset = join_text(dataset)

    returns:
      - a Pandas Series in which every element is the tokens of that article joined by single spaces
    """

    assert isinstance(data, pd.Series)
    assert all(isinstance(tokens, list) for tokens in data)

    # one string per article, tokens separated by a space
    return data.apply(lambda tokens: ' '.join(tokens))

In [None]:
def remove_nonstems(data, print_time=False) -> pd.DataFrame:
    """
    Reduce every word of every article to its Porter stem.

    parameters:
      - data: a Pandas Series containing all of the text
      - print_time: boolean specifying whether or not the amount of time the function took should be printed

    usage:
      - dataset = remove_nonstems(data)

    returns:
      - a Pandas Series of strings in which every word token has been replaced by its stem
        and the tokens are joined by single spaces
    """

    assert isinstance(data, pd.Series)

    # remember when the work started, if the duration is to be reported
    started_at = time.time() if print_time else None

    stemmer = PorterStemmer()

    # article -> list of word tokens -> list of stems -> single string
    tokenized = tokenize_text(data)
    stemmed = tokenized.apply(lambda words: [stemmer.stem(token) for token in words])
    rejoined = join_text(stemmed)

    if print_time:
        print(f'it took {round(time.time() - started_at, 2)} seconds to remove nonstems')

    return rejoined

In [None]:
 def clean_text(data, punctuation=True, stopwords=True, uppercase=True, stemming=True, print_time=False) -> pd.Series:
    """
    parameters:
      - data: a Pandas Series containing all of the text
      - punctuation: boolean value specifying whether or not to remove all punctuation from the text
      - stopwords: boolean value specifying whether or not to remove all stopwords from the text
      - uppercase: boolean value specifying whether or not to change all text to lowercase
      - stemming: boolean value specifying whether or not to truncate all words in the text to their root word
      - print_time: boolean specifying whether or not the run time of every step should be printed

    usage:
      - dataset.text = clean_text(dataset.text, print_time=True)

    returns
      - the inputed data with all of the specified cleaning functions applied

    notes:
      - the enabled steps run in the order lowercase -> stopwords -> punctuation -> stemming:
        the stop word list is lowercase and contains contractions such as "don't", so stop words
        can only be matched reliably after lowercasing and before the apostrophes are stripped
    """

    assert isinstance(data, pd.Series)

    # (run this step?, cleaning function, description used in the timing message), in execution order
    cleaning_steps = [
        (uppercase,   remove_uppercase,   'uppercases'),
        (stopwords,   remove_stopwords,   'stopwords'),
        (punctuation, remove_punctuation, 'punctuation'),
        (stemming,    remove_nonstems,    'nonstems'),
    ]

    for run, function, description in cleaning_steps:
        if not run:
            continue

        if print_time:
            start_time = time.time()

        data = function(data)

        if print_time:
            print(f'it took {round(time.time() - start_time, 2)} seconds to remove {description}')

    return data

In [None]:
def encode_labels(labels) -> pd.Series:
    """
    Map the class labels onto consecutive integers.

    parameters:
      - labels: a pandas series of class labels

    usage:
      - training_set.label = encode_labels(training_set.label)

    returns:
      - the result of fitting an sklearn LabelEncoder on the labels and transforming them,
        i.e. the labels encoded between the values 0 and num_of_classes - 1
    """

    assert isinstance(labels, pd.Series)

    # a fresh encoder is fitted on every call, so the mapping depends only on the labels passed in
    encoder = preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(labels)

    return encoded

In [None]:
def obtain_vocabulary(training_data, num_words=VOCAB_SIZE, oov_token=OOV) -> Tokenizer:
    """
    Build a Keras Tokenizer from the training articles.

    parameters:
      - training_data: a Pandas Series of training articles to create the vocabulary from
      - num_words: the size of the vocabulary, passed on to the Tokenizer
      - oov_token: string value to use for words not in the vocabulary, i.e. Out Of Vocabulary

    usage:
      - vocabulary = obtain_vocabulary(train_x)

    returns:
      - a Tokenizer object fit on the training data
    """

    assert isinstance(training_data, pd.Series)

    tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
    # fitting collects the words of the training articles
    tokenizer.fit_on_texts(training_data)

    return tokenizer

In [None]:
def truncate_vocabulary(vocab, vocab_size=VOCAB_SIZE) -> dict:
    """
    parameters:
      - vocab: a dictionary of the entire vocabulary fitted off of the training data (word -> index)
      - vocab_size: the maximum number of entries allowed in the returned dictionary, padding token included

    usage:
      - vocabulary.word_index = truncate_vocabulary(vocabulary.word_index)

    returns:
      - a new dictionary that maps PAD_VALUE to 0 and the first vocab_size - 1 other words of vocab,
        in the iteration order of vocab, to 1, 2, 3, ...
      - the dictionary is smaller than vocab_size when vocab does not contain enough words
      - vocab itself is left unmodified
    """

    assert isinstance(vocab, dict)

    # the padding token always lives at index 0
    truncated_vocab = {PAD_VALUE: 0}

    # every word except the padding token, which may or may not occur in the fitted vocabulary
    words = (word for word in vocab if word != PAD_VALUE)

    # zip stops at the shorter input: at most vocab_size - 1 words, fewer if vocab runs out
    for index, word in zip(range(1, vocab_size), words):
        truncated_vocab[word] = index

    return truncated_vocab

In [None]:
def preprocess_text(data, vocab, max_length=TEXT_MAX_LENGTH, dtype=object, padding=PADDING, truncating=TRUNCATING, value=PAD_VALUE) -> pd.Series:
    """
    Convert every article to vocabulary indices, pad or truncate it to max_length entries,
    and convert the indices back into a space separated string of words.

    parameters:
      - data: a Pandas Series with the text data to preprocess
      - vocab: the fitted tokenizer; its texts_to_sequences method and word_index dictionary are used
      - max_length: the length every sequence is truncated or padded to
      - dtype: the type of the padded sequences, passed on to pad_sequences
          - possible dtype values:
            > int32
            > object
      - padding: string value representing whether to pad before or after each text sequence
          - possible padding values:
            > 'pre'
            > 'post'
      - truncating: string value representing whether to remove words before or after each text sequence if the sequence is longer than max_length
          - possible truncating values:
            > 'pre'
            > 'post'
      - value: the value to be used for padding, passed on to pad_sequences

    usage:
      - training_set.text = preprocess_text(training_set.text, vocabulary)

    returns:
      - a new pandas series (built from a list, so it does not carry the index of data) where each
        element is one text sequence of max_length entries joined by spaces
      - every entry that has no match in the reversed word_index is written as '?'
    """

    assert isinstance(data, pd.Series)

    # convert each text sequence to its corresponding vocabulary mapping
    data = vocab.texts_to_sequences(data)
    # truncate or pad each sequence to the max length
    data = pd.Series(list(pad_sequences(data, maxlen=max_length, dtype=dtype, padding=padding, truncating=truncating, value=value)))

    # index -> word lookup: the reverse of all key, value pairs in vocab.word_index
    # (the loop variable named `value` is local to the comprehension and does not touch the parameter)
    reverse_vocab = dict([(value, key) for (key, value) in vocab.word_index.items()])

    # NOTE(review): with the default string pad value the padded positions are presumably not keys of
    # reverse_vocab and therefore come out as '?' rather than as the padding token -- confirm this is intended
    return data.apply(lambda x: ' '.join([reverse_vocab.get(word, '?') for word in x]))

In [None]:
def min_max_normalization(data, lower_bound=0, upper_bound=1):
    """
    Rescale the values of all rows linearly into the range [lower_bound, upper_bound].

    parameters:
      - data: a pandas series whose elements are themselves collections of numbers;
              the element-wise arithmetic presumably expects numpy arrays (TODO confirm)
      - lower_bound: the lower bound of the range to rescale to
      - upper_bound: the upper bound of the range to rescale to

    returns:
      - a pandas series containing the normalized data; the minimum and maximum are taken
        over the values of all rows together, not per row
    """

    assert isinstance(data, pd.Series)

    # global extremes across every row
    min_value = min(min(row) for row in data)
    max_value = max(max(row) for row in data)

    target_width = upper_bound - lower_bound
    source_width = max_value - min_value

    def rescale(row):
        return lower_bound + ((row - min_value) * target_width)/source_width

    return data.apply(rescale)

# Feature Selection


In [None]:
def get_count_vectorizer(vocabulary, decode_error='strict', analyzer='word', max_df=1, min_df=1, max_features=VOCAB_SIZE, ngram_range=(1, 1)) -> sk.feature_extraction.text.CountVectorizer:
    """
    parameters:
      - vocabulary: a predetermined vocabulary, as a dictionary mapping each term to its column index
      - decode_error: determines what to do if a byte sequence is found that is not of 'utf-8'
          - possible decode_errors:
            > 'strict' - throw UnicodeDecodeError
            > 'ignore'
            > 'replace'
      - analyzer: determines what the features should be created from
          - possible analyzers:
            > 'word'
            > 'char'
            > 'char_wb'
      - max_df: ignore terms with a document frequency strictly higher than the given threshold
          - possible max_df values:
            > float in range [0, 1]
            > int
      - min_df: ignore terms with a document frequency strictly lower than the given threshold
          - possible min_df values:
            > float in range [0, 1]
            > int
      - max_features: the max size of the vocabulary
      - ngram_range: lower and upper bounds for the ngram
          - possible ngram_range values:
            > (min_n, max_n), i.e. (1, 1) is only unigrams, (1, 2) is unigrams and bigrams

    usage:
      - vectorizer = get_count_vectorizer(vocabulary.word_index)

    returns:
      - an unfitted sklearn count vectorizer with the specified parameters

    notes:
      - according to the scikit-learn documentation max_df, min_df and max_features are ignored
        when a vocabulary is supplied, which is always the case here
    """

    assert isinstance(vocabulary, dict)

    return CountVectorizer(decode_error=decode_error,
                           ngram_range=ngram_range,
                           analyzer=analyzer,
                           max_df=max_df,
                           min_df=min_df,
                           max_features=max_features,
                           vocabulary=vocabulary)

In [None]:
def get_tfidf_vectorizer(data, vocabulary, decode_error='strict', ngram_range=(1, 1), analyzer='word', max_df=1, min_df=1, max_features=VOCAB_SIZE) -> sk.feature_extraction.text.TfidfVectorizer:
    """
    parameters:
      - data: a Pandas Series of training data to fit the tfidf vectorizer off of
      - vocabulary: a predetermined vocabulary, as a dictionary mapping each term to its column index
      - decode_error: determines what to do if a byte sequence is found that is not of 'utf-8'
          - possible decode_errors:
            > 'strict' - throw UnicodeDecodeError
            > 'ignore'
            > 'replace'
      - ngram_range: lower and upper bounds for the ngram
          - possible ngram_range values:
            > (min_n, max_n), i.e. (1, 1) is only unigrams, (1, 2) is unigrams and bigrams
      - analyzer: determines what the features should be created from
          - possible analyzers:
            > 'word'
            > 'char'
            > 'char_wb'
      - max_df: ignore terms with a document frequency strictly higher than the given threshold
          - possible max_df values:
            > float in range [0, 1]
            > int
      - min_df: ignore terms with a document frequency strictly lower than the given threshold
          - possible min_df values:
            > float in range [0, 1]
            > int
      - max_features: the max size of the vocabulary

    usage:
      - tfidf_vectorizer = get_tfidf_vectorizer(train_x, vocabulary.word_index)

    returns:
      - an sklearn tfidf vectorizer with the specified parameters, already fitted on data
    """

    assert isinstance(data, pd.Series)
    assert isinstance(vocabulary, dict)

    # ngram_range is forwarded too; it used to be accepted by this function but silently dropped
    vectorizer = TfidfVectorizer(decode_error=decode_error,
                                 ngram_range=ngram_range,
                                 analyzer=analyzer,
                                 max_df=max_df,
                                 min_df=min_df,
                                 max_features=max_features,
                                 vocabulary=vocabulary)
    vectorizer.fit(data)

    return vectorizer

In [None]:
def get_word2vec_model(data, min_count=1, size=WORD_SIZE, workers=1) -> gensim.models.word2vec.Word2Vec:
    """
    Train a word2vec model on the word tokens of the given articles.

    parameters:
      - data: a Pandas Series with the training text to fit the model off of
      - min_count: the frequency threshold for words, passed on to Word2Vec
      - size: the number of dimensions to represent each word
      - workers: number of worker threads, passed on to Word2Vec

    usage:
      - word2vec_model = get_word2vec_model(train_x)

    returns:
      - a word2vec model trained off of the inputed data
    """

    assert isinstance(data, pd.Series)

    # every article becomes a list of word tokens
    sentences = tokenize_text(data, 'word')

    # NOTE(review): `size` is the keyword of older gensim releases; newer releases are understood to
    # call it `vector_size` -- confirm against the installed version
    return Word2Vec(sentences, min_count=min_count, size=size, workers=workers)

In [None]:
def get_word2vec_embeddings(data, model_word2vec) -> np.ndarray:
    """
    Look the word tokens of every article up in a word2vec model.

    parameters:
      - data: a Pandas Series with the text to convert to word embeddings
      - model_word2vec: the word2vec model to use when converting the text

    usage:
      - training_data = get_word2vec_embeddings(train_x, model_word2vec)

    returns:
      - a Pandas Series holding, for every article, element 0 of the lookup of its word tokens
        in model_word2vec.wv
    """

    assert isinstance(data, pd.Series)

    # split all text into a tokenized list of words
    tokenized = tokenize_text(data, 'word')

    def first_lookup_row(tokens):
        # NOTE(review): the [0] keeps only the first row of the lookup, presumably the vector of the
        # first token -- confirm that discarding the remaining tokens is intended
        return model_word2vec.wv[tokens][0]

    return tokenized.apply(first_lookup_row)

In [None]:
def get_doc2vec_model(data, min_count=1, vector_size=DOC_SIZE, workers=1) -> Doc2Vec:
    """
    parameters:
      - data: the training documents to fit the model off of; this is passed straight to Doc2Vec,
              so it is presumably the list of TaggedDocument objects produced by tag_text (TODO confirm)
      - min_count: the frequency threshold for words, passed on to Doc2Vec
      - vector_size: the number of dimensions to represent each document
      - workers: number of worker threads, passed on to Doc2Vec

    usage:
      - doc2vec_model = get_doc2vec_model(tag_text(train_x))

    returns:
      - a doc2vec model trained off of the inputed data
    """

    # no isinstance check on purpose: the input is not a Pandas Series here
    return Doc2Vec(data, min_count=min_count, vector_size=vector_size, workers=workers)

In [None]:
def get_doc2vec_embeddings(data, model_doc2vec) -> np.ndarray:
    """
    Infer a doc2vec vector for every article.

    parameters:
      - data: a Pandas Series with the text to convert to document embeddings
      - model_doc2vec: the doc2vec model whose infer_vector method is applied to every article

    usage:
      - training_data = get_doc2vec_embeddings(train_x, model_doc2vec)

    returns:
      - a Pandas Series holding the result of infer_vector for the word tokens of every article
    """

    assert isinstance(data, pd.Series)

    # infer_vector is handed the list of word tokens of each article
    tokenized = tokenize_text(data, 'word')

    return tokenized.apply(lambda tokens: model_doc2vec.infer_vector(tokens))

In [None]:
def tag_text(data, labels=[]) -> list:
    """
    Wrap every article in a gensim TaggedDocument.

    parameters:
      - data: a Pandas Series containing all of the text
      - labels: the tag to attach to each article, in the same order as data;
                when empty, the index of data is used instead

    usage:
      - tagged_train = tag_text(train_x)

    returns:
      - a list with one TaggedDocument per article: the words are the article split on whitespace
        and the tags are a one-element list holding the article's label
    """

    assert isinstance(data, pd.Series)

    # fall back to the index of the series when no labels were supplied
    tags = labels if len(labels) else data.index

    return [TaggedDocument(words=text.split(), tags=[tag]) for text, tag in zip(data, tags)]

# Model Evaluation

In [None]:
def evaluate_model(model, test_data, test_labels):
    """
    parameters:
      - model: the desired model to evaluate; model.evaluate must return exactly two values
               (presumably loss and accuracy, depending on how the model was compiled -- TODO confirm)
      - test_data: the test set to evaluate
      - test_labels: the corresponding class labels of the test set

    usage:
      - loss, accuracy = evaluate_model(model, test_x, test_y)

    returns:
      - loss: the loss of the model
      - accuracy: the accuracy of the model
      - precision, recall and the weighted f-score are printed but not returned
    """

    loss, accuracy = model.evaluate(x=test_data, y=test_labels)

    if hasattr(model, 'predict_classes'):
        yhat_classes = model.predict_classes(test_data, verbose=0)
    else:
        # newer tf.keras models no longer offer predict_classes; for a single probability
        # output the equivalent is thresholding the prediction at 0.5
        yhat_classes = (model.predict(test_data, verbose=0) > 0.5).astype('int32')
    # reduce to 1d array
    yhat_classes = yhat_classes[:, 0]

    # precision tp / (tp + fp), recall tp / (tp + fn)
    precision = precision_score(test_labels, yhat_classes, zero_division=1)
    recall = recall_score(test_labels, yhat_classes, zero_division=1)
    fscore_weighted = weighted_fscore(.5, precision, recall)

    print('Loss: %f' % loss)
    print('Accuracy: %f' % accuracy)
    print('Precision: %f' % precision)
    print('Recall: %f' % recall)
    print('fscore: %f' % fscore_weighted)

    return loss, accuracy

In [None]:
def weighted_fscore(weight, precision, recall) -> float:
    """
    parameters:
      - weight: the beta of the F-beta score; values below 1 favour precision, values above 1 favour recall
      - precision: number, the precision of the model
      - recall: number, the recall of the model

    usage:
      - fscore_weighted = weighted_fscore(.5, precision, recall)

    returns:
      - the calculated weighted f_score of the model:
        (1 + weight^2) * precision * recall / (weight^2 * precision + recall)
      - 0.0 when the denominator is zero (e.g. precision and recall are both 0)
    """

    beta_squared = weight**2
    denominator = (precision*beta_squared) + recall

    # avoid dividing by zero when there is nothing to average
    if denominator == 0:
        return 0.0

    return (1 + beta_squared)*((precision*recall)/denominator)

# Dataset Preparation

In [None]:
# obtain the raw dataset: the three CSV sources merged into one frame with 'text' and 'label' columns
dataset = download_dataset()

# clean the dataset
# (every cleaning step is switched off in this call, so the text is passed through unchanged)
dataset.text = clean_text(dataset.text, punctuation=False, stopwords=False, uppercase=False, stemming=False, print_time=True)

# remove all duplicate rows (currently disabled)
#dataset = dataset.drop_duplicates(subset=['text'])

# encode the labels as consecutive integers
dataset.label = encode_labels(dataset.label)

In [None]:
# create a training, validation and test set, asking split_dataset for a stratified split
# NOTE(review): presumably sized by TRAINING_SIZE / VALIDATION_SIZE / TEST_SIZE -- confirm in split_dataset
train_x, train_y, validation_x, validation_y, test_x, test_y = split_dataset(dataset, stratify=True)

In [None]:
# create the vocabulary from the training set only, limited to VOCAB_SIZE words
vocabulary = obtain_vocabulary(train_x, num_words=VOCAB_SIZE)
# replace the word index with the truncated one returned by truncate_vocabulary
vocabulary.word_index = truncate_vocabulary(vocabulary.word_index, vocab_size=VOCAB_SIZE)

In [None]:
# run every split through preprocess_text with the same settings: each article is truncated or
# padded to TEXT_MAX_LENGTH (presumably out-of-vocabulary words become the OOV token -- see preprocess_text)
train_x, validation_x, test_x = [
    preprocess_text(split, vocabulary, max_length=TEXT_MAX_LENGTH, padding=PADDING, truncating=TRUNCATING)
    for split in (train_x, validation_x, test_x)
]

# Count Model

In [None]:
# create a count vectorizer to count occurrences of each word in the vocabulary
count_vectorizer = get_count_vectorizer(vocabulary.word_index)
# NOTE(review): this clears `stop_words_` (trailing underscore) while the TF-IDF section clears
# `stop_words` (no underscore) -- confirm which attribute is intended
count_vectorizer.stop_words_ = None

# convert the training, validation, and test sets into count variants (converted with .toarray())
count_training_data   = count_vectorizer.transform(train_x).toarray()
count_validation_data = count_vectorizer.transform(validation_x).toarray()
count_test_data       = count_vectorizer.transform(test_x).toarray()

In [None]:
# define the model parameters
INPUT_SIZE = len(count_training_data[0])  # length of the first count row
EMBED_SIZE = 128
DROPOUT_RATE = .5
CONSTRAINT = None

EPOCHS = 5
STEPS_PER_EPOCH = 15
# number of training rows handed to each fit() call; the training loop reads BATCH_SIZE,
# which this cell never defined before (it only defined SIZE)
BATCH_SIZE = 2500
# original name of this setting, kept so anything still reading SIZE keeps working
SIZE = BATCH_SIZE

# parameters used for naming the model
MODEL_TYPE = 'BiLSTM'
FEATURE_TYPE = 'Count'

# if the models performance is higher than this accuracy --> save it
ACCEPTABLE_ACCURACY = .70

In [None]:
# assemble the classifier one layer at a time: embedding -> dropout -> bidirectional LSTM -> sigmoid output
count_model = keras.models.Sequential()
count_model.add(
    keras.layers.Embedding(
        INPUT_SIZE,
        EMBED_SIZE,
        embeddings_constraint=CONSTRAINT,
        mask_zero=True,
        input_shape=[None],
    )
)
count_model.add(keras.layers.Dropout(DROPOUT_RATE))
count_model.add(keras.layers.Bidirectional(keras.layers.LSTM(EMBED_SIZE)))
count_model.add(keras.layers.Dense(1, activation='sigmoid'))

# binary cross-entropy loss with the Adagrad optimizer, tracking accuracy
count_model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# create a dictionary to store the training loss/accuracy of every fit() call and the
# validation loss/accuracy of every epoch (stored in count_cache, NOT in count_model)
count_cache = {'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': []}

# iterate through each epoch, numbered 1..EPOCHS inclusive
for EPOCH in range(1, EPOCHS + 1):
    # establish the lower and upper bounds of the batch size
    LOWER_BOUND, UPPER_BOUND = 0, min(BATCH_SIZE, len(count_training_data))

    # cycle through the batches
    while LOWER_BOUND < len(count_training_data):
        # fit the model to the current batch; fit() returns a History object whose
        # .history dict holds one value per epoch for every tracked metric
        history = count_model.fit( x=count_training_data[LOWER_BOUND:UPPER_BOUND],
                                   y=train_y[LOWER_BOUND:UPPER_BOUND],
                                   epochs=1,
                                   steps_per_epoch=STEPS_PER_EPOCH,
                                   verbose=PROGRESS_BAR )
        # add the training loss and accuracy to the cache
        count_cache['train_loss'].append(history.history['loss'][-1])
        count_cache['train_accuracy'].append(history.history['accuracy'][-1])

        # update the batch bounds (slicing past the end of the data is safe)
        LOWER_BOUND, UPPER_BOUND = UPPER_BOUND, UPPER_BOUND + BATCH_SIZE

    # obtain the loss and accuracy of the model on the validation set
    loss, accuracy = count_model.evaluate(x=count_validation_data,
                                          y=validation_y)

    # add the validation loss and accuracy to the cache
    count_cache['val_loss'].append(loss)
    count_cache['val_accuracy'].append(accuracy)

    # save the model if it performed higher than ACCEPTABLE_ACCURACY
    if accuracy >= ACCEPTABLE_ACCURACY:
        MODEL_NAME = MODEL_TYPE + FEATURE_TYPE + 'ModelEpoch' + str(EPOCH) + '.h5'
        count_model.save_weights(filepath=MODEL_PATH + MODEL_NAME, save_format='h5')

In [None]:
# evaluate the model on the test set (test features must be scored against the test labels)
count_model.evaluate(x=count_test_data, y=test_y)

In [None]:
# print the loss, accuracy, precision, recall and weighted F-score of the count model on the test set
evaluate_model(count_model, count_test_data, test_y)

# TF-IDF Model

In [None]:
# create a tf-idf vectorizer from the training set and the vocabulary
# (tf-idf = term frequency multiplied by the inverse document frequency of each word)
tfidf_vectorizer = get_tfidf_vectorizer(train_x, vocabulary.word_index)
# NOTE(review): this clears `stop_words` (no underscore) while the count section clears
# `stop_words_` (trailing underscore) -- confirm which attribute is intended
tfidf_vectorizer.stop_words = None

# convert the training, validation, and test set into tf-idf variants (converted with .toarray())
tfidf_training_data   = tfidf_vectorizer.transform(train_x).toarray()
tfidf_validation_data = tfidf_vectorizer.transform(validation_x).toarray()
tfidf_test_data       = tfidf_vectorizer.transform(test_x).toarray()

In [None]:
# define the model parameters
INPUT_SIZE = len(tfidf_training_data[0])  # length of the first tf-idf row
EMBED_SIZE = 128      # used both as the embedding width and as the LSTM unit count
DROPOUT_RATE = .5
CONSTRAINT = None     # passed as embeddings_constraint; None means no constraint is supplied

EPOCHS = 4
STEPS_PER_EPOCH = 8   # forwarded to fit() as steps_per_epoch
BATCH_SIZE = 1250     # number of training rows handed to each fit() call

# parameters used for naming the model
MODEL_TYPE = 'BiLSTM'
FEATURE_TYPE = 'Tfidf'

# if the model's validation accuracy reaches this value --> save its weights
ACCEPTABLE_ACCURACY = .70

In [None]:
# assemble the classifier one layer at a time: embedding -> dropout -> bidirectional LSTM -> sigmoid output
tfidf_model = keras.models.Sequential()
tfidf_model.add(
    keras.layers.Embedding(
        INPUT_SIZE,
        EMBED_SIZE,
        embeddings_constraint=CONSTRAINT,
        mask_zero=True,
        input_shape=[None],
    )
)
tfidf_model.add(keras.layers.Dropout(DROPOUT_RATE))
tfidf_model.add(keras.layers.Bidirectional(keras.layers.LSTM(EMBED_SIZE)))
tfidf_model.add(keras.layers.Dense(1, activation='sigmoid'))

# binary cross-entropy loss with the Adagrad optimizer, tracking accuracy
tfidf_model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# create a dictionary to store the training loss/accuracy of every fit() call and the
# validation loss/accuracy of every epoch
tfidf_cache = {'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': []}

# iterate through each epoch, numbered 1..EPOCHS inclusive
for EPOCH in range(1, EPOCHS + 1):
    # establish the lower and upper bounds of the batch size
    LOWER_BOUND, UPPER_BOUND = 0, min(BATCH_SIZE, len(tfidf_training_data))

    # cycle through the batches
    while LOWER_BOUND < len(tfidf_training_data):
        # fit the model to the current batch; fit() returns a History object whose
        # .history dict holds one value per epoch for every tracked metric
        history = tfidf_model.fit( x=tfidf_training_data[LOWER_BOUND:UPPER_BOUND],
                                   y=train_y[LOWER_BOUND:UPPER_BOUND],
                                   epochs=1,
                                   steps_per_epoch=STEPS_PER_EPOCH,
                                   verbose=PROGRESS_BAR )
        # add the training loss and accuracy to the cache
        tfidf_cache['train_loss'].append(history.history['loss'][-1])
        tfidf_cache['train_accuracy'].append(history.history['accuracy'][-1])

        # update the batch bounds (slicing past the end of the data is safe)
        LOWER_BOUND, UPPER_BOUND = UPPER_BOUND, UPPER_BOUND + BATCH_SIZE

    # obtain the loss and accuracy of the model on the validation set
    loss, accuracy = tfidf_model.evaluate(x=tfidf_validation_data,
                                          y=validation_y)

    # add the validation loss and accuracy to the cache
    tfidf_cache['val_loss'].append(loss)
    tfidf_cache['val_accuracy'].append(accuracy)

    # save the model if it performed higher than ACCEPTABLE_ACCURACY
    if accuracy >= ACCEPTABLE_ACCURACY:
        MODEL_NAME = MODEL_TYPE + FEATURE_TYPE + 'ModelEpoch' + str(EPOCH) + '.h5'
        tfidf_model.save_weights(filepath=MODEL_PATH + MODEL_NAME, save_format='h5')

In [None]:
# evaluate the model on the test set; the value returned by evaluate() is kept in `x`
x = tfidf_model.evaluate(x=tfidf_test_data, y=test_y)

In [None]:
# print the loss, accuracy, precision, recall and weighted F-score of the tf-idf model on the test set
evaluate_model(tfidf_model, tfidf_test_data, test_y)

# Word2Vec Model

In [None]:
# defines the embedding size
WORD2VEC_SIZE = 150

# create the Word2Vec model from the training set only
word2vec_model = get_word2vec_model(train_x, size=WORD2VEC_SIZE)

# transform the training, validation, and test set into their corresponding Word2Vec embeddings
word2vec_training_data   = get_word2vec_embeddings(train_x, word2vec_model)
word2vec_validation_data = get_word2vec_embeddings(validation_x, word2vec_model)
word2vec_test_data       = get_word2vec_embeddings(test_x, word2vec_model)

# obtain the minimum and maximum values from the training set;
# upper_bound is the training maximum plus the absolute value of the training minimum
max_num = max([max(x) for x in word2vec_training_data])
min_num = min([min(x) for x in word2vec_training_data])
upper_bound = max_num + abs(min_num)

# normalize the data sets to be in the range 0 -> upper_bound; all three splits use the
# bounds derived from the training set
word2vec_training_data   = np.asarray(list(min_max_normalization(word2vec_training_data, 0, upper_bound)))
word2vec_validation_data = np.asarray(list(min_max_normalization(word2vec_validation_data, 0, upper_bound)))
word2vec_test_data       = np.asarray(list(min_max_normalization(word2vec_test_data, 0, upper_bound)))

In [None]:
# define the parameters of the model
INPUT_SIZE = len(word2vec_training_data[0])  # length of the first embedded row
EMBED_SIZE = 100      # used both as the embedding width and as the LSTM unit count
DROPOUT_RATE = .5
CONSTRAINT = None     # passed as embeddings_constraint; None means no constraint is supplied

EPOCHS = 30
STEPS_PER_EPOCH = 50  # forwarded to fit() as steps_per_epoch
BATCH_SIZE = 2500     # number of training rows handed to each fit() call

# parameters used for naming the model
MODEL_TYPE = 'BiLSTM'
FEATURE_TYPE = 'Word2Vec'

# if the model's validation accuracy reaches this value --> save its weights
ACCEPTABLE_ACCURACY = .70

In [None]:
# assemble the classifier one layer at a time: embedding -> dropout -> bidirectional LSTM -> sigmoid output
# (this rebinds the name word2vec_model, which held the Word2Vec embedding model until now)
word2vec_model = keras.models.Sequential()
word2vec_model.add(
    keras.layers.Embedding(
        INPUT_SIZE,
        EMBED_SIZE,
        embeddings_constraint=CONSTRAINT,
        mask_zero=True,
        input_shape=[None],
    )
)
word2vec_model.add(keras.layers.Dropout(DROPOUT_RATE))
word2vec_model.add(keras.layers.Bidirectional(keras.layers.LSTM(EMBED_SIZE)))
word2vec_model.add(keras.layers.Dense(1, activation='sigmoid'))

# binary cross-entropy loss with the Adagrad optimizer, tracking accuracy
word2vec_model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# create a dictionary to store the training loss/accuracy of every fit() call and the
# validation loss/accuracy of every epoch
word2vec_cache = {'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': []}

# iterate through each epoch, numbered 1..EPOCHS inclusive
for EPOCH in range(1, EPOCHS + 1):
    # establish the lower and upper bounds of the batch size
    LOWER_BOUND, UPPER_BOUND = 0, min(BATCH_SIZE, len(word2vec_training_data))

    # cycle through the batches
    while LOWER_BOUND < len(word2vec_training_data):
        # fit the model to the current batch; fit() returns a History object whose
        # .history dict holds one value per epoch for every tracked metric.
        # PROGRESS_BAR is used directly because VERBOSE is only defined in a later section.
        history = word2vec_model.fit( x=word2vec_training_data[LOWER_BOUND:UPPER_BOUND],
                                      y=train_y[LOWER_BOUND:UPPER_BOUND],
                                      epochs=1,
                                      steps_per_epoch=STEPS_PER_EPOCH,
                                      verbose=PROGRESS_BAR )
        # add the training loss and accuracy to the cache
        word2vec_cache['train_loss'].append(history.history['loss'][-1])
        word2vec_cache['train_accuracy'].append(history.history['accuracy'][-1])

        # update the batch bounds (slicing past the end of the data is safe)
        LOWER_BOUND, UPPER_BOUND = UPPER_BOUND, UPPER_BOUND + BATCH_SIZE

    # obtain the loss and accuracy of the model on the validation set
    loss, accuracy = word2vec_model.evaluate(x=word2vec_validation_data,
                                             y=validation_y)

    # add the validation loss and accuracy to the cache
    word2vec_cache['val_loss'].append(loss)
    word2vec_cache['val_accuracy'].append(accuracy)

    # save the model if it performed higher than ACCEPTABLE_ACCURACY
    if accuracy >= ACCEPTABLE_ACCURACY:
        MODEL_NAME = MODEL_TYPE + FEATURE_TYPE + 'ModelEpoch' + str(EPOCH) + '.h5'
        word2vec_model.save_weights(filepath=MODEL_PATH + MODEL_NAME, save_format='h5')

In [None]:
# evaluate the model on the test set; the value returned by evaluate() is kept in `x`
x = word2vec_model.evaluate(x=word2vec_test_data, y=test_y)

In [None]:
# print the loss, accuracy, precision, recall and weighted F-score of the Word2Vec model on its
# own test features (the bare name `test_data` is not defined anywhere in the notebook's top level)
evaluate_model(word2vec_model, word2vec_test_data, test_y)

# Doc2Vec Model

In [None]:
# defines the embedding size
DOC2VEC_SIZE = 150

# creates the Doc2Vec model from the tagged training set only
doc2vec_model = get_doc2vec_model(tag_text(train_x), vector_size=DOC2VEC_SIZE)

# transform the training, validation, and test set into their corresponding Doc2Vec embeddings
doc2vec_training_data   = get_doc2vec_embeddings(train_x, doc2vec_model)
doc2vec_validation_data = get_doc2vec_embeddings(validation_x, doc2vec_model)
doc2vec_test_data       = get_doc2vec_embeddings(test_x, doc2vec_model)

# obtain the minimum and maximum values from the training set;
# upper_bound is the training maximum plus the absolute value of the training minimum
max_num = max([max(x) for x in doc2vec_training_data])
min_num = min([min(x) for x in doc2vec_training_data])
upper_bound = max_num + abs(min_num)

# normalize the data sets to be in the range 0 -> upper_bound; all three splits use the
# bounds derived from the training set
doc2vec_training_data   = np.asarray(list(min_max_normalization(doc2vec_training_data, 0, upper_bound)))
doc2vec_validation_data = np.asarray(list(min_max_normalization(doc2vec_validation_data, 0, upper_bound)))
doc2vec_test_data       = np.asarray(list(min_max_normalization(doc2vec_test_data, 0, upper_bound)))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# define the parameters of the model
INPUT_SIZE = len(doc2vec_training_data[0])  # length of the first embedded row
EMBED_SIZE = 100      # used both as the embedding width and as the LSTM unit count
DROPOUT_RATE = .5
CONSTRAINT = None     # passed as embeddings_constraint; None means no constraint is supplied

# verbosity forwarded to fit(); PROGRESS_BAR is declared in the Parameters section
VERBOSE = PROGRESS_BAR

EPOCHS = 30
STEPS_PER_EPOCH = 50  # forwarded to fit() as steps_per_epoch
BATCH_SIZE = 2500     # number of training rows handed to each fit() call

# parameters used for naming the model; the embedding size is appended to the feature name
MODEL_TYPE = 'BiLSTM'
FEATURE_TYPE = 'Doc2Vec' + str(DOC2VEC_SIZE)

# if the model's validation accuracy reaches this value --> save its weights
ACCEPTABLE_ACCURACY = .70

In [None]:
# assemble the classifier one layer at a time: embedding -> dropout -> bidirectional LSTM -> sigmoid output
# (this rebinds the name doc2vec_model, which held the Doc2Vec embedding model until now)
doc2vec_model = keras.models.Sequential()
doc2vec_model.add(
    keras.layers.Embedding(
        INPUT_SIZE,
        EMBED_SIZE,
        embeddings_constraint=CONSTRAINT,
        mask_zero=True,
        input_shape=[None],
    )
)
doc2vec_model.add(keras.layers.Dropout(DROPOUT_RATE))
doc2vec_model.add(keras.layers.Bidirectional(keras.layers.LSTM(EMBED_SIZE)))
doc2vec_model.add(keras.layers.Dense(1, activation='sigmoid'))

# binary cross-entropy loss with the Adagrad optimizer, tracking accuracy
doc2vec_model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# create a dictionary to store the training loss/accuracy of every fit() call and the
# validation loss/accuracy of every epoch
doc2vec_cache = {'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': []}

# iterate through each epoch, numbered 1..EPOCHS inclusive
for EPOCH in range(1, EPOCHS + 1):
    # establish the lower and upper bounds of the batch size
    LOWER_BOUND, UPPER_BOUND = 0, min(BATCH_SIZE, len(doc2vec_training_data))

    # cycle through the batches
    while LOWER_BOUND < len(doc2vec_training_data):

        # fit the model to the batch; fit() returns a History object whose
        # .history dict holds one value per epoch for every tracked metric
        history = doc2vec_model.fit( x=doc2vec_training_data[LOWER_BOUND:UPPER_BOUND],
                                     y=train_y[LOWER_BOUND:UPPER_BOUND],
                                     epochs=1,
                                     steps_per_epoch=STEPS_PER_EPOCH,
                                     verbose=VERBOSE )
        # add the training loss and accuracy to the cache
        doc2vec_cache['train_loss'].append(history.history['loss'][-1])
        doc2vec_cache['train_accuracy'].append(history.history['accuracy'][-1])

        # update the batch bounds (slicing past the end of the data is safe)
        LOWER_BOUND, UPPER_BOUND = UPPER_BOUND, UPPER_BOUND + BATCH_SIZE

    # obtain the loss and accuracy of the model on the validation set
    loss, accuracy = doc2vec_model.evaluate(x=doc2vec_validation_data,
                                            y=validation_y)

    # add the validation loss and accuracy to the cache
    doc2vec_cache['val_loss'].append(loss)
    doc2vec_cache['val_accuracy'].append(accuracy)

    # save the model if it performed higher than ACCEPTABLE_ACCURACY
    if accuracy >= ACCEPTABLE_ACCURACY:
        MODEL_NAME = MODEL_TYPE + FEATURE_TYPE + 'ModelEpoch' + str(EPOCH) + '.h5'
        doc2vec_model.save_weights(filepath=MODEL_PATH + MODEL_NAME, save_format='h5')

In [None]:
# evaluate the model on the test set; the value returned by evaluate() is kept in `x`
x = doc2vec_model.evaluate(x=doc2vec_test_data, y=test_y)



In [None]:
# print the loss, accuracy, precision, recall and weighted F-score of the Doc2Vec model on its
# own test features (the bare name `test_data` is not defined anywhere in the notebook's top level)
evaluate_model(doc2vec_model, doc2vec_test_data, test_y)

Precision: 0.576555
Recall: 0.228942
fscore: 0.295079


# Data Visualization Methods



In [None]:
def get_average_article_length(training_data) -> float:
    """
    parameters:
      - training_data: a sized collection of articles, each of which supports len()

    usage:
      - average_length = get_average_article_length(training_data)

    returns:
      - the mean of len(article) over every article, or 0.0 when there are no articles
    """

    article_count = len(training_data)

    # an empty collection would otherwise raise ZeroDivisionError
    if article_count == 0:
        return 0.0

    return sum(len(article) for article in training_data)/article_count

In [None]:
def draw_boxplot(data):
    """
    Draw a single boxplot of `data` on figure 1, labelled 'length' on the x axis.

    parameters:
      - data: the sequence of numeric values to summarise in the boxplot

    usage:
      - draw_boxplot(data)
    """

    # Create a figure instance
    fig = plt.figure(1, figsize=(4,10))
    # Create an axes instance
    ax = fig.add_subplot(111)

    # Create the boxplot
    ax.boxplot([data], patch_artist=True)

    # Label the box only after drawing it: boxplot() installs its own tick labels, so a label
    # set beforehand is overwritten. The label is passed as a list because a bare string is
    # treated as a sequence of one-character labels. The single box is drawn at x = 1.
    ax.set_xticks([1])
    ax.set_xticklabels(['length'])

In [None]:
def draw_model_performance(history, metrics, label='accuracy'):
    """
    Plot one curve per requested metric and show the figure.

    parameters:
      - history: a mapping from metric name to the sequence of recorded values
      - metrics: a single metric name, or a list/tuple of metric names, to plot
      - label: either 'accuracy' or 'loss'; used in the title and as the y-axis label

    usage:
      - draw_model_performance(history, ['train_accuracy', 'val_accuracy'], label='accuracy')

    raises:
      - AssertionError: when a requested metric is missing from history, or label
        is neither 'accuracy' nor 'loss'
    """
    # accept a bare metric name as well as a list or tuple of names
    if isinstance(metrics, (list, tuple)):
        metrics = list(metrics)
    else:
        metrics = [metrics]

    assert all(metric in history for metric in metrics)

    assert label in ['accuracy', 'loss']

    for metric in metrics:
        plt.plot(history[metric])

    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    # name every curve after the metric it shows; a fixed ['train', 'test'] legend left any
    # third curve unlabelled and mislabelled validation curves
    plt.legend([str(metric) for metric in metrics], loc='upper left')
    plt.show()