In [None]:
from pathlib import Path

import gensim
import numpy
import tensorflow
from pandas import read_json
from sklearn.metrics import roc_curve, balanced_accuracy_score, auc
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB

# --- Input location and experiment settings ---------------------------------
THREAT_TWEETS_JSON = Path('data/threat.tweets.json')
CV = 10            # passed as ``cv`` to cross_val_score
RANDOM_STATE = 42  # seed for train_test_split
TEST_SIZE = 0.2    # fraction of samples held out by train_test_split

# --- Result containers -------------------------------------------------------
# GNB_DICT collects the Naive Bayes metrics and MODELS maps a model label to
# such a dict. ACCURACIES and WORD_EMBEDDINGS are not used in this section
# (presumably filled elsewhere in the notebook -- TODO confirm).
ACCURACIES = {}
GNB_DICT = {}
MODELS = {}
WORD_EMBEDDINGS = []

# --- Heavy resources ---------------------------------------------------------
TOKENIZER = tensorflow.keras.preprocessing.text.Tokenizer()
# Pre-trained word2vec vectors stored in the binary word2vec format.
WORD2VEC = gensim.models.KeyedVectors.load_word2vec_format(
    'models/word2vec/GoogleNews-vectors-negative300.bin', binary=True
)

# Load the tweets and fit the tokenizer vocabulary on their 'text' column.
DATA = read_json(THREAT_TWEETS_JSON)
TOKENIZER.fit_on_texts(DATA['text'])


def embed_text(words, embedder, dim):
    """Return the mean embedding vector of a piece of text.

    Args:
        words: Either an iterable of string tokens or a single string. A
            string is split on whitespace first; without this, iterating it
            would yield single characters and every lookup would be made
            per character instead of per word.
        embedder: Object exposing a ``key_to_index`` mapping for membership
            tests and ``embedder[token]`` for vector lookup.
        dim: Length of the zero vector used for unknown tokens and for
            empty input; should match the embedder's vector length.

    Returns:
        A 1-D numpy array: the element-wise mean of the token vectors, where
        tokens missing from ``embedder`` count as zero vectors. A zero vector
        of length ``dim`` is returned when there are no tokens at all.
    """
    if isinstance(words, str):
        words = words.split()

    # numpy.mean never mutates its inputs, so one zero vector can be shared
    # by all out-of-vocabulary tokens.
    zero = numpy.zeros(dim)
    known = embedder.key_to_index
    vectors = [embedder[word] if word in known else zero for word in words]

    if not vectors:
        return zero
    return numpy.mean(vectors, axis=0)


# Feature matrix: one averaged word2vec vector per entry of DATA['text'].
# The dimensionality is read from the loaded vectors instead of being
# hard-coded, so it cannot drift from the embedding file in use.
X = numpy.array([
    embed_text(words, WORD2VEC, dim=WORD2VEC.vector_size)
    for words in DATA['text']
])
# Binary target: 1 where 'relevant' equals True, 0 otherwise.
Y = DATA['relevant'].eq(True).astype(int).to_numpy()

GNB = GaussianNB()
# Mean balanced accuracy over CV folds, computed on the full data set.
GNB_DICT['cv_score'] = numpy.average(a=cross_val_score(
    estimator=GNB,
    X=X,
    y=Y,
    scoring='balanced_accuracy',
    cv=CV
))

# Hold-out evaluation on a fixed, seeded split.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    X, Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)
GNB.fit(X=X_TRAIN, y=Y_TRAIN)
Y_PRED = GNB.predict(X=X_TEST)
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# reduce the curve to a single operating point, so the positive-class
# probability is used instead. Column 1 of predict_proba corresponds to
# GNB.classes_[1], i.e. label 1 when both classes occur in Y_TRAIN.
Y_SCORE = GNB.predict_proba(X=X_TEST)[:, 1]
FPR, TPR, _ = roc_curve(y_true=Y_TEST, y_score=Y_SCORE)

GNB_DICT['test-accuracy'] = balanced_accuracy_score(y_true=Y_TEST, y_pred=Y_PRED)
GNB_DICT['fpr'] = FPR
GNB_DICT['tpr'] = TPR
GNB_DICT['roc-auc'] = auc(x=FPR, y=TPR)
GNB_DICT['model'] = GNB
# NOTE(review): the key says '50d', but the vectors are loaded from a file
# named '...negative300.bin' -- confirm the label before renaming, since
# other code may look this key up.
MODELS['naïve-bayes-50d'] = GNB_DICT