In [1]:
%load_ext autoreload
%autoreload 2

In [171]:
import pandas as pd
import pickle
import os
import numpy as np
from nlp_surveillance.utils.my_utils import get_sentence_and_date_from_annotated_span
from nlp_surveillance.annotator import *
from nlp_surveillance.edb_clean import get_cleaned_edb
from nlp_surveillance.who_scraper import get_annotated_2018_whos
from nlp_surveillance.optimize_date_and_count import get_date_optimization_edb, _extract_sentences_from_spans
import datetime

# Testing

## WHO DF

In [None]:
parsed_whos_df = get_annotated_2018_whos()

### EDB 

In [None]:
edb = get_cleaned_edb()

# Optimize date

In [89]:
edb = get_date_optimization_edb(use_pickle=False)

HBox(children=(IntProgress(value=0, max=146), HTML(value='')))




HBox(children=(IntProgress(value=0, max=146), HTML(value='')))




In [91]:
from nlp_surveillance.utils.text_from_url import clean_text
edb['sentence'] = edb['sentence'].apply(clean_text)

In [322]:
edb = edb.reset_index(drop=True)

# Handmade Naïve Bayes

In [378]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [411]:
feature_matrix = cv.fit_transform(edb['sentence'].apply(
    lambda x: ' '.join(x)).tolist())
print(feature_matrix[0])

  (0, 1546)	1
  (0, 2713)	1
  (0, 2036)	1
  (0, 843)	1
  (0, 593)	1
  (0, 1473)	1
  (0, 1998)	1
  (0, 2483)	1
  (0, 430)	1
  (0, 1827)	1
  (0, 185)	1


In [455]:
feature_matrix.shape

(6587, 3252)

In [414]:
feature_names = cv.get_feature_names()
print(feature_names[8])

ability


In [415]:
feature_mapping = cv.vocabulary_
print(feature_mapping['ability'])

8


In [387]:
from collections import defaultdict
def get_label_index(labels):
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

In [388]:
label_index = get_label_index(edb['is_label'].tolist())

In [420]:
def get_prior(label_index):
    prior = {label: len(index) for label, index 
            in label_index.items()}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

In [421]:
prior = get_prior(dict(label_index))
print(prior)

{False: 0.9427660543494762, True: 0.05723394565052376}


In [437]:
np.asarray(feature_matrix[label_index[True], :].sum(axis=0))[0].

array([4, 2, 0, ..., 0, 0, 0], dtype=int64)

In [443]:
def get_likelihood(feature_matrix, label_index, smoothing=1):
    likelihood = {}
    for label, index in label_index.items():
        likelihood[label] = (feature_matrix[index, :].sum(axis=0)
                             + smoothing)
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood

In [444]:
likelihood = get_likelihood(feature_matrix, label_index, 1)

In [445]:
def get_posterior(feature_matrix, prior, likelihood):
    num_example = feature_matrix.shape[0]
    posteriors = []
    for i in range(num_example):
        posterior = {key: np.log(prior_label) 
                     for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            feature_matrix_vector = feature_matrix.getrow(i)
            counts = feature_matrix_vector.data
            indices = feature_matrix_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = (np.exp(posterior[label]) 
                                    - min_log_posterior)
            except:
                posterior[label] = float('inf')
            sum_posterior = sum(posterior.values())
            for label in posterior:
                if posterior[label] == float('inf'):
                    posterior[label] = 1
                else:
                    posterior[label] /= sum_posterior
            posteriors.append(posterior.copy())
    return posteriors

In [446]:
test_matrx = (edb['sentence'].iloc[200])
test = cv.transform(test_matrx)

In [447]:
get_posterior(test, prior, likelihood)

[{False: 2034.7428620443238, True: -2033.7428620443238},
 {False: 0.9960331733578045, True: 0.003966826642195497},
 {False: 1078.4785507300758, True: -1077.4785507300758},
 {False: 0.9931892151659599, True: 0.006810784834040037},
 {False: 120.22158021229004, True: -119.22158021229004},
 {False: 0.9523939127212122, True: 0.04760608727878788},
 {False: 145482.13944258034, True: -145481.13944258034},
 {False: 0.9999154035727699, True: 8.459642723007814e-05}]

# Working Naïve Bayes

In [548]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

In [555]:
sample_weight = compute_sample_weight(class_weight='balanced', y=training_label)

In [526]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

In [560]:
training_sentences = edb['sentence'].apply(lambda x: ' '.join(x))[::2]
training_label = edb['is_label'][::2]
text_clf.fit(training_sentences, training_label, clf__sample_weight=sample_weight)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [561]:
test_sentences = edb['sentence'].apply(lambda x: ' '.join(x))[1::2]
test_label = edb['is_label'][1::2]

In [562]:
predicted = text_clf.predict(test_sentences)

In [577]:
# TODO: Evaluate predict_proba
sorted(text_clf.predict_proba(test_sentences)[:,1], reverse=True)

[0.993739250448742,
 0.993739250448742,
 0.993739250448742,
 0.9915772574875437,
 0.9867407981399337,
 0.9867407981399337,
 0.9867232247066937,
 0.9805013493444121,
 0.980470159042191,
 0.980470159042191,
 0.980470159042191,
 0.980470159042191,
 0.980470159042191,
 0.980470159042191,
 0.980470159042191,
 0.979035728417167,
 0.9723175107736208,
 0.9695755856912563,
 0.9695755856912563,
 0.9653814413913978,
 0.9650757007006341,
 0.9440107799643006,
 0.9440107799643006,
 0.9380322285639732,
 0.9380322285639732,
 0.9380322285639732,
 0.9380322285639732,
 0.9359850810937785,
 0.9359850810937785,
 0.9359850810937785,
 0.9331043404576872,
 0.931305676902902,
 0.9298789945912136,
 0.9298789945912136,
 0.9295855885698763,
 0.9295855885698763,
 0.929153595743018,
 0.929153595743018,
 0.929153595743018,
 0.929153595743018,
 0.929153595743018,
 0.929153595743018,
 0.929153595743018,
 0.924713946008333,
 0.916791261924684,
 0.916791261924684,
 0.914730953334412,
 0.914730953334412,
 0.9125267689737

In [563]:
from sklearn import metrics
print(metrics.classification_report(test_label, predicted))

              precision    recall  f1-score   support

       False       0.97      0.76      0.85      3102
        True       0.13      0.58      0.21       191

   micro avg       0.75      0.75      0.75      3293
   macro avg       0.55      0.67      0.53      3293
weighted avg       0.92      0.75      0.81      3293



In [564]:
metrics.confusion_matrix(test_label, predicted)

array([[2345,  757],
       [  81,  110]])

## Apply SVM

In [566]:
from sklearn.linear_model import SGDClassifier
text_clf_2 = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', SGDClassifier(loss='hinge', penalty='l2', 
                                             alpha=1e-3, random_state=42,
                                             max_iter=20, tol=1e-3))])

In [567]:
text_clf_2.fit(training_sentences, training_label, clf__sample_weight=sample_weight)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...om_state=42, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [568]:
predicted_2 = text_clf_2.predict(test_sentences)

In [569]:
print(metrics.classification_report(test_label, predicted_2))

              precision    recall  f1-score   support

       False       0.96      0.77      0.86      3102
        True       0.13      0.54      0.21       191

   micro avg       0.76      0.76      0.76      3293
   macro avg       0.55      0.66      0.53      3293
weighted avg       0.92      0.76      0.82      3293



In [570]:
metrics.confusion_matrix(test_label, predicted_2)

array([[2385,  717],
       [  87,  104]])

# Optimize Count

In [593]:
from nlp_surveillance.temp_opt_count import get_count_optimization_edb
date_edb = get_count_optimization_edb()

HBox(children=(IntProgress(value=0, max=299), HTML(value='')))

http://www.cidrap.umn.edu/news-perspective/2018/04/stewardship-resistance-scan-apr-20-2018 caused <urlopen error timed out>
http://outbreaknewstoday.com/five-killed-350-hospitalized-typhoid-outbreak-spreads-Simbabwe-85682/ caused HTTP Error 404: Not Found
https://www.eurosurveillance.org/docserver/fulltext/eurosurveillance/23/41/eurosurv-23-41.pdf?expires=1539331213&id=id&accname=guest&checksum=B77A7E3D3E61994124B88DDF4144DCFF caused HTTP Error 403: Forbidden
https://www.ouest-france.fr/sante/alpes-maritimes-un-foyer-de-dengue-detecte-apres-quatre-nouveaux-cas-6024838 caused HTTP Error 403: Forbidden
https://mainichi.jp/english/articles/20181016/p2g/00m/0dm/046000c caused HTTP Error 404: Not Found
https://www.eurosurveillance.org/docserver/fulltext/eurosurveillance/23/46/eurosurv-23-46.pdf?expires=1542619045&id=id&accname=guest&checksum=06E0705C273BC49990CC352FB3D98F68 caused HTTP Error 403: Forbidden
https://www.eurosurveillance.org/docserver/fulltext/eurosurveillance/23/46/eurosurv-2

HBox(children=(IntProgress(value=0, max=294), HTML(value='')))

Bad date range split: up to week 21 and ONE ['up ', ' week 21 ', ' ONE']


In [594]:
count_edb = date_edb

In [687]:
from nlp_surveillance.temp_opt_count import _extract_sentences_from_spans, _annotate_text_in_edb
count_edb_anno = _annotate_text_in_edb(count_edb)

HBox(children=(IntProgress(value=0, max=294), HTML(value='')))

Bad date range split: up to week 21 and ONE ['up ', ' week 21 ', ' ONE']


In [688]:
count_edb_ext = _extract_sentences_from_spans(count_edb_anno)

Index(['Fälle gesamt*', 'links', 'text', 'annotated'], dtype='object')


HBox(children=(IntProgress(value=0, max=294), HTML(value='')))

In [689]:
pickle.dump(count_edb_ext, open('fuckmylife.p', 'wb'))

In [712]:
count_edb_ext = pickle.load(open('fuckmylife.p', 'rb'))

In [713]:
import re
count_edb_ext['Fälle gesamt*'] =  count_edb_ext['Fälle gesamt*'].apply(lambda x: re.sub("[^0-9]", "",x)).apply(int)

In [714]:
mask = ((count_edb_ext['count'] -2  <= count_edb_ext['Fälle gesamt*']) & 
        (count_edb_ext['count'] +2 >= count_edb_ext['Fälle gesamt*']))

In [715]:
count_edb_ext['is_label'] = mask

In [723]:
tqdm.pandas()

In [724]:
count_edb_ext['sentence'] = count_edb_ext['sentence'].progress_apply(str)

100%|██████████| 32939/32939 [00:00<00:00, 495883.72it/s]


In [725]:
count_edb_ext['sentence'] = count_edb_ext['sentence'].progress_apply(clean_text)

100%|██████████| 32939/32939 [48:56<00:00,  2.91it/s]  


In [726]:
count_edb_ext.head()

Unnamed: 0,Fälle gesamt*,sentence,count,is_label
0,1121,"[The, WHO, Health, Emergencies, Programme, cur...",50,False
1,1121,"[The, confirmation, additional, animal, case, ...",1,False
2,1121,"[As, March, case, reported, National, Institut...",967,False
3,1121,"[Of, case, reported, case]",749,False
4,1121,"[Of, case, reported, case]",218,False


In [727]:
pickle.dump(count_edb_ext, open('fml.p', 'wb'))
count_edb_ext = pickle.load(open('fml.p', 'rb'))

In [728]:
count_edb_ext['sentence'] = count_edb_ext['sentence'].apply(lambda x: ' '.join(x))

In [741]:
len(count_edb_ext[count_edb_ext['is_label'] == True])

1701

In [744]:
count_text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

In [745]:
count_training_sentences = count_edb_ext['sentence'][::2]
count_training_label = count_edb_ext['is_label'][::2]

In [746]:
count_sample_weight = compute_sample_weight(class_weight='balanced', y=count_training_label)
count_text_clf.fit(count_training_sentences, count_training_label, clf__sample_weight=count_sample_weight)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [747]:
count_test_sentences = count_edb_ext['sentence'][1::2]
count_test_label = count_edb_ext['is_label'][1::2]

In [748]:
count_predicted = count_text_clf.predict(count_test_sentences)

In [749]:
print(metrics.classification_report(count_test_label, count_predicted))

              precision    recall  f1-score   support

       False       0.97      0.76      0.85     15638
        True       0.10      0.52      0.17       831

   micro avg       0.75      0.75      0.75     16469
   macro avg       0.54      0.64      0.51     16469
weighted avg       0.92      0.75      0.82     16469



In [750]:
metrics.confusion_matrix(count_test_label, count_predicted)

array([[11932,  3706],
       [  400,   431]])