In [2]:
%load_ext autoreload
%autoreload 2

In [47]:
from nlp_surveillance.pipeline import ExtractSentencesAndLabel
from nlp_surveillance.classifier import summarize
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nlp_surveillance.classifier import extract_sentence

In [134]:
df = ExtractSentencesAndLabel('counts').data_output()

In [135]:
df['sentence']= df['sentence'].apply(lambda x: list(set(x.split()) - set(stopwords.words('english'))))

In [136]:
df = extract_sentence.split_list_and_distribute_to_new_rows(df, 'sentence')

In [137]:
as_tuples = df.apply(tuple, axis=1).tolist()

In [138]:
as_tuples = [({'sent':two}, one) for one,two in as_tuples]

In [35]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [36]:
from nltk.corpus import names

In [37]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+([(name, 'female') for name in names.words('female.txt')]))

In [38]:
import random 
random.shuffle(labeled_names)

In [39]:
featuresets = [(gender_features(n), gender) for (n,gender) in labeled_names]

In [40]:
import nltk

In [139]:
clf = nltk.NaiveBayesClassifier.train(as_tuples)

In [42]:
clf.classify({'last_letter': 'n'})

'male'

In [43]:
nltk.classify.accuracy(clf,featuresets[:500] )

0.77

In [133]:
# Dates
clf.show_most_informative_features(20)

Most Informative Features
                    sent = 'since'          True : False  =      8.4 : 1.0
                    sent = 'public'         True : False  =      7.1 : 1.0
                    sent = 'include'        True : False  =      5.8 : 1.0
                    sent = 'worm'           True : False  =      5.0 : 1.0
                    sent = 'first'         False : True   =      5.0 : 1.0
                    sent = 'emerging'       True : False  =      4.5 : 1.0
                    sent = 'medical'        True : False  =      4.5 : 1.0
                    sent = 'Northern'       True : False  =      4.5 : 1.0
                    sent = 'dengue'        False : True   =      4.3 : 1.0
                    sent = 'countries'     False : True   =      4.3 : 1.0
                    sent = 'Sep'           False : True   =      4.0 : 1.0
                    sent = '8'             False : True   =      3.6 : 1.0
                    sent = 'statement'      True : False  =      3.5 : 1.0

In [140]:
# Counts
clf.show_most_informative_features(20)

Most Informative Features
                    sent = 'poultry'        True : False  =     26.8 : 1.0
                    sent = 'Delaware'       True : False  =     24.8 : 1.0
                    sent = 'Laibin'         True : False  =     21.9 : 1.0
                    sent = '42-year-old'    True : False  =     21.9 : 1.0
                    sent = 'Bulgaria,'      True : False  =     17.0 : 1.0
                    sent = 'desert'         True : False  =     17.0 : 1.0
                    sent = 'vomiting'       True : False  =     17.0 : 1.0
                    sent = 'squirrels'      True : False  =     17.0 : 1.0
                    sent = 'China,'         True : False  =     17.0 : 1.0
                    sent = 'Guangxi'        True : False  =     16.1 : 1.0
                    sent = 'media'          True : False  =     16.1 : 1.0
                    sent = 'lived'          True : False  =     15.6 : 1.0
                    sent = 'Animal'         True : False  =     13.6 : 1.0

In [58]:
import os
import pickle
import re

In [96]:
dir_ = os.listdir('nlp_surveillance/')

In [97]:
batches = [f'nlp_surveillance/{file}' for file in dir_ if 'batch' in file]

In [98]:
padded = []
for batch in batches:
    found_number = re.search(r'(\d+)',batch)[0]
    padded_num = found_number.zfill(4)
    padded.append((padded_num, batch))

In [99]:
batches = sorted(padded)

In [100]:
first = batches[0][1]
with open(first, 'rb') as batch_handle:
    recommender_with_entities = pickle.load(batch_handle)
    recommender_with_entities = [recommender_with_entities]
for batch in batches[1:]:
    with open(batch[1], 'rb') as batch_handle:
        to_append = pickle.load(batch_handle)
        recommender_with_entities.append(to_append)

In [101]:
recommender_with_entities = [pd.DataFrame(d) for d in recommender_with_entities]

In [102]:
concatted = pd.concat(recommender_with_entities, ignore_index=True)

In [103]:
concatted['counts'] = concatted['counts'].apply(lambda x: int(sum(x)/len(x)) if x  else x)

In [104]:
concatted['counts'] = concatted['counts'].apply(np.log10).apply(np.round)

In [105]:
to_train = concatted.drop(columns='date')

In [107]:
from nlp_surveillance.pipeline import RecommenderLabeling

In [110]:
label = RecommenderLabeling().data_output()['label']

In [115]:
to_train = pd.concat([to_train, label],axis=1, ignore_index=False)

In [135]:
to_train_d = to_train.dropna()

In [133]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb

In [161]:
from sklearn.utils.class_weight import compute_sample_weight

In [162]:
tf = text.TfidfVectorizer()
X = tf.fit_transform(to_train_d['geoname'])
y = to_train_d['label'].apply(int)
y_balanced = compute_sample_weight(class_weight='balanced', y=y)

In [163]:
p = 100 * X.nnz / float(X.shape[0] * X.shape[1])
print(f"Each sample has ~{p:.2f}% non-zero features.")


Each sample has ~1.78% non-zero features.


In [164]:
(X_train, X_test, y_train, y_test) = \
    ms.train_test_split(X, y, test_size=.2)

In [165]:
bnb = ms.GridSearchCV(
    nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., 50)})
bnb.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.00000e-02, 1.20679e-02, 1.45635e-02, 1.75751e-02, 2.12095e-02,
       2.55955e-02, 3.08884e-02, 3.72759e-02, 4.49843e-02, 5.42868e-02,
       6.55129e-02, 7.90604e-02, 9.54095e-02, 1.15140e-01, 1.38950e-01,
       1.67683e-01, 2.02359e-01, 2.44205e-01, 2.94705e-01, 3.55... 3.23746e+01, 3.90694e+01,
       4.71487e+01, 5.68987e+01, 6.86649e+01, 8.28643e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [166]:
bnb.score(X_test, y_test)


0.9280742459396751

In [167]:
predicted = bnb.predict(X_test)

In [169]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       403
           1       0.20      0.04      0.06        28

   micro avg       0.93      0.93      0.93       431
   macro avg       0.57      0.51      0.51       431
weighted avg       0.89      0.93      0.90       431

[[399   4]
 [ 27   1]]


In [150]:
# We first get the words corresponding to each feature
names = np.asarray(tf.get_feature_names())
# Next, we display the 50 words with the largest
# coefficients.
print(','.join(names[np.argsort(
    bnb.best_estimator_.coef_[0, :])[::-1][:50]]))

of,republic,united,federal,nigeria,kingdom,democratic,states,the,congo,and,ireland,northern,great,britain,state,india,kenya,uganda,liberia,commonwealth,people,australia,new,spain,islamic,papua,guinea,independent,plurinational,brazil,pakistan,burma,yemen,china,bolivia,algeria,switzerland,dominican,union,federative,czechia,zealand,afghanistan,france,lebanon,costa,peru,saudi,taiwan
