In [41]:
import glob
import os
import random
import sys
from random import shuffle

import numpy

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [43]:
def sz(a): # str/eval
    """Flatten an article-like object into a plain dict.

    The result contains only the attribute values read below, so it can be
    written out with ``str()`` and read back (see the ``# str/eval`` note).

    Parameters
    ----------
    a : object
        Must expose ``text``, ``title``, ``url``, ``keywords``, ``tags``,
        ``summary``, ``publish_date`` and ``html`` attributes.
        NOTE(review): presumably a newspaper ``Article`` -- confirm with caller.

    Returns
    -------
    dict
        Keys ``text``, ``title``, ``url``, ``keywords``, ``tags``, ``summary``,
        ``date`` and ``raw``. ``date`` is ``publish_date.timestamp()`` truncated
        to an ``int``, or ``None`` when the article has no publish date.
    """
    return {
        'text': a.text,
        'title': a.title,
        'url': a.url,
        'keywords': a.keywords,
        'tags': a.tags,
        'summary': a.summary,
        # Identity check: `is not None` cannot be fooled by a custom __ne__.
        'date': int(a.publish_date.timestamp()) if a.publish_date is not None else None,
        'raw' : a.html
    }

In [44]:
def objRead(f):
    """Load the Python object stored as source text in the file at path ``f``.

    Parameters
    ----------
    f : str
        Path of the file to read (opened in text mode).

    Returns
    -------
    object
        Whatever the file's content evaluates to.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(f, 'r') as fh:
        source = fh.read()
    # SECURITY(review): eval() executes arbitrary code found in the file.
    # Only call this on files this project wrote itself; consider
    # ast.literal_eval or JSON if the files can come from anywhere else.
    return eval(source)

def objWrite(f, data):
    """Write ``str(data)`` to a ``.lobj`` file next to path ``f``.

    The final extension of ``f`` (if any) is replaced by ``.lobj``. Only the
    last extension is stripped, so dots in directory names or earlier in the
    file name are preserved (``v1.0/a.txt`` -> ``v1.0/a.lobj``).

    Parameters
    ----------
    f : str
        Target path; its extension is ignored and replaced with ``.lobj``.
    data : object
        Object to store; it is serialised with ``str()``.

    Raises
    ------
    OSError
        If the output file cannot be opened for writing.
    """
    # splitext only looks at the last path component, unlike split('.')[0],
    # which truncated at the first dot anywhere in the path.
    base = os.path.splitext(f)[0]
    # Context manager guarantees the data is flushed and the handle closed.
    with open(base + '.lobj', 'w') as fh:
        fh.write(str(data)) # light obj

In [45]:
# Show every .lobj file found at any depth below the working directory.
lobj_matches = glob.iglob('**/*.lobj', recursive=True)
for f in lobj_matches:
    print(f)

other.lobj
disasters/shootings/sandyhook.lobj
disasters/shootings/orlando.lobj
disasters/shootings/lasvegas.lobj
disasters/shootings/sanbernardino.lobj
disasters/earthquakes/sumatra.lobj
disasters/earthquakes/haiti.lobj
disasters/earthquakes/sichuan.lobj
disasters/earthquakes/kashmir.lobj
disasters/wildfires/thomas.lobj
disasters/wildfires/attica.lobj
disasters/wildfires/mountcarmel.lobj
disasters/wildfires/nevada.lobj
disasters/hurricanes/jeanne.lobj
disasters/hurricanes/stan.lobj
disasters/hurricanes/maria.lobj
disasters/hurricanes/katrina.lobj


In [46]:
# (path, label) for every event whose articles are loaded. Entries that are
# commented out are skipped on purpose; the trailing note gives the reason.
lobj_sources = [
    ('disasters/shootings/lasvegas.lobj', 'lasvegas'),
    ('disasters/shootings/orlando.lobj', 'orlando'),
    ('disasters/shootings/sandyhook.lobj', 'sandyhook'),
    ('disasters/shootings/sanbernardino.lobj', 'sanbernardino'),
    ('disasters/earthquakes/kashmir.lobj', 'kashmir'),
    ('disasters/earthquakes/haiti.lobj', 'haiti'),
    ('disasters/earthquakes/sumatra.lobj', 'sumatra'),
    ('disasters/earthquakes/sichuan.lobj', 'sichuan'),
#     ('disasters/wildfires/mountcarmel.lobj', 'mountcarmel'), # only 19 articles
    ('disasters/wildfires/attica.lobj', 'attica'),
    ('disasters/wildfires/nevada.lobj', 'nevada'),
    ('disasters/wildfires/thomas.lobj', 'thomas'),
    ('disasters/hurricanes/maria.lobj', 'maria'),
    ('disasters/hurricanes/katrina.lobj', 'katrina'),
#     ('disasters/hurricanes/jeanne.lobj', 'jeanne'), # only 35 articles
#     ('disasters/hurricanes/stan.lobj', 'stan'), # only 29 articles
]

# One record per event: the articles read from disk plus the event's label.
data = [
    {'articles': objRead(path), 'label': name}
    for path, name in lobj_sources
]

# Labels in the same order as `data`.
labels = [entry['label'] for entry in data]

In [47]:
# Seed passed to every estimator that accepts `random_state`, so that fitting
# on the same training data gives the same model on every run.
RANDOM_STATE = 0

# One instance of each model to compare; each is later wrapped in a
# CountVectorizer -> TfidfTransformer pipeline before fitting.
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None,
                  random_state=RANDOM_STATE),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200,
                                random_state=RANDOM_STATE),
    # Neighbourhood size is tied to the number of event categories in `data`.
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3,
                           random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE)
]

In [48]:
# Articles per event taken for training (counted before the URL filter below).
TRAIN_SIZE = 40
# Articles whose URL contains this substring are left out of both splits.
EXCLUDED_SOURCE = 'washingtonpost'
# Fixed seed so that Restart & Run All reproduces the same train/test splits.
SPLIT_SEED = 0

split_rng = random.Random(SPLIT_SEED)


def _kept_texts(articles):
    """Return the 'text' of each article whose 'url' lacks EXCLUDED_SOURCE."""
    return [a['text'] for a in articles if EXCLUDED_SOURCE not in a['url']]


# One binary experiment per event: "is this article about `label` or not?"
for label in labels:
    train_data, test_data = [], []
    train_targets, test_targets = [], []

    for cat in data:
        # 0 = the event under test, 1 = any other event. The report below
        # names the classes in that order via target_names=[label, 'others'].
        target = int(cat['label'] != label)

        # Shuffle a copy: the loaded `data` stays untouched, so re-running
        # this cell does not depend on how often it has been run before.
        articles = list(cat['articles'])
        split_rng.shuffle(articles)

        train_texts = _kept_texts(articles[:TRAIN_SIZE])
        test_texts = _kept_texts(articles[TRAIN_SIZE:])

        train_data.extend(train_texts)
        train_targets.extend([target] * len(train_texts))
        test_data.extend(test_texts)
        test_targets.extend([target] * len(test_texts))

    # Build the label arrays once (numpy.append in the loop copies the whole
    # array each time). float dtype matches what numpy.append on an empty
    # numpy.array([]) produced.
    train_label = numpy.array(train_targets, dtype=float)
    test_label = numpy.array(test_targets, dtype=float)

    for c in classifiers:
        # Bag-of-words counts -> tf-idf weights -> classifier.
        clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', c),
        ])
        clf.fit(train_data, train_label)
        pred_label = clf.predict(test_data)
        print(type(c).__name__, accuracy_score(test_label, pred_label))
        print(classification_report(test_label, pred_label, target_names = [label, 'others']))
        print(confusion_matrix(test_label, pred_label))

SGDClassifier 0.9863013698630136
             precision    recall  f1-score   support

   lasvegas       1.00      0.83      0.91        54
     others       0.99      1.00      0.99       603

avg / total       0.99      0.99      0.99       657

[[ 45   9]
 [  0 603]]
MultinomialNB 0.9178082191780822
             precision    recall  f1-score   support

   lasvegas       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       603

avg / total       0.84      0.92      0.88       657

[[  0  54]
 [  0 603]]


  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9832572298325722
             precision    recall  f1-score   support

   lasvegas       0.98      0.81      0.89        54
     others       0.98      1.00      0.99       603

avg / total       0.98      0.98      0.98       657

[[ 44  10]
 [  1 602]]
KNeighborsClassifier 0.969558599695586
             precision    recall  f1-score   support

   lasvegas       1.00      0.63      0.77        54
     others       0.97      1.00      0.98       603

avg / total       0.97      0.97      0.97       657

[[ 34  20]
 [  0 603]]
RandomForestClassifier 0.9223744292237442
             precision    recall  f1-score   support

   lasvegas       1.00      0.06      0.11        54
     others       0.92      1.00      0.96       603

avg / total       0.93      0.92      0.89       657

[[  3  51]
 [  0 603]]
LogisticRegression 0.9330289193302892
             precision    recall  f1-score   support

   lasvegas       1.00      0.19      0.31        54
     others  

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9847328244274809
             precision    recall  f1-score   support

    orlando       1.00      0.81      0.90        54
     others       0.98      1.00      0.99       601

avg / total       0.98      0.98      0.98       655

[[ 44  10]
 [  0 601]]
KNeighborsClassifier 0.9389312977099237
             precision    recall  f1-score   support

    orlando       1.00      0.26      0.41        54
     others       0.94      1.00      0.97       601

avg / total       0.94      0.94      0.92       655

[[ 14  40]
 [  0 601]]
RandomForestClassifier 0.9206106870229007
             precision    recall  f1-score   support

    orlando       1.00      0.04      0.07        54
     others       0.92      1.00      0.96       601

avg / total       0.93      0.92      0.89       655

[[  2  52]
 [  0 601]]
LogisticRegression 0.9404580152671755
             precision    recall  f1-score   support

    orlando       1.00      0.28      0.43        54
     others 

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9848024316109423
             precision    recall  f1-score   support

  sandyhook       0.98      0.84      0.91        57
     others       0.99      1.00      0.99       601

avg / total       0.98      0.98      0.98       658

[[ 48   9]
 [  1 600]]
KNeighborsClassifier 0.9148936170212766
             precision    recall  f1-score   support

  sandyhook       1.00      0.02      0.03        57
     others       0.91      1.00      0.96       601

avg / total       0.92      0.91      0.88       658

[[  1  56]
 [  0 601]]
RandomForestClassifier 0.9133738601823708
             precision    recall  f1-score   support

  sandyhook       0.00      0.00      0.00        57
     others       0.91      1.00      0.95       601

avg / total       0.83      0.91      0.87       658

[[  0  57]
 [  0 601]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9148936170212766
             precision    recall  f1-score   support

  sandyhook       1.00      0.02      0.03        57
     others       0.91      1.00      0.96       601

avg / total       0.92      0.91      0.88       658

[[  1  56]
 [  0 601]]
LinearSVC 0.9817629179331308
             precision    recall  f1-score   support

  sandyhook       0.98      0.81      0.88        57
     others       0.98      1.00      0.99       601

avg / total       0.98      0.98      0.98       658

[[ 46  11]
 [  1 600]]
SGDClassifier 0.9801223241590215
               precision    recall  f1-score   support

sanbernardino       1.00      0.77      0.87        57
       others       0.98      1.00      0.99       597

  avg / total       0.98      0.98      0.98       654

[[ 44  13]
 [  0 597]]
MultinomialNB 0.9128440366972477
               precision    recall  f1-score   support

sanbernardino       0.00      0.00      0.00        57
       others       0.91      1.00

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9694189602446484
               precision    recall  f1-score   support

sanbernardino       0.84      0.81      0.82        57
       others       0.98      0.98      0.98       597

  avg / total       0.97      0.97      0.97       654

[[ 46  11]
 [  9 588]]
KNeighborsClassifier 0.9770642201834863
               precision    recall  f1-score   support

sanbernardino       1.00      0.74      0.85        57
       others       0.98      1.00      0.99       597

  avg / total       0.98      0.98      0.98       654

[[ 42  15]
 [  0 597]]
RandomForestClassifier 0.9220183486238532
               precision    recall  f1-score   support

sanbernardino       1.00      0.11      0.19        57
       others       0.92      1.00      0.96       597

  avg / total       0.93      0.92      0.89       654

[[  6  51]
 [  0 597]]
LogisticRegression 0.9357798165137615
               precision    recall  f1-score   support

sanbernardino       1.00      0.26     

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9969834087481146
             precision    recall  f1-score   support

    kashmir       1.00      0.89      0.94        19
     others       1.00      1.00      1.00       644

avg / total       1.00      1.00      1.00       663

[[ 17   2]
 [  0 644]]
KNeighborsClassifier 0.9788838612368024
             precision    recall  f1-score   support

    kashmir       1.00      0.26      0.42        19
     others       0.98      1.00      0.99       644

avg / total       0.98      0.98      0.97       663

[[  5  14]
 [  0 644]]
RandomForestClassifier 0.971342383107089
             precision    recall  f1-score   support

    kashmir       0.00      0.00      0.00        19
     others       0.97      1.00      0.99       644

avg / total       0.94      0.97      0.96       663

[[  0  19]
 [  0 644]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9728506787330317
             precision    recall  f1-score   support

    kashmir       1.00      0.05      0.10        19
     others       0.97      1.00      0.99       644

avg / total       0.97      0.97      0.96       663

[[  1  18]
 [  0 644]]
LinearSVC 0.9939668174962293
             precision    recall  f1-score   support

    kashmir       1.00      0.79      0.88        19
     others       0.99      1.00      1.00       644

avg / total       0.99      0.99      0.99       663

[[ 15   4]
 [  0 644]]
SGDClassifier 0.9832826747720365
             precision    recall  f1-score   support

      haiti       1.00      0.80      0.89        56
     others       0.98      1.00      0.99       602

avg / total       0.98      0.98      0.98       658

[[ 45  11]
 [  0 602]]
MultinomialNB 0.9148936170212766
             precision    recall  f1-score   support

      haiti       0.00      0.00      0.00        56
     others       0.91      1.00      0.96    

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9878419452887538
             precision    recall  f1-score   support

      haiti       1.00      0.86      0.92        56
     others       0.99      1.00      0.99       602

avg / total       0.99      0.99      0.99       658

[[ 48   8]
 [  0 602]]
KNeighborsClassifier 0.9164133738601824
             precision    recall  f1-score   support

      haiti       1.00      0.02      0.04        56
     others       0.92      1.00      0.96       602

avg / total       0.92      0.92      0.88       658

[[  1  55]
 [  0 602]]
RandomForestClassifier 0.9148936170212766
             precision    recall  f1-score   support

      haiti       0.00      0.00      0.00        56
     others       0.91      1.00      0.96       602

avg / total       0.84      0.91      0.87       658

[[  0  56]
 [  0 602]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9164133738601824
             precision    recall  f1-score   support

      haiti       1.00      0.02      0.04        56
     others       0.92      1.00      0.96       602

avg / total       0.92      0.92      0.88       658

[[  1  55]
 [  0 602]]
LinearSVC 0.9772036474164134
             precision    recall  f1-score   support

      haiti       1.00      0.73      0.85        56
     others       0.98      1.00      0.99       602

avg / total       0.98      0.98      0.98       658

[[ 41  15]
 [  0 602]]
SGDClassifier 0.9754224270353302
             precision    recall  f1-score   support

    sumatra       1.00      0.62      0.76        42
     others       0.97      1.00      0.99       609

avg / total       0.98      0.98      0.97       651

[[ 26  16]
 [  0 609]]
MultinomialNB 0.9354838709677419
             precision    recall  f1-score   support

    sumatra       0.00      0.00      0.00        42
     others       0.94      1.00      0.97    

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.978494623655914
             precision    recall  f1-score   support

    sumatra       0.94      0.71      0.81        42
     others       0.98      1.00      0.99       609

avg / total       0.98      0.98      0.98       651

[[ 30  12]
 [  2 607]]
KNeighborsClassifier 0.9723502304147466
             precision    recall  f1-score   support

    sumatra       0.93      0.62      0.74        42
     others       0.97      1.00      0.99       609

avg / total       0.97      0.97      0.97       651

[[ 26  16]
 [  2 607]]
RandomForestClassifier 0.9385560675883257
             precision    recall  f1-score   support

    sumatra       1.00      0.05      0.09        42
     others       0.94      1.00      0.97       609

avg / total       0.94      0.94      0.91       651

[[  2  40]
 [  0 609]]
LogisticRegression 0.9385560675883257
             precision    recall  f1-score   support

    sumatra       1.00      0.05      0.09        42
     others  

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9893129770992366
             precision    recall  f1-score   support

    sichuan       1.00      0.87      0.93        54
     others       0.99      1.00      0.99       601

avg / total       0.99      0.99      0.99       655

[[ 47   7]
 [  0 601]]
KNeighborsClassifier 0.9480916030534351
             precision    recall  f1-score   support

    sichuan       1.00      0.37      0.54        54
     others       0.95      1.00      0.97       601

avg / total       0.95      0.95      0.94       655

[[ 20  34]
 [  0 601]]
RandomForestClassifier 0.917557251908397
             precision    recall  f1-score   support

    sichuan       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       601

avg / total       0.84      0.92      0.88       655

[[  0  54]
 [  0 601]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.917557251908397
             precision    recall  f1-score   support

    sichuan       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       601

avg / total       0.84      0.92      0.88       655

[[  0  54]
 [  0 601]]


  'precision', 'predicted', average, warn_for)


LinearSVC 0.9770992366412213
             precision    recall  f1-score   support

    sichuan       1.00      0.72      0.84        54
     others       0.98      1.00      0.99       601

avg / total       0.98      0.98      0.98       655

[[ 39  15]
 [  0 601]]
SGDClassifier 0.974124809741248
             precision    recall  f1-score   support

     attica       1.00      0.72      0.83        60
     others       0.97      1.00      0.99       597

avg / total       0.97      0.97      0.97       657

[[ 43  17]
 [  0 597]]
MultinomialNB 0.908675799086758
             precision    recall  f1-score   support

     attica       0.00      0.00      0.00        60
     others       0.91      1.00      0.95       597

avg / total       0.83      0.91      0.87       657

[[  0  60]
 [  0 597]]


  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9756468797564688
             precision    recall  f1-score   support

     attica       1.00      0.73      0.85        60
     others       0.97      1.00      0.99       597

avg / total       0.98      0.98      0.97       657

[[ 44  16]
 [  0 597]]
KNeighborsClassifier 0.91324200913242
             precision    recall  f1-score   support

     attica       1.00      0.05      0.10        60
     others       0.91      1.00      0.95       597

avg / total       0.92      0.91      0.88       657

[[  3  57]
 [  0 597]]
RandomForestClassifier 0.908675799086758
             precision    recall  f1-score   support

     attica       0.00      0.00      0.00        60
     others       0.91      1.00      0.95       597

avg / total       0.83      0.91      0.87       657

[[  0  60]
 [  0 597]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9101978691019786
             precision    recall  f1-score   support

     attica       1.00      0.02      0.03        60
     others       0.91      1.00      0.95       597

avg / total       0.92      0.91      0.87       657

[[  1  59]
 [  0 597]]
LinearSVC 0.9649923896499238
             precision    recall  f1-score   support

     attica       1.00      0.62      0.76        60
     others       0.96      1.00      0.98       597

avg / total       0.97      0.96      0.96       657

[[ 37  23]
 [  0 597]]
SGDClassifier 0.9785276073619632
             precision    recall  f1-score   support

     nevada       0.98      0.76      0.86        55
     others       0.98      1.00      0.99       597

avg / total       0.98      0.98      0.98       652

[[ 42  13]
 [  1 596]]
MultinomialNB 0.9156441717791411
             precision    recall  f1-score   support

     nevada       0.00      0.00      0.00        55
     others       0.92      1.00      0.96    

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9708588957055214
             precision    recall  f1-score   support

     nevada       0.82      0.84      0.83        55
     others       0.98      0.98      0.98       597

avg / total       0.97      0.97      0.97       652

[[ 46   9]
 [ 10 587]]
KNeighborsClassifier 0.9677914110429447
             precision    recall  f1-score   support

     nevada       1.00      0.62      0.76        55
     others       0.97      1.00      0.98       597

avg / total       0.97      0.97      0.96       652

[[ 34  21]
 [  0 597]]
RandomForestClassifier 0.9217791411042945
             precision    recall  f1-score   support

     nevada       1.00      0.07      0.14        55
     others       0.92      1.00      0.96       597

avg / total       0.93      0.92      0.89       652

[[  4  51]
 [  0 597]]
LogisticRegression 0.9309815950920245
             precision    recall  f1-score   support

     nevada       1.00      0.18      0.31        55
     others 

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.9786585365853658
             precision    recall  f1-score   support

     thomas       0.97      0.68      0.80        41
     others       0.98      1.00      0.99       615

avg / total       0.98      0.98      0.98       656

[[ 28  13]
 [  1 614]]
KNeighborsClassifier 0.975609756097561
             precision    recall  f1-score   support

     thomas       0.88      0.71      0.78        41
     others       0.98      0.99      0.99       615

avg / total       0.97      0.98      0.97       656

[[ 29  12]
 [  4 611]]
RandomForestClassifier 0.9375
             precision    recall  f1-score   support

     thomas       0.00      0.00      0.00        41
     others       0.94      1.00      0.97       615

avg / total       0.88      0.94      0.91       656

[[  0  41]
 [  0 615]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9451219512195121
             precision    recall  f1-score   support

     thomas       1.00      0.12      0.22        41
     others       0.94      1.00      0.97       615

avg / total       0.95      0.95      0.92       656

[[  5  36]
 [  0 615]]
LinearSVC 0.9771341463414634
             precision    recall  f1-score   support

     thomas       0.96      0.66      0.78        41
     others       0.98      1.00      0.99       615

avg / total       0.98      0.98      0.98       656

[[ 27  14]
 [  1 614]]
SGDClassifier 0.9831804281345565
             precision    recall  f1-score   support

      maria       1.00      0.80      0.89        56
     others       0.98      1.00      0.99       598

avg / total       0.98      0.98      0.98       654

[[ 45  11]
 [  0 598]]
MultinomialNB 0.9143730886850153
             precision    recall  f1-score   support

      maria       0.00      0.00      0.00        56
     others       0.91      1.00      0.96    

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.981651376146789
             precision    recall  f1-score   support

      maria       1.00      0.79      0.88        56
     others       0.98      1.00      0.99       598

avg / total       0.98      0.98      0.98       654

[[ 44  12]
 [  0 598]]
KNeighborsClassifier 0.9755351681957186
             precision    recall  f1-score   support

      maria       1.00      0.71      0.83        56
     others       0.97      1.00      0.99       598

avg / total       0.98      0.98      0.97       654

[[ 40  16]
 [  0 598]]
RandomForestClassifier 0.9143730886850153
             precision    recall  f1-score   support

      maria       0.00      0.00      0.00        56
     others       0.91      1.00      0.96       598

avg / total       0.84      0.91      0.87       654

[[  0  56]
 [  0 598]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.9434250764525994
             precision    recall  f1-score   support

      maria       1.00      0.34      0.51        56
     others       0.94      1.00      0.97       598

avg / total       0.95      0.94      0.93       654

[[ 19  37]
 [  0 598]]
LinearSVC 0.981651376146789
             precision    recall  f1-score   support

      maria       1.00      0.79      0.88        56
     others       0.98      1.00      0.99       598

avg / total       0.98      0.98      0.98       654

[[ 44  12]
 [  0 598]]
SGDClassifier 0.983206106870229
             precision    recall  f1-score   support

    katrina       0.98      0.81      0.89        54
     others       0.98      1.00      0.99       601

avg / total       0.98      0.98      0.98       655

[[ 44  10]
 [  1 600]]
MultinomialNB 0.917557251908397
             precision    recall  f1-score   support

    katrina       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       

  'precision', 'predicted', average, warn_for)


PassiveAggressiveClassifier 0.981679389312977
             precision    recall  f1-score   support

    katrina       0.98      0.80      0.88        54
     others       0.98      1.00      0.99       601

avg / total       0.98      0.98      0.98       655

[[ 43  11]
 [  1 600]]
KNeighborsClassifier 0.932824427480916
             precision    recall  f1-score   support

    katrina       1.00      0.19      0.31        54
     others       0.93      1.00      0.96       601

avg / total       0.94      0.93      0.91       655

[[ 10  44]
 [  0 601]]
RandomForestClassifier 0.917557251908397
             precision    recall  f1-score   support

    katrina       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       601

avg / total       0.84      0.92      0.88       655

[[  0  54]
 [  0 601]]


  'precision', 'predicted', average, warn_for)


LogisticRegression 0.917557251908397
             precision    recall  f1-score   support

    katrina       0.00      0.00      0.00        54
     others       0.92      1.00      0.96       601

avg / total       0.84      0.92      0.88       655

[[  0  54]
 [  0 601]]


  'precision', 'predicted', average, warn_for)


LinearSVC 0.966412213740458
             precision    recall  f1-score   support

    katrina       1.00      0.59      0.74        54
     others       0.96      1.00      0.98       601

avg / total       0.97      0.97      0.96       655

[[ 32  22]
 [  0 601]]
