In [1]:
import sys
import glob
import numpy
from random import shuffle

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
def sz(a): # str/eval
    """Flatten an article-like object into a plain dict.

    Reads the attributes ``text``, ``title``, ``url``, ``keywords``, ``tags``,
    ``summary``, ``publish_date`` and ``html`` from ``a``.
    NOTE(review): presumably ``a`` is a parsed news-article object and the
    result is meant to round-trip through ``str``/``eval`` -- confirm with caller.

    Returns:
        dict with keys 'text', 'title', 'url', 'keywords', 'tags', 'summary',
        'date' and 'raw'. 'date' is ``int(publish_date.timestamp())`` or
        ``None`` when ``publish_date`` is ``None``.
    """
    published = a.publish_date
    return {
        'text': a.text,
        'title': a.title,
        'url': a.url,
        'keywords': a.keywords,
        'tags': a.tags,
        'summary': a.summary,
        # Identity check: None is a singleton, and `is not` cannot be fooled
        # by a custom __ne__ on the date object.
        'date': int(published.timestamp()) if published is not None else None,
        'raw' : a.html
    }

In [4]:
def objRead(f):
    """Read the text file at path ``f`` and return the result of evaluating it.

    The file is expected to hold a Python expression (such as the ``str()``
    dump that ``objWrite`` produces).

    Args:
        f: path of the file to read.

    Returns:
        Whatever object the file's contents evaluate to.
    """
    # Context manager closes the handle deterministically instead of leaking
    # it until garbage collection.
    with open(f, 'r') as handle:
        source = handle.read()
    # SECURITY: eval() executes arbitrary code found in the file. Only call
    # this on trusted files produced locally, never on downloaded data.
    return eval(source)

def objWrite(f, data):
    """Write ``str(data)`` to a ``.lobj`` file derived from path ``f``.

    The filename component of ``f`` is cut at its first dot and ``.lobj`` is
    appended; the directory component is kept untouched, so dots in directory
    names (``./x.txt``, ``my.dir/x.txt``) no longer truncate the path.

    Args:
        f: path whose directory and base name determine the output file.
        data: object to dump via ``str()``.
    """
    import os  # local import: this file's import cell does not bring in `os`

    folder, filename = os.path.split(f)
    stem = filename.split('.')[0]
    target = os.path.join(folder, stem) + '.lobj'
    # `with` guarantees the buffer is flushed and the handle closed.
    with open(target, 'w') as handle:
        handle.write(str(data)) # light obj

In [5]:
# List every .lobj file below the current working directory, one per line.
lobj_paths = glob.iglob('**/*.lobj', recursive=True)
for lobj_path in lobj_paths:
    print(lobj_path)

other.lobj
disasters/shootings/sandyhook.lobj
disasters/shootings/orlando.lobj
disasters/shootings/lasvegas.lobj
disasters/shootings/sanbernardino.lobj
disasters/earthquakes/sumatra.lobj
disasters/earthquakes/haiti.lobj
disasters/earthquakes/sichuan.lobj
disasters/earthquakes/kashmir.lobj
disasters/wildfires/thomas.lobj
disasters/wildfires/attica.lobj
disasters/wildfires/mountcarmel.lobj
disasters/wildfires/nevada.lobj
disasters/hurricanes/jeanne.lobj
disasters/hurricanes/stan.lobj
disasters/hurricanes/maria.lobj
disasters/hurricanes/katrina.lobj


In [6]:
# (event group, event name) pairs. Each pair is loaded from
# disasters/<group>/<name>.lobj and labelled with the event name.
corpus_sources = [
    ('shootings', 'lasvegas'),
    ('shootings', 'orlando'),
    ('shootings', 'sandyhook'),
    ('shootings', 'sanbernardino'),
    ('earthquakes', 'kashmir'),
    ('earthquakes', 'haiti'),
    ('earthquakes', 'sumatra'),
    ('earthquakes', 'sichuan'),
#     ('wildfires', 'mountcarmel'), # only 19 articles
    ('wildfires', 'attica'),
    ('wildfires', 'nevada'),
    ('wildfires', 'thomas'),
    ('hurricanes', 'maria'),
    ('hurricanes', 'katrina'),
#     ('hurricanes', 'jeanne'), # only 35 articles
#     ('hurricanes', 'stan'), # only 29 articles
]

# One entry per event: the loaded articles plus the label used for them.
data = [
    { 'articles': objRead('disasters/' + group + '/' + event + '.lobj'), 'label': event }
    for group, event in corpus_sources
]

# Labels in the same order as `data`.
labels = [entry['label'] for entry in data]

In [7]:
# Keyword arguments for the estimators that take more than one setting.
sgd_options = {
    'loss': 'hinge',
    'penalty': 'elasticnet',
    'alpha': 1e-3,
    'max_iter': 1000,
    'tol': None,
}
forest_options = {
    'n_estimators': 200,
    'max_depth': 3,
}

# Estimators compared against each other; each one is later plugged into the
# same vectorizer/tf-idf pipeline.
classifiers = [
    SGDClassifier(**sgd_options),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200),
    # Neighbour count equals the number of entries in `data`.
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(**forest_options),
    LogisticRegression(),
    LinearSVC(),
]

In [8]:
# Number of (shuffled) articles per category that form the training slice;
# everything after that index is the held-out slice.
TRAIN_SIZE = 40

# True reproduces the original evaluation: the training samples are appended
# to the held-out samples before scoring.
# NOTE(review): scoring on samples the model was fitted on inflates the
# reported metrics; set to False to evaluate on held-out articles only.
EVAL_INCLUDES_TRAIN = True


def _select_texts(articles, limit=None):
    """Return the 'text' of each usable article, optionally capped.

    Articles whose 'url' contains 'washingtonpost' are skipped. When ``limit``
    is given, collection stops as soon as the number of texts already taken
    exceeds ``limit`` (so at most ``floor(limit) + 1`` texts are returned).
    """
    texts = []
    for article in articles:
        if 'washingtonpost' in article['url']:
            continue
        if limit is not None and len(texts) > limit:
            break
        texts.append(article['text'])
    return texts


# One binary "this event vs. all others" experiment per label.
# NOTE(review): shuffle() is unseeded, so the splits and scores differ between runs.
for label in labels:
    train_data, test_data = [], []
    train_flags, test_flags = [], []

    # Cap on how many articles each non-target category contributes per slice.
    others_cap = 1.25 * TRAIN_SIZE / len(data)

    for cat in data:
        # 0 -> article belongs to the target event, 1 -> any other event.
        is_other = int(cat['label'] != label)
        limit = others_cap if is_other else None

        # Shuffle a copy so the shared `data` structure is not mutated.
        articles = list(cat['articles'])
        shuffle(articles)

        picked = _select_texts(articles[:TRAIN_SIZE], limit)
        train_data += picked
        train_flags += [is_other] * len(picked)

        picked = _select_texts(articles[TRAIN_SIZE:], limit)
        test_data += picked
        test_flags += [is_other] * len(picked)

    if EVAL_INCLUDES_TRAIN:
        test_data += train_data
        test_flags += train_flags

    # Single conversion instead of numpy.append per sample (which copies the
    # whole array on every call); float dtype matches the original arrays.
    train_label = numpy.array(train_flags, dtype=float)
    test_label = numpy.array(test_flags, dtype=float)

    for c in classifiers:
        clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', c),
        ])
        clf.fit(train_data, train_label)

        pred_label = clf.predict(test_data)
        print(type(c).__name__, accuracy_score(test_label, pred_label))
        # Class 0 is the target event and class 1 is "others", matching target_names order.
        print(classification_report(test_label, pred_label, target_names = [label, 'others']))
        print(confusion_matrix(test_label, pred_label))

SGDClassifier 0.9788359788359788
             precision    recall  f1-score   support

   lasvegas       0.99      0.97      0.98        93
     others       0.97      0.99      0.98        96

avg / total       0.98      0.98      0.98       189

[[90  3]
 [ 1 95]]
MultinomialNB 0.9735449735449735
             precision    recall  f1-score   support

   lasvegas       0.98      0.97      0.97        93
     others       0.97      0.98      0.97        96

avg / total       0.97      0.97      0.97       189

[[90  3]
 [ 2 94]]
PassiveAggressiveClassifier 0.9735449735449735
             precision    recall  f1-score   support

   lasvegas       0.97      0.98      0.97        93
     others       0.98      0.97      0.97        96

avg / total       0.97      0.97      0.97       189

[[91  2]
 [ 3 93]]
KNeighborsClassifier 0.7301587301587301
             precision    recall  f1-score   support

   lasvegas       0.65      1.00      0.78        93
     others       1.00      0.47      

RandomForestClassifier 0.935064935064935
             precision    recall  f1-score   support

    kashmir       1.00      0.83      0.91        58
     others       0.91      1.00      0.95        96

avg / total       0.94      0.94      0.93       154

[[48 10]
 [ 0 96]]
LogisticRegression 0.987012987012987
             precision    recall  f1-score   support

    kashmir       0.98      0.98      0.98        58
     others       0.99      0.99      0.99        96

avg / total       0.99      0.99      0.99       154

[[57  1]
 [ 1 95]]
LinearSVC 0.9935064935064936
             precision    recall  f1-score   support

    kashmir       0.98      1.00      0.99        58
     others       1.00      0.99      0.99        96

avg / total       0.99      0.99      0.99       154

[[58  0]
 [ 1 95]]
SGDClassifier 0.9473684210526315
             precision    recall  f1-score   support

      haiti       0.93      0.97      0.95        94
     others       0.97      0.93      0.95        9

SGDClassifier 0.9685863874345549
             precision    recall  f1-score   support

     nevada       0.94      1.00      0.97        95
     others       1.00      0.94      0.97        96

avg / total       0.97      0.97      0.97       191

[[95  0]
 [ 6 90]]
MultinomialNB 0.9109947643979057
             precision    recall  f1-score   support

     nevada       0.96      0.85      0.91        95
     others       0.87      0.97      0.92        96

avg / total       0.92      0.91      0.91       191

[[81 14]
 [ 3 93]]
PassiveAggressiveClassifier 0.9267015706806283
             precision    recall  f1-score   support

     nevada       0.95      0.91      0.92        95
     others       0.91      0.95      0.93        96

avg / total       0.93      0.93      0.93       191

[[86  9]
 [ 5 91]]
KNeighborsClassifier 0.6596858638743456
             precision    recall  f1-score   support

     nevada       0.60      0.92      0.73        95
     others       0.83      0.41      