In [1]:
import sys
import glob
import numpy
from random import shuffle

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
def sz(a): # str/eval
    """Collect the fields of an article-like object into a plain dict.

    Parameters
    ----------
    a : object
        Must expose ``text``, ``title``, ``url``, ``keywords``, ``tags``,
        ``summary``, ``publish_date`` and ``html`` attributes.
        NOTE(review): this looks like a newspaper ``Article`` -- confirm.

    Returns
    -------
    dict
        Keys 'text', 'title', 'url', 'keywords', 'tags', 'summary', 'date'
        and 'raw'. 'date' is ``publish_date.timestamp()`` truncated to an
        int, or None when ``publish_date`` is None. 'raw' holds ``a.html``.

    The "str/eval" marker presumably means the dict is meant to round-trip
    through ``str()`` / ``eval()`` the way objWrite / objRead do.
    """
    published = a.publish_date
    return {
        'text': a.text,
        'title': a.title,
        'url': a.url,
        'keywords': a.keywords,
        'tags': a.tags,
        'summary': a.summary,
        # Identity test is the idiomatic None check and cannot be fooled by a
        # custom __ne__ on the date object.
        'date': int(published.timestamp()) if published is not None else None,
        'raw': a.html,
    }

In [4]:
def objRead(f):
    """Read the file at path ``f`` and return the Python object it spells out.

    The file is expected to contain a single Python expression, such as the
    ``str(data)`` text that objWrite produces.

    SECURITY: the content is passed to ``eval``, which executes arbitrary
    code. Only call this on files you created yourself.
    """
    # Context manager guarantees the handle is closed even if read() fails.
    with open(f, 'r') as handle:
        source = handle.read()
    # NOTE(review): eval is kept for compatibility with existing .lobj files;
    # consider ast.literal_eval if every stored value is a plain literal.
    return eval(source)

def objWrite(f, data):
    """Write ``str(data)`` to a '.lobj' file next to path ``f``.

    The extension of ``f`` (if any) is replaced by '.lobj'; directories in
    the path are left untouched.

    Parameters
    ----------
    f : str
        Path whose extension is swapped for '.lobj'.
    data : object
        Object whose ``str()`` form is stored.
    """
    import os  # local import so this function does not rely on another cell

    # splitext only strips the final extension of the last path component;
    # splitting on the first '.' broke paths such as './x.json' or '../a/x.json'.
    base, _ext = os.path.splitext(f)
    with open(base + '.lobj', 'w') as handle:
        handle.write(str(data)) # light obj

In [5]:
# Print the path of every .lobj file found under the current working
# directory, searching sub-directories recursively.
LOBJ_PATTERN = '**/*.lobj'
for f in glob.iglob(LOBJ_PATTERN, recursive=True):
    print(f)

other.lobj
disasters/shootings/sandyhook.lobj
disasters/shootings/orlando.lobj
disasters/shootings/lasvegas.lobj
disasters/shootings/sanbernardino.lobj
disasters/earthquakes/sumatra.lobj
disasters/earthquakes/haiti.lobj
disasters/earthquakes/sichuan.lobj
disasters/earthquakes/kashmir.lobj
disasters/wildfires/thomas.lobj
disasters/wildfires/attica.lobj
disasters/wildfires/mountcarmel.lobj
disasters/wildfires/nevada.lobj
disasters/hurricanes/jeanne.lobj
disasters/hurricanes/stan.lobj
disasters/hurricanes/maria.lobj
disasters/hurricanes/katrina.lobj


In [12]:
def _labelled(path, label):
    """Load the articles stored at ``path`` and pair them with ``label``."""
    return {'articles': objRead(path), 'label': label}


# One entry per event; the position of an entry in this list is what the
# later cells use as the numeric class id.
data = [
    _labelled('disasters/shootings/lasvegas.lobj', 'lasvegas'),
    _labelled('disasters/shootings/orlando.lobj', 'orlando'),
    _labelled('disasters/shootings/sandyhook.lobj', 'sandyhook'),
    _labelled('disasters/shootings/sanbernardino.lobj', 'sanbernardino'),
    _labelled('disasters/earthquakes/kashmir.lobj', 'kashmir'),
    _labelled('disasters/earthquakes/haiti.lobj', 'haiti'),
    _labelled('disasters/earthquakes/sumatra.lobj', 'sumatra'),
    _labelled('disasters/earthquakes/sichuan.lobj', 'sichuan'),
    # _labelled('disasters/wildfires/mountcarmel.lobj', 'mountcarmel'),  # only 19 articles
    _labelled('disasters/wildfires/attica.lobj', 'attica'),
    _labelled('disasters/wildfires/nevada.lobj', 'nevada'),
    _labelled('disasters/wildfires/thomas.lobj', 'thomas'),
    _labelled('disasters/hurricanes/maria.lobj', 'maria'),
    _labelled('disasters/hurricanes/katrina.lobj', 'katrina'),
    # _labelled('disasters/hurricanes/jeanne.lobj', 'jeanne'),  # only 35 articles
    # _labelled('disasters/hurricanes/stan.lobj', 'stan'),  # only 29 articles
]

# Class names in the same order as `data`; a name's index in this list is the
# integer class id stored in train_label / test_label.
labels = [x['label'] for x in data]

TRAIN_SIZE = 40  # articles per category cut off for training (counted before URL filtering)
SPLIT_SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same split


def _split_articles(categories, train_size, seed, excluded_url='washingtonpost'):
    """Shuffle every category and split its article texts into train/test sets.

    Parameters
    ----------
    categories : list of dict
        Each dict has an 'articles' list whose items are dicts with 'url' and
        'text' keys. A category's class id is its position in this list.
    train_size : int
        The first ``train_size`` shuffled articles of a category go to the
        training set, the remainder to the test set.
    seed : int
        Seed of the random generator that drives the shuffle.
    excluded_url : str
        Articles whose URL contains this substring are dropped from both sets.

    Returns
    -------
    tuple
        ``(train_texts, train_ids, test_texts, test_ids)``: two lists of
        article texts and two integer numpy arrays of class ids.
    """
    rng = numpy.random.RandomState(seed)
    train_texts, train_ids = [], []
    test_texts, test_ids = [], []
    for class_id, cat in enumerate(categories):
        articles = cat['articles']
        # Shuffle through an index permutation instead of in place: the source
        # lists stay untouched, so re-running the cell yields the same split.
        order = rng.permutation(len(articles))
        for position, index in enumerate(order):
            article = articles[index]
            # The URL filter runs after the train/test cut, so a category may
            # contribute fewer than `train_size` training articles.
            if excluded_url in article['url']:
                continue
            if position < train_size:
                train_texts.append(article['text'])
                train_ids.append(class_id)
            else:
                test_texts.append(article['text'])
                test_ids.append(class_id)
    # Build each label array once; numpy.append inside the loop copied the
    # whole array for every article and produced float class ids.
    return (train_texts, numpy.array(train_ids, dtype=int),
            test_texts, numpy.array(test_ids, dtype=int))


train_data, train_label, test_data, test_label = _split_articles(
    data, TRAIN_SIZE, SPLIT_SEED)

In [13]:
# Seed handed to every estimator that draws random numbers, so repeated runs
# of the notebook report the same scores.
RANDOM_STATE = 0

# Candidate models; each one is plugged into the same text pipeline later on.
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None,
                  random_state=RANDOM_STATE),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200,
                                random_state=RANDOM_STATE),
    # k is set to the number of categories in `data`.
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3,
                           random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE)
]

In [14]:
# Class ids in the order of `labels`. Passing them explicitly keeps every
# report row and confusion-matrix row/column tied to the right class name even
# when a class is absent from the test labels and the predictions.
class_ids = list(range(len(labels)))

# Train each candidate on bag-of-words counts re-weighted by TF-IDF, then
# print its accuracy, per-class report and confusion matrix on the test set.
for c in classifiers:
    clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', c),
    ])
    clf.fit(train_data, train_label)
    pred_label = clf.predict(test_data)
    print(type(c).__name__, accuracy_score(test_label, pred_label))
    print(classification_report(test_label, pred_label,
                                labels=class_ids, target_names=labels))
    print(confusion_matrix(test_label, pred_label, labels=class_ids))

SGDClassifier 0.9242424242424242
               precision    recall  f1-score   support

     lasvegas       0.86      0.89      0.88        55
      orlando       1.00      0.94      0.97        53
    sandyhook       0.96      0.96      0.96        57
sanbernardino       1.00      0.91      0.95        57
      kashmir       1.00      0.95      0.97        19
        haiti       1.00      0.91      0.95        54
      sumatra       0.97      0.91      0.94        43
      sichuan       0.98      0.98      0.98        54
       attica       0.77      0.98      0.86        60
       nevada       0.93      0.95      0.94        56
       thomas       0.97      0.83      0.89        41
        maria       0.83      0.86      0.84        57
      katrina       0.91      0.93      0.92        54

  avg / total       0.93      0.92      0.93       660

[[49  0  1  0  0  0  0  0  0  1  0  3  1]
 [ 2 50  0  0  0  0  0  0  1  0  0  0  0]
 [ 1  0 55  0  0  0  0  0  1  0  0  0  0]
 [ 1  0  0 52

LinearSVC 0.9196969696969697
               precision    recall  f1-score   support

     lasvegas       0.87      0.87      0.87        55
      orlando       1.00      0.94      0.97        53
    sandyhook       0.79      0.96      0.87        57
sanbernardino       0.82      0.95      0.88        57
      kashmir       1.00      0.95      0.97        19
        haiti       1.00      0.91      0.95        54
      sumatra       0.95      0.88      0.92        43
      sichuan       0.98      0.94      0.96        54
       attica       0.94      0.98      0.96        60
       nevada       0.90      0.95      0.92        56
       thomas       0.92      0.85      0.89        41
        maria       1.00      0.82      0.90        57
      katrina       0.94      0.93      0.93        54

  avg / total       0.93      0.92      0.92       660

[[48  0  5  0  0  0  0  0  0  2  0  0  0]
 [ 2 50  0  1  0  0  0  0  0  0  0  0  0]
 [ 1  0 55  0  0  0  0  0  1  0  0  0  0]
 [ 1  0  0 54  0 