In [1]:
import sys
import glob
import numpy

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
def sz(a): # str/eval
    return {
        'text': a.text,
        'title': a.title,
        'url': a.url,
        'keywords': a.keywords,
        'tags': a.tags,
        'summary': a.summary,
        'date': int(a.publish_date.timestamp()) if a.publish_date != None else None,
        'raw' : a.html
    }

In [4]:
def objRead(f):
    return eval(open(f, 'r').read())

def objWrite(f, data):
    f = f.split('.')[0]
    open(f + '.obj', 'w').write(str(data))

In [5]:
for f in glob.iglob('**/*.obj', recursive=True):
    print(f)

other.obj
disasters/shootings/lasvegas.obj
disasters/shootings/orlando.obj
disasters/shootings/sandyhook.obj
disasters/shootings/sanbernardino.obj
disasters/earthquakes/kashmir.obj
disasters/earthquakes/haiti.obj
disasters/earthquakes/sumatra.obj
disasters/earthquakes/sichuan.obj
disasters/wildfires/mountcarmel.obj
disasters/wildfires/attica.obj
disasters/wildfires/nevada.obj
disasters/wildfires/thomas.obj
disasters/hurricanes/maria.obj
disasters/hurricanes/katrina.obj
disasters/hurricanes/jeanne.obj
disasters/hurricanes/stan.obj


In [6]:
katrina = objRead('disasters/hurricanes/katrina.obj')
sandy = objRead('disasters/shootings/sandyhook.obj')
haiti = objRead('disasters/earthquakes/haiti.obj')
attica = objRead('disasters/wildfires/attica.obj')

In [7]:
# data = [ katrina, sandy, haiti, attica ]
data = [
    objRead('disasters/shootings/lasvegas.obj'),
    objRead('disasters/shootings/orlando.obj'),
    objRead('disasters/shootings/sandyhook.obj'),
    objRead('disasters/shootings/sanbernardino.obj'),
    objRead('disasters/earthquakes/kashmir.obj'),
    objRead('disasters/earthquakes/haiti.obj'),
    objRead('disasters/earthquakes/sumatra.obj'),
    objRead('disasters/earthquakes/sichuan.obj'),
    objRead('disasters/wildfires/mountcarmel.obj'),
    objRead('disasters/wildfires/attica.obj'),
    objRead('disasters/wildfires/nevada.obj'),
    objRead('disasters/wildfires/thomas.obj'),
    objRead('disasters/hurricanes/maria.obj'),
    objRead('disasters/hurricanes/katrina.obj'),
    objRead('disasters/hurricanes/jeanne.obj'),
    objRead('disasters/hurricanes/stan.obj')
]

train_data = []
train_label = numpy.array([])

test_data = []
test_label = numpy.array([])

TRAIN_SIZE = 30
l = 0

for cat in data:
    for x in cat[:TRAIN_SIZE]:
        train_data.append(x['text'])
        train_label = numpy.append(train_label, l)
    for x in cat[TRAIN_SIZE:]:
        test_data.append(x['text'])
        test_label = numpy.append(test_label, l)
    l += 1

In [8]:
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200),
    KNeighborsClassifier(n_neighbors=10),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3,
                           random_state=0),
    LogisticRegression(random_state=0),
    LinearSVC()
]

In [9]:
for c in classifiers:
    clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', c),
    ])
    clf.fit(train_data, train_label)
    acc = numpy.mean(clf.predict(test_data) == test_label)
    print(type(c).__name__, acc)

SGDClassifier 0.8854679802955665
MultinomialNB 0.8201970443349754
PassiveAggressiveClassifier 0.8731527093596059
KNeighborsClassifier 0.6724137931034483
RandomForestClassifier 0.8497536945812808
LogisticRegression 0.8731527093596059
LinearSVC 0.8854679802955665


In [10]:
confusion_matrix(clf.predict(test_data), test_label)

array([[49,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 58,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0, 64,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 1,  1,  0, 61,  1,  0,  1,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 27,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1, 58,  0,  0,  0,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 45,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  2, 62,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 68,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  2,  0,  0,  2,  0,  1, 57,  6,  2,  0,  0,  0],
       [ 1,  2,  0,  3,  0,  0,  0,  0,  0,  5, 41,  3,  0,  0,  0],
       [ 4,  1,  1,  1,  0,  0,  0,  0,  0,  1,  2, 58,  0,  0,  0],
       [ 7,  3,  3,  0,  0,  6,  4,  1,  1,  1,  1,  6, 68,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  3,  0],
       [ 0,  0,  0,  0,  0,  0,  0