In [20]:
import sys
import glob
import numpy
from random import shuffle

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import EnglishStemmer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [22]:
def sz(a): # str/eval
    return {
        'text': a.text,
        'title': a.title,
        'url': a.url,
        'keywords': a.keywords,
        'tags': a.tags,
        'summary': a.summary,
        'date': int(a.publish_date.timestamp()) if a.publish_date != None else None,
        'raw' : a.html
    }

In [23]:
def objRead(f):
    return eval(open(f, 'r').read())

def objWrite(f, data):
    f = f.split('.')[0]
    open(f + '.lobj', 'w').write(str(data)) # light obj

In [24]:
for f in glob.iglob('**/*.lobj', recursive=True):
    print(f)

other.lobj
disasters/shootings/sandyhook.lobj
disasters/shootings/orlando.lobj
disasters/shootings/lasvegas.lobj
disasters/shootings/sanbernardino.lobj
disasters/earthquakes/sumatra.lobj
disasters/earthquakes/haiti.lobj
disasters/earthquakes/sichuan.lobj
disasters/earthquakes/kashmir.lobj
disasters/wildfires/thomas.lobj
disasters/wildfires/attica.lobj
disasters/wildfires/mountcarmel.lobj
disasters/wildfires/nevada.lobj
disasters/hurricanes/jeanne.lobj
disasters/hurricanes/stan.lobj
disasters/hurricanes/maria.lobj
disasters/hurricanes/katrina.lobj


In [25]:
data = [
    { 'articles': objRead('disasters/shootings/lasvegas.lobj'), 'label': 'lasvegas' },
    { 'articles': objRead('disasters/shootings/orlando.lobj'), 'label': 'orlando' },
    { 'articles': objRead('disasters/shootings/sandyhook.lobj'), 'label': 'sandyhook' },
    { 'articles': objRead('disasters/shootings/sanbernardino.lobj'), 'label': 'sanbernardino' },
    { 'articles': objRead('disasters/earthquakes/kashmir.lobj'), 'label': 'kashmir' },
    { 'articles': objRead('disasters/earthquakes/haiti.lobj'), 'label': 'haiti' },
    { 'articles': objRead('disasters/earthquakes/sumatra.lobj'), 'label': 'sumatra' },
    { 'articles': objRead('disasters/earthquakes/sichuan.lobj'), 'label': 'sichuan' },
#     { 'articles': objRead('disasters/wildfires/mountcarmel.lobj'), 'label': 'mountcarmel' }, # only 19 articles
    { 'articles': objRead('disasters/wildfires/attica.lobj'), 'label': 'attica' },
    { 'articles': objRead('disasters/wildfires/nevada.lobj'), 'label': 'nevada' },
    { 'articles': objRead('disasters/wildfires/thomas.lobj'), 'label': 'thomas' },
    { 'articles': objRead('disasters/hurricanes/maria.lobj'), 'label': 'maria' },
    { 'articles': objRead('disasters/hurricanes/katrina.lobj'), 'label': 'katrina' }
#     { 'articles': objRead('disasters/hurricanes/jeanne.lobj'), 'label': 'jeanne' }, # only 35 articles
#     { 'articles': objRead('disasters/hurricanes/stan.lobj'), 'label': 'stan'} # only 29 articles
]

labels = [x['label'] for x in data]

In [26]:
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200),
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3),
    LogisticRegression(),
    LinearSVC()
]

In [39]:
stemmer = EnglishStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(text):
    return (stemmer.stem(word) for word in analyzer(text))

# stem_vectorizer = CountVectorizer(analyzer=stemmed_words)
# print(stem_vectorizer.fit_transform(['analyst does analysis']))
# print(stem_vectorizer.get_feature_names())

In [38]:
for label in labels:    
    train_data, test_data = [], []
    train_label, test_label = numpy.array([]), numpy.array([])

    TRAIN_SIZE = 40

    for cat in data:
        l = int(cat['label'] != label)
        shuffle(cat['articles'])
        
        s = 0
        for x in cat['articles'][:TRAIN_SIZE]:
            if 'washingtonpost' in x['url']:
                continue
                
            if l:
                if s > 1.25 * TRAIN_SIZE / len(data):
                    break
                s += 1
                
            train_data.append(x['text'])
            train_label = numpy.append(train_label, l)
            
        s = 0
        for x in cat['articles'][TRAIN_SIZE:]:
            if 'washingtonpost' in x['url']:
                continue
                
            if l:
                if s > 1.25 * TRAIN_SIZE / len(data):
                    break
                s += 1
                
            test_data.append(x['text'])
            test_label = numpy.append(test_label, l)
            
    test_data += train_data
    test_label = numpy.append(test_label, train_label)
    
    for c in classifiers:
        clf = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words,
                                                 stop_words='english')),
                        ('tfidf', TfidfTransformer()),
                        ('clf', c),
        ])
        clf.fit(train_data, train_label)
                
        pred_label = clf.predict(test_data)
        print(type(c).__name__, accuracy_score(test_label, pred_label))
        print(classification_report(test_label, pred_label, target_names = [label, 'others']))
        print(confusion_matrix(test_label, pred_label))

SGDClassifier 0.9259259259259259
             precision    recall  f1-score   support

   lasvegas       1.00      0.85      0.92        93
     others       0.87      1.00      0.93        96

avg / total       0.94      0.93      0.93       189

[[79 14]
 [ 0 96]]
MultinomialNB 0.9153439153439153
             precision    recall  f1-score   support

   lasvegas       0.96      0.86      0.91        93
     others       0.88      0.97      0.92        96

avg / total       0.92      0.92      0.92       189

[[80 13]
 [ 3 93]]
PassiveAggressiveClassifier 0.9312169312169312
             precision    recall  f1-score   support

   lasvegas       1.00      0.86      0.92        93
     others       0.88      1.00      0.94        96

avg / total       0.94      0.93      0.93       189

[[80 13]
 [ 0 96]]
KNeighborsClassifier 0.7619047619047619
             precision    recall  f1-score   support

   lasvegas       0.69      0.94      0.79        93
     others       0.90      0.59      

KNeighborsClassifier 0.525974025974026
             precision    recall  f1-score   support

    kashmir       0.44      1.00      0.61        58
     others       1.00      0.24      0.39        96

avg / total       0.79      0.53      0.47       154

[[58  0]
 [73 23]]
RandomForestClassifier 0.9675324675324676
             precision    recall  f1-score   support

    kashmir       1.00      0.91      0.95        58
     others       0.95      1.00      0.97        96

avg / total       0.97      0.97      0.97       154

[[53  5]
 [ 0 96]]
LogisticRegression 0.9935064935064936
             precision    recall  f1-score   support

    kashmir       1.00      0.98      0.99        58
     others       0.99      1.00      0.99        96

avg / total       0.99      0.99      0.99       154

[[57  1]
 [ 0 96]]
LinearSVC 1.0
             precision    recall  f1-score   support

    kashmir       1.00      1.00      1.00        58
     others       1.00      1.00      1.00        96

avg 

LinearSVC 0.9744897959183674
             precision    recall  f1-score   support

     attica       1.00      0.95      0.97       100
     others       0.95      1.00      0.97        96

avg / total       0.98      0.97      0.97       196

[[95  5]
 [ 0 96]]
SGDClassifier 0.9738219895287958
             precision    recall  f1-score   support

     nevada       0.97      0.98      0.97        95
     others       0.98      0.97      0.97        96

avg / total       0.97      0.97      0.97       191

[[93  2]
 [ 3 93]]
MultinomialNB 0.9214659685863874
             precision    recall  f1-score   support

     nevada       0.94      0.89      0.92        95
     others       0.90      0.95      0.92        96

avg / total       0.92      0.92      0.92       191

[[85 10]
 [ 5 91]]
PassiveAggressiveClassifier 0.9424083769633508
             precision    recall  f1-score   support

     nevada       0.98      0.91      0.94        95
     others       0.91      0.98      0.94       