In [1]:
import numpy
from calamity import Calamity

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import EnglishStemmer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
archive = Calamity('disaster-archive')

[+] New event stan had been added
[+] New event maria had been added
[+] New event matthew had been added
[+] New event katrina had been added
[+] New event irma had been added
[+] New event jeanne had been added
[+] New event kashmir had been added
[+] New event haiti had been added
[+] New event sumatra had been added
[+] New event sichuan had been added
[+] New event attica had been added
[+] New event thomas had been added
[+] New event nevada had been added
[+] New event carr had been added
[+] New event mountcarmel had been added
[+] New event virginiatech had been added
[+] New event sandyhook had been added
[+] New event aurora had been added
[+] New event sanbernardino had been added
[+] New event orlando had been added
[+] New event lasvegas had been added
[+] New event charliehebdo had been added
[+] New event japan18 had been added
[+] New event kerala had been added
[+] New event gujarat had been added
Object has been successfully loaded


In [4]:
archive.listTopics()

[('hurricanes', 'stan', 30),
 ('hurricanes', 'maria', 98),
 ('hurricanes', 'matthew', 100),
 ('hurricanes', 'katrina', 99),
 ('hurricanes', 'irma', 64),
 ('hurricanes', 'jeanne', 35),
 ('earthquakes', 'kashmir', 59),
 ('earthquakes', 'haiti', 100),
 ('earthquakes', 'sumatra', 86),
 ('earthquakes', 'sichuan', 100),
 ('wildfires', 'attica', 100),
 ('wildfires', 'thomas', 82),
 ('wildfires', 'nevada', 95),
 ('wildfires', 'carr', 100),
 ('wildfires', 'mountcarmel', 19),
 ('shootings', 'virginiatech', 100),
 ('shootings', 'sandyhook', 99),
 ('shootings', 'aurora', 100),
 ('shootings', 'sanbernardino', 97),
 ('shootings', 'orlando', 99),
 ('shootings', 'lasvegas', 100),
 ('shootings', 'charliehebdo', 100),
 ('floods', 'japan18', 101),
 ('floods', 'kerala', 100),
 ('floods', 'gujarat', 80)]

## MEventMClassifier
---

In [5]:
labels = [
#     'stan',
    'maria',
    'matthew',
    'katrina',
#     'irma',
#     'jeanne',
#     'kashmir',
    'haiti',
    'sumatra',
    'sichuan',
    'attica',
    'thomas',
    'nevada',
    'carr',
#     'mountcarmel',
    'virginiatech',
    'sandyhook',
    'aurora',
    'sanbernardino',
    'orlando',
    'lasvegas',
    'charliehebdo',
    'japan18',
    'kerala',
    'gujarat'
]

In [6]:
CHUNK = 100
data = [archive.mergeEventArticles([x], CHUNK) for x in labels]

train_data, test_data = [], []
train_label, test_label = numpy.array([]), numpy.array([])

TRAIN_SIZE = 50
l = 0

for cat in data:
    for x in cat[:TRAIN_SIZE]:
        train_data.append(x['text'])
        train_label = numpy.append(train_label, l)
    for x in cat[TRAIN_SIZE:]:
        test_data.append(x['text'])
        test_label = numpy.append(test_label, l)
    l += 1

test_data.extend(train_data)
test_label = numpy.append(test_label, train_label)

In [7]:
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200),
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3),
    LogisticRegression(),
    LinearSVC()
]

In [8]:
stemmer = EnglishStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(text):
    return (stemmer.stem(word) for word in analyzer(text))

In [9]:
for c in classifiers:
    clf = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words,
                                             stop_words='english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf', c),
    ])
    clf.fit(train_data, train_label)
    pred_label = clf.predict(test_data)
    print(type(c).__name__, accuracy_score(test_label, pred_label))
    print(classification_report(test_label, pred_label, target_names = labels))
    print(confusion_matrix(test_label, pred_label))

SGDClassifier 0.9772609819121447
               precision    recall  f1-score   support

        maria       1.00      0.98      0.99        98
      matthew       0.95      0.93      0.94       100
      katrina       0.99      1.00      0.99        99
        haiti       0.93      0.99      0.96       100
      sumatra       1.00      0.95      0.98        86
      sichuan       0.99      1.00      1.00       100
       attica       0.99      0.96      0.97       100
       thomas       0.97      0.93      0.95        82
       nevada       0.94      0.97      0.95        95
         carr       0.93      0.98      0.96       100
 virginiatech       0.94      0.99      0.97       100
    sandyhook       0.98      0.99      0.98        99
       aurora       0.99      0.99      0.99       100
sanbernardino       0.99      0.98      0.98        97
      orlando       1.00      0.99      0.99        99
     lasvegas       0.99      0.98      0.98       100
 charliehebdo       1.00      0

KNeighborsClassifier 0.9235142118863049
               precision    recall  f1-score   support

        maria       0.97      0.95      0.96        98
      matthew       0.74      0.88      0.80       100
      katrina       0.91      0.93      0.92        99
        haiti       0.93      0.92      0.92       100
      sumatra       0.88      0.93      0.90        86
      sichuan       0.92      0.98      0.95       100
       attica       1.00      0.78      0.88       100
       thomas       0.99      0.85      0.92        82
       nevada       0.95      0.79      0.86        95
         carr       0.77      0.97      0.86       100
 virginiatech       0.93      0.96      0.95       100
    sandyhook       0.84      0.93      0.88        99
       aurora       0.99      0.93      0.96       100
sanbernardino       0.98      0.96      0.97        97
      orlando       1.00      0.96      0.98        99
     lasvegas       0.95      0.98      0.97       100
 charliehebdo       0.98

## 1EventvAll
---

In [10]:
CHUNK = 100

for label in labels:
    data = []
    pool = [[label], [x for x in labels if x != label]]
    for x in pool:
        data.append(archive.mergeEventArticles(x, CHUNK))
    
    train_data, test_data = [], []
    train_label, test_label = numpy.array([]), numpy.array([])

    TRAIN_SIZE = 50
    l = 0
    for cat in data:
        for x in cat[:TRAIN_SIZE]:
            train_data.append(x['text'])
            train_label = numpy.append(train_label, l)
        for x in cat[TRAIN_SIZE:]:
            test_data.append(x['text'])
            test_label = numpy.append(test_label, l)
        l += 1
        
    test_data.extend(train_data)
    test_label = numpy.append(test_label, train_label)
                     
    for c in classifiers:
        clf = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words,
                                                 stop_words='english')),
                        ('tfidf', TfidfTransformer()),
                        ('clf', c),
        ])
        clf.fit(train_data, train_label)
        pred_label = clf.predict(test_data)
        print(type(c).__name__, accuracy_score(test_label, pred_label))
        print(classification_report(test_label, pred_label, target_names= [label , 'others']))
        print(confusion_matrix(test_label, pred_label))
        

SGDClassifier 0.9797979797979798
             precision    recall  f1-score   support

      maria       0.99      0.97      0.98        98
     others       0.97      0.99      0.98       100

avg / total       0.98      0.98      0.98       198

[[95  3]
 [ 1 99]]
MultinomialNB 0.8484848484848485
             precision    recall  f1-score   support

      maria       0.77      1.00      0.87        98
     others       1.00      0.70      0.82       100

avg / total       0.88      0.85      0.85       198

[[98  0]
 [30 70]]
PassiveAggressiveClassifier 0.9747474747474747
             precision    recall  f1-score   support

      maria       0.97      0.98      0.97        98
     others       0.98      0.97      0.97       100

avg / total       0.97      0.97      0.97       198

[[96  2]
 [ 3 97]]
KNeighborsClassifier 0.5050505050505051
             precision    recall  f1-score   support

      maria       0.50      1.00      0.67        98
     others       1.00      0.02      

KNeighborsClassifier 0.478494623655914
             precision    recall  f1-score   support

    sumatra       0.47      1.00      0.64        86
     others       1.00      0.03      0.06       100

avg / total       0.75      0.48      0.33       186

[[86  0]
 [97  3]]
RandomForestClassifier 0.9516129032258065
             precision    recall  f1-score   support

    sumatra       0.98      0.92      0.95        86
     others       0.93      0.98      0.96       100

avg / total       0.95      0.95      0.95       186

[[79  7]
 [ 2 98]]
LogisticRegression 0.967741935483871
             precision    recall  f1-score   support

    sumatra       0.95      0.98      0.97        86
     others       0.98      0.96      0.97       100

avg / total       0.97      0.97      0.97       186

[[84  2]
 [ 4 96]]
LinearSVC 0.978494623655914
             precision    recall  f1-score   support

    sumatra       0.98      0.98      0.98        86
     others       0.98      0.98      0.98   

LinearSVC 0.9692307692307692
             precision    recall  f1-score   support

     nevada       0.96      0.98      0.97        95
     others       0.98      0.96      0.97       100

avg / total       0.97      0.97      0.97       195

[[93  2]
 [ 4 96]]
SGDClassifier 0.965
             precision    recall  f1-score   support

       carr       0.96      0.97      0.97       100
     others       0.97      0.96      0.96       100

avg / total       0.97      0.96      0.96       200

[[97  3]
 [ 4 96]]
MultinomialNB 0.93
             precision    recall  f1-score   support

       carr       0.88      1.00      0.93       100
     others       1.00      0.86      0.92       100

avg / total       0.94      0.93      0.93       200

[[100   0]
 [ 14  86]]
PassiveAggressiveClassifier 0.965
             precision    recall  f1-score   support

       carr       0.95      0.98      0.97       100
     others       0.98      0.95      0.96       100

avg / total       0.97      0.9

PassiveAggressiveClassifier 0.9898477157360406
               precision    recall  f1-score   support

sanbernardino       1.00      0.98      0.99        97
       others       0.98      1.00      0.99       100

  avg / total       0.99      0.99      0.99       197

[[ 95   2]
 [  0 100]]
KNeighborsClassifier 0.5888324873096447
               precision    recall  f1-score   support

sanbernardino       0.55      0.99      0.70        97
       others       0.95      0.20      0.33       100

  avg / total       0.75      0.59      0.51       197

[[96  1]
 [80 20]]
RandomForestClassifier 0.9695431472081218
               precision    recall  f1-score   support

sanbernardino       0.99      0.95      0.97        97
       others       0.95      0.99      0.97       100

  avg / total       0.97      0.97      0.97       197

[[92  5]
 [ 1 99]]
LogisticRegression 0.9796954314720813
               precision    recall  f1-score   support

sanbernardino       0.99      0.97      0.98   

LogisticRegression 0.97
             precision    recall  f1-score   support

    japan18       0.99      0.95      0.97       100
     others       0.95      0.99      0.97       100

avg / total       0.97      0.97      0.97       200

[[95  5]
 [ 1 99]]
LinearSVC 0.985
             precision    recall  f1-score   support

    japan18       1.00      0.97      0.98       100
     others       0.97      1.00      0.99       100

avg / total       0.99      0.98      0.98       200

[[ 97   3]
 [  0 100]]
SGDClassifier 0.995
             precision    recall  f1-score   support

     kerala       1.00      0.99      0.99       100
     others       0.99      1.00      1.00       100

avg / total       1.00      0.99      0.99       200

[[ 99   1]
 [  0 100]]
MultinomialNB 0.925
             precision    recall  f1-score   support

     kerala       0.87      1.00      0.93       100
     others       1.00      0.85      0.92       100

avg / total       0.93      0.93      0.92       