In [1]:
import numpy
from calamity import Calamity

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import EnglishStemmer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
archive = Calamity('disaster-archive')

[+] New event stan had been added
[+] New event maria had been added
[+] New event matthew had been added
[+] New event katrina had been added
[+] New event irma had been added
[+] New event jeanne had been added
[+] New event kashmir had been added
[+] New event haiti had been added
[+] New event sumatra had been added
[+] New event sichuan had been added
[+] New event attica had been added
[+] New event thomas had been added
[+] New event nevada had been added
[+] New event carr had been added
[+] New event mountcarmel had been added
[+] New event virginiatech had been added
[+] New event sandyhook had been added
[+] New event aurora had been added
[+] New event sanbernardino had been added
[+] New event orlando had been added
[+] New event lasvegas had been added
[+] New event charliehebdo had been added
[+] New event japan18 had been added
[+] New event kerala had been added
[+] New event gujarat had been added
Object has been successfully loaded


In [4]:
archive.listTopics()

[('hurricanes', 'stan', 30),
 ('hurricanes', 'maria', 98),
 ('hurricanes', 'matthew', 100),
 ('hurricanes', 'katrina', 99),
 ('hurricanes', 'irma', 64),
 ('hurricanes', 'jeanne', 35),
 ('earthquakes', 'kashmir', 59),
 ('earthquakes', 'haiti', 100),
 ('earthquakes', 'sumatra', 86),
 ('earthquakes', 'sichuan', 100),
 ('wildfires', 'attica', 100),
 ('wildfires', 'thomas', 82),
 ('wildfires', 'nevada', 95),
 ('wildfires', 'carr', 100),
 ('wildfires', 'mountcarmel', 19),
 ('shootings', 'virginiatech', 100),
 ('shootings', 'sandyhook', 99),
 ('shootings', 'aurora', 100),
 ('shootings', 'sanbernardino', 97),
 ('shootings', 'orlando', 99),
 ('shootings', 'lasvegas', 100),
 ('shootings', 'charliehebdo', 100),
 ('floods', 'japan18', 101),
 ('floods', 'kerala', 100),
 ('floods', 'gujarat', 80)]

## MTopicMClassifier
---

In [5]:
CHUNK = 200
labels = [
    'hurricanes',
    'earthquakes',
    'wildfires',
    'shootings',
    'floods'
]

data = [archive.mergeTopicArticles([x], CHUNK) for x in labels]

In [6]:
train_data, test_data = [], []
train_label, test_label = numpy.array([]), numpy.array([])

TRAIN_SIZE = 100
l = 0

for cat in data:
    for x in cat[:TRAIN_SIZE]:
        train_data.append(x['text'])
        train_label = numpy.append(train_label, l)
    for x in cat[TRAIN_SIZE:]:
        test_data.append(x['text'])
        test_label = numpy.append(test_label, l)
    l += 1
    
test_data.extend(train_data)
test_label = numpy.append(test_label, train_label)

In [7]:
classifiers = [
    SGDClassifier(loss='hinge',
                  penalty='elasticnet',
                  alpha=1e-3,
                  max_iter=1000,
                  tol=None),
    MultinomialNB(),
    PassiveAggressiveClassifier(max_iter=200),
    KNeighborsClassifier(n_neighbors=len(data)),
    RandomForestClassifier(n_estimators=200,
                           max_depth=3),
    LogisticRegression(),
    LinearSVC()
]

In [8]:
stemmer = EnglishStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(text):
    return (stemmer.stem(word) for word in analyzer(text))

In [9]:
for c in classifiers:
    clf = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words,
                                             stop_words='english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf', c),
    ])
    clf.fit(train_data, train_label)
    pred_label = clf.predict(test_data)
    print(type(c).__name__, accuracy_score(test_label, pred_label))
    print(classification_report(test_label, pred_label, target_names= labels))
    print(confusion_matrix(test_label, pred_label))

SGDClassifier 0.973
             precision    recall  f1-score   support

 hurricanes       0.99      0.96      0.98       200
earthquakes       0.98      0.98      0.98       200
  wildfires       0.98      0.96      0.97       200
  shootings       0.93      1.00      0.97       200
     floods       0.98      0.96      0.97       200

avg / total       0.97      0.97      0.97      1000

[[192   3   1   3   1]
 [  1 196   0   1   2]
 [  0   1 192   6   1]
 [  0   0   0 200   0]
 [  0   0   3   4 193]]
MultinomialNB 0.96
             precision    recall  f1-score   support

 hurricanes       0.96      0.93      0.95       200
earthquakes       0.89      0.99      0.94       200
  wildfires       1.00      0.94      0.97       200
  shootings       0.98      0.99      0.98       200
     floods       0.98      0.95      0.97       200

avg / total       0.96      0.96      0.96      1000

[[186  11   0   1   2]
 [  1 198   0   0   1]
 [  2   6 187   4   1]
 [  1   1   0 198   0]
 [  3

## 1TopicvAll
---

In [10]:
CHUNK = 200
labels = [
    'hurricanes',
    'earthquakes',
    'wildfires',
    'shootings',
    'floods'
]

for label in labels:
    data = []
    pool = [[label], [x for x in labels if x != label]]
    for x in pool:
        data.append(archive.mergeTopicArticles(x, CHUNK))
    
    train_data, test_data = [], []
    train_label, test_label = numpy.array([]), numpy.array([])

    TRAIN_SIZE = 100
    l = 0
    for cat in data:
        for x in cat[:TRAIN_SIZE]:
            train_data.append(x['text'])
            train_label = numpy.append(train_label, l)
        for x in cat[TRAIN_SIZE:]:
            test_data.append(x['text'])
            test_label = numpy.append(test_label, l)
        l += 1
        
    test_data.extend(train_data)
    test_label = numpy.append(test_label, train_label)
                     
    for c in classifiers:
        clf = Pipeline([('vect', CountVectorizer(analyzer=stemmed_words,
                                                 stop_words='english')),
                        ('tfidf', TfidfTransformer()),
                        ('clf', c),
        ])
        clf.fit(train_data, train_label)
        pred_label = clf.predict(test_data)
        print(type(c).__name__, accuracy_score(test_label, pred_label))
        print(classification_report(test_label, pred_label, target_names= [label , 'others']))
        print(confusion_matrix(test_label, pred_label))
        

SGDClassifier 0.99
             precision    recall  f1-score   support

 hurricanes       0.99      0.99      0.99       200
     others       0.99      0.99      0.99       200

avg / total       0.99      0.99      0.99       400

[[198   2]
 [  2 198]]
MultinomialNB 0.97
             precision    recall  f1-score   support

 hurricanes       0.95      0.99      0.97       200
     others       0.99      0.94      0.97       200

avg / total       0.97      0.97      0.97       400

[[199   1]
 [ 11 189]]
PassiveAggressiveClassifier 0.9925
             precision    recall  f1-score   support

 hurricanes       0.99      0.99      0.99       200
     others       0.99      0.99      0.99       200

avg / total       0.99      0.99      0.99       400

[[198   2]
 [  1 199]]
KNeighborsClassifier 0.9225
             precision    recall  f1-score   support

 hurricanes       0.88      0.98      0.93       200
     others       0.98      0.86      0.92       200

avg / total       0.93  

RandomForestClassifier 0.955
             precision    recall  f1-score   support

     floods       0.99      0.92      0.95       200
     others       0.93      0.99      0.96       200

avg / total       0.96      0.95      0.95       400

[[184  16]
 [  2 198]]
LogisticRegression 0.9825
             precision    recall  f1-score   support

     floods       0.99      0.97      0.98       200
     others       0.98      0.99      0.98       200

avg / total       0.98      0.98      0.98       400

[[195   5]
 [  2 198]]
LinearSVC 0.99
             precision    recall  f1-score   support

     floods       0.99      0.98      0.99       200
     others       0.99      0.99      0.99       200

avg / total       0.99      0.99      0.99       400

[[197   3]
 [  1 199]]
