In [37]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the labelled tweets, keeping only the text and label columns under
# shorter names.
labelled = pd.read_csv('socialmedia-disaster-tweets-DFE.csv')[['text', 'choose_one']].rename(
    columns={'text': 'tweet', 'choose_one': 'class'}
)

# Keep only rows labelled with one of the two target classes, then renumber
# the rows from zero so positional slicing later lines up with the index.
has_target_label = labelled['class'].isin(['Relevant', 'Not Relevant'])
tweets = labelled[has_target_label].reset_index(drop=True)
tweets.tail()

Unnamed: 0,tweet,class
10855,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,Relevant
10856,Police investigating after an e-bike collided ...,Relevant
10857,The Latest: More Homes Razed by Northern Calif...,Relevant
10858,MEG issues Hazardous Weather Outlook (HWO) htt...,Relevant
10859,#CityofCalgary has activated its Municipal Eme...,Relevant


In [38]:
import re
def preprocessor(text):
    """Clean one tweet: strip markup, lowercase, and keep emoticons at the end.

    Steps, in order:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons (eyes ``:``, ``;`` or ``=``, optional ``-`` nose,
         mouth ``)``, ``(``, ``D`` or ``P``) from the tag-free text, before
         lowercasing so ``:D`` / ``:P`` keep their capital letter.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (noses removed), separated by spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, followed by any emoticons found.
    """
    # Raw strings: '\W', '\(' and '\)' are regex escapes, not string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned

    emoticon_tail = ' '.join(emoticons).replace('-', '')
    # Without a separator an emoticon would fuse onto the final word
    # (e.g. ' hello:)'); add one only when the text does not already end
    # with a space, so inputs that were already fine are unchanged.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + emoticon_tail

# Replace every tweet with its cleaned form produced by preprocessor().
# NOTE: the 'tweet' column is overwritten, so the raw text is no longer
# available afterwards and re-running this cell cleans already-cleaned text.
tweets['tweet'] = tweets['tweet'].apply(preprocessor)

In [39]:
import nltk

# Fetch the NLTK 'stopwords' corpus needed by the next cell; when the package
# is already installed NLTK just reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
from nltk.corpus import stopwords

# Stop-word list: NLTK's English stop words plus single-letter tokens.
# NOTE(review): the single-letter list has no 'u' (it runs a-t, then v-z),
# exactly as originally written - confirm whether that omission is intended.
stop = stopwords.words('english') + list(u'abcdefghijklmnopqrstvwxyz')

In [41]:
def split_into_lemmas(tweet):
    """Tokenize a tweet and return the lemmas of its non-stop-word tokens.

    Parameters
    ----------
    tweet : str or bytes
        Tweet text. Byte strings are decoded as UTF-8; text is used as is.

    Returns
    -------
    list
        ``word.lemma`` for every TextBlob word of the lowercased tweet that
        is not in the module-level ``stop`` list.
    """
    # Decode only real byte strings (Python 2 `str` / Python 3 `bytes`);
    # text that is already unicode must not be decoded a second time.
    if isinstance(tweet, bytes):
        tweet = tweet.decode('utf8')
    # Lowercase before tokenizing so tokens can match the lowercase stop list.
    words = TextBlob(tweet.lower()).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words if word not in stop]

# Sanity check: run the tokenizer on the last five tweets and display the
# resulting lemma lists.
tweets.tweet.tail().apply(split_into_lemmas)

10855    [m1, 94, 01, 04, utc, 5km, volcano, hawaii, ht...
10856    [police, investigating, bike, collided, car, l...
10857    [latest, home, razed, northern, california, wi...
10858    [meg, issue, hazardous, weather, outlook, hwo,...
10859    [cityofcalgary, activated, municipal, emergenc...
Name: tweet, dtype: object

In [None]:
%%time
# Learn the bag-of-words vocabulary from all tweets, using split_into_lemmas
# as the analyzer so each tweet is turned into its list of lemmas.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(tweets['tweet'])
# Vocabulary size; print(...) with one argument gives the same output on
# Python 2 and Python 3.
print(len(bow_transformer.vocabulary_))

In [None]:
%%time
# Turn every tweet into a row of token counts using the fitted vocabulary.
tweets_bow = bow_transformer.transform(tweets['tweet'])
# Each line is built as a single string so the printed text is identical
# under Python 2 and Python 3.
print('sparse matrix shape: %s' % (tweets_bow.shape,))
print('number of non-zeros: %s' % (tweets_bow.nnz,))
# NOTE: the value is the percentage of cells that are non-zero (nnz divided
# by rows * columns); the original label is kept unchanged.
print('sparsity: %.2f%%' % (100.0 * tweets_bow.nnz / (tweets_bow.shape[0] * tweets_bow.shape[1])))

In [None]:
# Number of leading rows used for training; every later row is held out for
# testing.
# NOTE(review): the split follows row order with no shuffling - confirm the
# CSV is not ordered in a way that biases the held-out rows.
TRAIN_SIZE = 8000

tweets_bow_train = tweets_bow[:TRAIN_SIZE]
tweets_bow_test = tweets_bow[TRAIN_SIZE:]
# .iloc makes the positional slicing explicit so labels stay aligned with the
# matrix rows above.
tweets_class_train = tweets['class'].iloc[:TRAIN_SIZE]
tweets_class_test = tweets['class'].iloc[TRAIN_SIZE:]

print(tweets_bow_train.shape)
print(tweets_bow_test.shape)
print(tweets_class_train.shape)

In [None]:
# Fit a multinomial naive Bayes classifier on the training rows and labels;
# the %time line magic reports how long the fit takes.
%time disaster_detect = MultinomialNB().fit(tweets_bow_train,tweets_class_train)

In [None]:
# Predict a class label for every held-out row; the bare name on the last
# line displays the resulting array as the cell output.
predictions = disaster_detect.predict(tweets_bow_test)
predictions

In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

# Score the held-out predictions. Each line is a single formatted string so
# the output is the same on Python 2 and Python 3 (in particular, no stray
# space is inserted after the newline that precedes the matrix).
print('accuracy %s' % (accuracy_score(tweets_class_test, predictions),))
print('confusion matrix\n%s' % (confusion_matrix(tweets_class_test, predictions),))
print('(row=expected, col=predicted)')

In [None]:
# Per-class precision / recall / F1 on the held-out rows; print(...) with a
# single argument gives the same output on Python 2 and Python 3.
print(classification_report(tweets_class_test, predictions))

In [None]:
def predict_tweet(tweet):
    """Print a tweet followed by the classifier's class probabilities.

    The text is first passed through ``preprocessor`` so that it is cleaned
    the same way as the tweets the vocabulary and model were fitted on;
    otherwise capitalised words would not match the lowercased vocabulary.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    None
        The original tweet and the output of ``predict_proba`` (rounded to
        five decimals) are printed, followed by a blank line.
    """
    new_sample = bow_transformer.transform([preprocessor(tweet)])
    probabilities = np.around(disaster_detect.predict_proba(new_sample), decimals=5)
    # One formatted string reproduces the original "tweet probs \n" layout
    # on both Python 2 and Python 3.
    print('%s %s \n' % (tweet, probabilities))

# Spot-check the classifier on a few hand-written sentences, in order.
sample_tweets = [
    'Cat stuck in a tree.',
    'Car accident. Major damage to property.',
    'I ate a sandwich last night.',
    'Somehow, Mr. Dreyfuss maintains his sound comic timing even when Frank Oz\'s antic direction calls for hand-waving hysteria.',
]
for sample in sample_tweets:
    predict_tweet(sample)