In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix as confmat
from sklearn.metrics import f1_score as f1
# from sklearn.metrics import plot_confusion_matrix as plot_confmat

# import re
# import string
import random
import importlib

# import nltk
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer

import torch
import preprocess
import dan

import gensim
# load dataset and preprocess
twt = pd.read_csv('train.csv')
twt = twt.set_index('id')
twt.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 0. Data Preprocessing

### 0.1 Data Cleaning

In [2]:
text = twt['text'].to_list()
prep_text = [preprocess.processing(i) for i in text]
print('{} tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.'.format(len(prep_text)))
twt['prepped'] = prep_text
twt.head()

7613 tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.


Unnamed: 0_level_0,keyword,location,text,target,prepped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, may, allah, forgiv..."
4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order..."
7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


### 0.2 Embedding

In [3]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

twt['vec'] = pd.Series([preprocess.tweet_vec(tweet, word2vec) for tweet in twt['prepped']], index = twt.index)

devtrain_idx = twt.loc[~twt['vec'].isna()].index.tolist()
random.shuffle(devtrain_idx)

train_p = 0.7
train_idx = devtrain_idx[:round(train_p*len(devtrain_idx))]
dev_idx = devtrain_idx[round(train_p*len(devtrain_idx)):]

train = [(vec, targ) for targ, vec in  zip(twt['target'][train_idx], twt['vec'][train_idx])]
dev = [(vec, targ) for targ, vec in  zip(twt['target'][dev_idx], twt['vec'][dev_idx])]

## 1. NN construction

In [36]:
importlib.reload(dan)
net = dan.Net()
net.train(train, dev)

[1,     1] loss: 0.695	Dev FI: 0.006
[1,     2] loss: 0.670	Dev FI: 0.504
[2,     1] loss: 0.616	Dev FI: 0.721
[2,     2] loss: 0.578	Dev FI: 0.731
[3,     1] loss: 0.561	Dev FI: 0.717
[3,     2] loss: 0.527	Dev FI: 0.722
[4,     1] loss: 0.528	Dev FI: 0.725
[4,     2] loss: 0.510	Dev FI: 0.733
[5,     1] loss: 0.523	Dev FI: 0.735
[5,     2] loss: 0.490	Dev FI: 0.735
[6,     1] loss: 0.526	Dev FI: 0.731
[6,     2] loss: 0.478	Dev FI: 0.732
[7,     1] loss: 0.521	Dev FI: 0.736
[7,     2] loss: 0.469	Dev FI: 0.734
[8,     1] loss: 0.516	Dev FI: 0.736
[8,     2] loss: 0.461	Dev FI: 0.737
[9,     1] loss: 0.515	Dev FI: 0.738
[9,     2] loss: 0.447	Dev FI: 0.740
[10,     1] loss: 0.518	Dev FI: 0.740
[10,     2] loss: 0.438	Dev FI: 0.735
[11,     1] loss: 0.517	Dev FI: 0.738
[11,     2] loss: 0.431	Dev FI: 0.735
[12,     1] loss: 0.515	Dev FI: 0.737
[12,     2] loss: 0.428	Dev FI: 0.732
[13,     1] loss: 0.515	Dev FI: 0.731
[13,     2] loss: 0.420	Dev FI: 0.730
[14,     1] loss: 0.516	Dev FI

## Dev Performance and Error Analysis

In [32]:
# dev[0]
ys, y_stars = net.get_eval_data(dev)
# confmat(ys, y_stars)
f1(ys, y_stars)

twt['pred'] = pd.Series(y_stars, index = dev_idx)


# for error analysis
twt[['keyword', 'location', 'text', 'prepped','vec', 'target','pred']].to_csv('error_analysis.csv')

## Create Test Submissions

In [35]:
# create vector of mean of all word vectors
mean_vec = np.zeros((300,1))
for vec_targ in train:
    mean_vec = np.add(mean_vec, vec_targ[0])
mean_vec = mean_vec/len(train)

# read in test data and preprocess tweets
test = pd.read_csv('test.csv')
proc_text = [preprocess.processing(i) for i in text]
targets = np.zeros((len(test), 1))
print('{} tweets read from test.csv'.format(test.shape[0]))


# test_data = [(tweet_vec(x, word2vec), y) if for x, y in zip(proc_test, targets)]
test['vec'] = pd.Series([preprocess.tweet_vec(tweet, word2vec) for tweet in test['text'].tolist()], index = test.index)

print('{} tweets processed in the test set.'.format(test.shape[0]))

test_data = test['vec'].tolist()


_, y_stars = net.get_eval_data(test_data, mode = 'test')

# create columns for submission data
id = test['id'].to_numpy()
target = y_stars

# create df with submission data and write to csv
submission = pd.DataFrame({'id': id, 'target': target})
submission.set_index('id').to_csv('submission.csv')

# also save analysis csv for reviewing decisions
test['prepped'] = proc_text
test['pred'] = target
test[['keyword', 'location', 'text', 'prepped','pred']].to_csv('test_error_analysis.csv')

3263 tweets read from test.csv
3263 tweets processed in the test set.
3263 3263


In [43]:
test[['id', 'pred']].set_index('id').to_csv('submission.csv')

## 2. Naive Bayes with Bag of Words

In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# create raw bow text and labels, shuffle
bow = twt['text'].to_numpy()
labels = twt['target'].to_numpy()

shuffle_idx = [x for x in range(len(bow))]
random.shuffle(shuffle_idx)

bow = bow[shuffle_idx]
labels = labels[shuffle_idx]

# train bag-of-words
count_vect = CountVectorizer()
x_bow = count_vect.fit(bow)

# create train dev split
train_bow = bow[:round(train_p*len(bow))]
dev_bow = bow[round(train_p*len(bow)):]
train_bow = count_vect.transform(train_bow)
dev_bow = count_vect.transform(dev_bow)

train_bow_labels = labels[:round(train_p*len(bow))]
dev_bow_lables = labels[round(train_p*len(bow)):]

print('Created {} bag-of-words vectorizations.'.format(train_bow.shape[0] + dev_bow.shape[0]))
clf = MultinomialNB().fit(train_bow, train_bow_labels)
print('Naive Bayes training complete.')

# get dev performances
ys = clf.predict(dev_bow)

# dev performance
print('Naive Bayes dev performance {}'.format(f1(ys, dev_bow_lables)))

Created 7613 bag-of-words vectorizations.
Naive Bayes training complete.
Naive Bayes dev performance 0.7500000000000001


## Missingness Exploration

We suspect that location data can be useful for predicting disasters. However, we have a sizable proportion of the data for which the location data is missing. In order to address this issue, we needed to first identify whether the data is missing completely at random, missing at random, or not missing at random. For the data to be MCAR, the mechanism of missingness of the data must be independent of all other observed features of the data. These other observed features, in our case, include the text of the tweet. Reviewing the Twitter policy on tweet-level location data, we found that twitter users must opt in to location services to embed location data in there tweets, as well as intentionally include location data on each desired tweet. This policy gives us an insight into some mechanisms of missingness - users forget that they can include location data, or the intentionally elect not to for some tweets. This second mechanism would preclude categorization as missing completely at random. Furthermore, as users of social media, the authors have firsthand experience with the relationship between the location from which a tweet was sent, and the inclusion of location data. Twitter users may want to inform followers that they are tweeting from an impressive or otherwise unusual location - a famous concert venue, or a historic city, or even the site of a terrible natural disaster - a decision surely not made at random. Being Missing Not a Random, our options for mitigating location data missingness are few. We could drop the data altogether, but decided that appending it to the contents of the tweet would be an acceptable approach that prevents information loss.
