In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix as confmat
from sklearn.metrics import f1_score as f1
# from sklearn.metrics import plot_confusion_matrix as plot_confmat

import re
import string
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import gensim
#load dataset and preprocess
twt = pd.read_csv('train.csv')
twt = twt.set_index('id')
twt.shape
print(twt.info())
twt.loc[~twt['keyword'].isna()]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(1), object(3)
memory usage: 297.4+ KB
None


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


In [8]:
twt.loc[10844, 'target']

1

## 0. Data Preprocessing

### 0.1 Data Cleaning

In [15]:
stop_words = stopwords.words('english')
porter = PorterStemmer()
def processing (sentence):
    result = sentence.lower() #Lower case 
    result = re.sub(r'\d+', '', result) #Removing numbers
    result = result.translate(str.maketrans('', '', string.punctuation)) #Remove weird characters
    result = result.strip() #Eliminate blanks from begining and end of setences
    result = result.split() #Separate into words
    result = [w for w in result if not w in stop_words] #Eliminate stop_words
    result = [porter.stem(word) for word in result] #Stem Words
    return (result)

text = list(twt["text"])
len(text)

prep_text = [processing(i) for i in text]
print('{} tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.'.format(len(prep_text)))
twt['prepped'] = prep_text

7613 tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.


In [28]:
# twt.loc[twt['keyword'].any()),]
print(twt.shape)
twt.dropna()
print(twt.shape)
# twt['prepped'].isna().any()


(7613, 5)
(7613, 5)


Unnamed: 0_level_0,keyword,location,text,target,prepped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]"
4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]"
5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, shelter, place, notifi, offic, ev..."
6,,,"13,000 people receive #wildfires evacuation or...",1,"[peopl, receiv, wildfir, evacu, order, califor..."
7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi..."
...,...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, hold, bridg, collaps, near..."
10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrari, thetawniest, control, wild, fire,..."
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[utckm, volcano, hawaii, httptcozdtoydebj]"
10872,,,Police investigating after an e-bike collided ...,1,"[polic, investig, ebik, collid, car, littl, po..."


### 0.2 Embedding

In [96]:
# word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

def tweet_vec (tweet, word2vec):
    word_vecs = [word2vec.get_vector(w) for w in tweet if w in word2vec.vocab]
#     print(tweet)
#     print('Number of words: {}'.format(len(word_vecs)))
    if len(word_vecs) >= 1:
        return np.stack(word_vecs).mean(0)
    else:
        return None

twt['vec'] = pd.Series([tweet_vec(tweet, word2vec) for tweet in twt['prepped']], index = twt.index)

devtrain_idx = twt.loc[~twt['vec'].isna()].index.tolist()
random.shuffle(devtrain_idx)

train_idx = devtrain_idx[:round(train_p*len(devtrain_idx))]
dev_idx = devtrain_idx[round(train_p*len(devtrain_idx)):]

train = [(vec, targ) for targ, vec in  zip(twt['target'][train_idx], twt['vec'][train_idx])]
dev = [(vec, targ) for targ, vec in  zip(twt['target'][dev_idx], twt['vec'][dev_idx])]

## 1. NN construction

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(300,300)  # 6*6 from image dimension
        self.fc2 = nn.Linear(300, 2)
        self.softmax =  nn.Softmax(dim = 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
    def get_eval_data(self, data):
        dataloader = torch.utils.data.DataLoader(data, batch_size = 1)

        y_stars = []
        ys = [vec_targ[1] for vec_targ in data]

        for i, data in enumerate(dataloader, 0):
            x, _ = data
            # print(x)
            output = self.forward(x).detach().numpy()[0]
            y_star = np.argmax(output)
            #print(y_star)
            y_stars.append(y_star)
        
        return ys, y_stars

    
trainloader = torch.utils.data.DataLoader(train, batch_size = 5000)
net = Net()
# print(net.parameters())
criterion = nn.CrossEntropyLoss()
# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=0.01)
for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
#         print(inputs)
#         print(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
#         print(outputs)
        loss = criterion(outputs, labels)
#         print(loss)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 1 == 0: # print every 2000 mini-batches
            ys, y_stars = net.get_eval_data(dev)
            print('[%d, %5d] loss: %.3f\tDev FI: %.3f' % (epoch + 1, i + 1, running_loss, f1(ys, y_stars)))
            running_loss = 0.0

print('Finished Training')
print(net.parameters())

[1,     1] loss: 0.691	Dev FI: 0.002
[1,     2] loss: 0.650	Dev FI: 0.145
[2,     1] loss: 0.636	Dev FI: 0.712
[2,     2] loss: 0.591	Dev FI: 0.712
[3,     1] loss: 0.576	Dev FI: 0.707
[3,     2] loss: 0.538	Dev FI: 0.698
[4,     1] loss: 0.552	Dev FI: 0.709
[4,     2] loss: 0.515	Dev FI: 0.710
[5,     1] loss: 0.541	Dev FI: 0.718
[5,     2] loss: 0.503	Dev FI: 0.716
[6,     1] loss: 0.538	Dev FI: 0.711
[6,     2] loss: 0.489	Dev FI: 0.705
[7,     1] loss: 0.537	Dev FI: 0.710
[7,     2] loss: 0.477	Dev FI: 0.713
[8,     1] loss: 0.535	Dev FI: 0.715
[8,     2] loss: 0.467	Dev FI: 0.717
[9,     1] loss: 0.532	Dev FI: 0.717
[9,     2] loss: 0.459	Dev FI: 0.717
[10,     1] loss: 0.530	Dev FI: 0.719
[10,     2] loss: 0.450	Dev FI: 0.715
Finished Training
<generator object Module.parameters at 0x00000203BF5A40C8>


## Dev Performance and Error Analysis

In [102]:
# dev[0]
ys, y_stars = net.get_eval_data(dev)
# confmat(ys, y_stars)
f1(ys, y_stars)

twt['pred'] = pd.Series(y_stars, index = dev_idx)

# for error analysis
twt.loc[dev_idx,].to_csv('error_analysis.csv')

## Create Test Submissions

In [26]:
# create vector of mean of all word vectors
mean_vec = np.zeros((300,1))
for vec_targ in train:
    mean_vec = np.add(mean_vec, vec_targ[0])
mean_vec = mean_vec/len(final_data)

# read in test data and preprocess tweets
test = pd.read_csv('test.csv')
text = list(test["text"])
proc_text = [processing(i) for i in text]
targets = np.zeros((len(test), 1))
print('{} tweets read from test.csv'.format(test.shape[0]))

test_data = []
counter = 0

# replace empty vectors with mean_vec
for x, y in zip(prep_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        test_data.append((vec, y))
    else:
        test_data.append((mean_vec, y))
        counter += 0
        
print('{} tweets processed in the test set.'.format(len(test_data)))
print('{} empty tweets replaced with mean vector'.format(counter))

testloader = torch.utils.data.DataLoader(test_data, batch_size = 1)

y_stars = []

for i, data in enumerate(testloader, 0):
    x, y = data
    #print(x.dtype)
    #x_dub = torch.Tensor(x, dtype = 'double')
    #print(x.dtype)
    output = net.forward(x.float()).detach().numpy()[0]
    y_star = np.argmax(output)
    #print(y_star)
    y_stars.append(y_star)

# create columns for submission data
id = test['id'].to_numpy()
target = y_stars

# create df with submission data and write to csv
submission = pd.DataFrame({'id': id, 'target': target})

3263 tweets read from test.csv
3263 tweets processed in the test set.
0 empty tweets replaced with mean vector




In [30]:
submission = submission.set_index('id')
submission.to_csv('submission_3_19.csv')

## 2. Naive Bayes with Bag of Words

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# create raw bow text and labels, shuffle
bow = twt['text'].to_numpy()
labels = twt['target'].to_numpy()

shuffle_idx = [x for x in range(len(bow))]
random.shuffle(shuffle_idx)

bow = bow[shuffle_idx]
labels = labels[shuffle_idx]

# train bag-of-words
count_vect = CountVectorizer()
x_bow = count_vect.fit(bow)

# create train dev split
train_bow = bow[:round(train_p*len(bow))]
dev_bow = bow[round(train_p*len(bow)):]
train_bow = count_vect.transform(train_bow)
dev_bow = count_vect.transform(dev_bow)

train_bow_labels = labels[:round(train_p*len(bow))]
dev_bow_lables = labels[round(train_p*len(bow)):]

print('Created {} bag-of-words vectorizations.'.format(train_bow.shape[0] + dev_bow.shape[0]))
clf = MultinomialNB().fit(train_bow, train_bow_labels)
print('Naive Bayes training complete.')

# get dev performances
ys = clf.predict(dev_bow)

# dev performance
print('Naive Bayes dev performance {}'.format(f1(ys, dev_bow_lables)))

Created 7613 bag-of-words vectorizations.
Naive Bayes training complete.
Naive Bayes dev performance 0.7468085106382979


## Missingness Exploration

We suspect that location data can be useful for predicting disasters. However, we have a sizable proportion of the data for which the location data is missing. In order to address this issue, we needed to first identify whether the data is missing completely at random, missing at random, or not missing at random. For the data to be MCAR, the mechanism of missingness of the data must be independent of all other observed features of the data. These other observed features, in our case, include the text of the tweet. Reviewing the Twitter policy on tweet-level location data, we found that twitter users must opt in to location services to embed location data in there tweets, as well as intentionally include location data on each desired tweet. This policy gives us an insight into some mechanisms of missingness - users forget that they can include location data, or the intentionally elect not to for some tweets. This second mechanism would preclude categorization as missing completely at random. Furthermore, as users of social media, we have firsthand experience with the relationship between the location from which a tweet was sent, and the inclusion of location data. Twitter users may want to inform followers that they are tweeting from an impressive or otherwise unusual location - a famous concert venue, or a historic city, or even the site of a terrible natural disaster - indicating to us a relationship between the location to be included and the likelihood of inclusion. Being Missing Not a Random, our options for mitigating location data missingness are few. We could drop the data altogether, but decided that appending it to the contents of the tweet would be an acceptable approach that prevents information loss.
