In [62]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix as confmat
from sklearn.metrics import f1_score as f1
# from sklearn.metrics import plot_confusion_matrix as plot_confmat

import re
import string
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import gensim
# load dataset and preprocess
twt = pd.read_csv('train.csv')
twt = twt.set_index('id')
twt.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 0. Data Preprocessing

### 0.1 Data Cleaning

In [63]:
stop_words = stopwords.words('english')
porter = PorterStemmer()
def processing (sentence):
    result = sentence.lower() #Lower case 
    result = re.sub(r'\d+', '', result) #Removing numbers
    result = result.translate(str.maketrans('', '', string.punctuation)) #Remove weird characters
    result = result.strip() #Eliminate blanks from begining and end of setences
    result = result.split() #Separate into words
    result = [w for w in result if not w in stop_words] #Eliminate stop_words
    # result = [porter.stem(word) for word in result] #Stem Words
    return (result)

text = list(twt["text"])
len(text)

prep_text = [processing(i) for i in text]
print('{} tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.'.format(len(prep_text)))
twt['prepped'] = prep_text

7613 tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.


In [64]:
# twt.loc[twt['keyword'].any()),]
print(twt.shape)
twt.dropna()
print(twt.shape)
# twt['prepped'].isna().any()


(7613, 5)
(7613, 5)


### 0.2 Embedding

In [65]:
# word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

def tweet_vec (tweet, word2vec):
    word_vecs = [word2vec.get_vector(w) for w in tweet if w in word2vec.vocab]
#     print(tweet)
#     print('Number of words: {}'.format(len(word_vecs)))
    if len(word_vecs) >= 1:
        return np.stack(word_vecs).mean(0)
    else:
        return None

twt['vec'] = pd.Series([tweet_vec(tweet, word2vec) for tweet in twt['prepped']], index = twt.index)

devtrain_idx = twt.loc[~twt['vec'].isna()].index.tolist()
random.shuffle(devtrain_idx)

train_p = 0.7
train_idx = devtrain_idx[:round(train_p*len(devtrain_idx))]
dev_idx = devtrain_idx[round(train_p*len(devtrain_idx)):]

train = [(vec, targ) for targ, vec in  zip(twt['target'][train_idx], twt['vec'][train_idx])]
dev = [(vec, targ) for targ, vec in  zip(twt['target'][dev_idx], twt['vec'][dev_idx])]

## 1. NN construction

In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(300,300)  # 6*6 from image dimension
        self.fc2 = nn.Linear(300, 2)
        self.softmax =  nn.Softmax(dim = 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
    def get_eval_data(self, data):
        dataloader = torch.utils.data.DataLoader(data, batch_size = 1)

        y_stars = []
        ys = [vec_targ[1] for vec_targ in data]

        for i, data in enumerate(dataloader, 0):
            x, _ = data
            # print(x)
            output = self.forward(x).detach().numpy()[0]
            y_star = np.argmax(output)
            #print(y_star)
            y_stars.append(y_star)
        
        return ys, y_stars

    
trainloader = torch.utils.data.DataLoader(train, batch_size = 5000)
net = Net()
# print(net.parameters())
criterion = nn.CrossEntropyLoss()
# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=0.01)
for epoch in range(100):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
#         print(inputs)
#         print(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
#         print(outputs)
        loss = criterion(outputs, labels)
#         print(loss)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 1 == 0: # print every 2000 mini-batches
            ys, y_stars = net.get_eval_data(dev)
            print('[%d, %5d] loss: %.3f\tDev FI: %.3f' % (epoch + 1, i + 1, running_loss, f1(ys, y_stars)))
            running_loss = 0.0

print('Finished Training')
print(net.parameters())

[1,     1] loss: 0.691	Dev FI: 0.002
[1,     2] loss: 0.678	Dev FI: 0.599
[2,     1] loss: 0.612	Dev FI: 0.703
[2,     2] loss: 0.584	Dev FI: 0.708
[3,     1] loss: 0.549	Dev FI: 0.696
[3,     2] loss: 0.535	Dev FI: 0.693
[4,     1] loss: 0.524	Dev FI: 0.700
[4,     2] loss: 0.516	Dev FI: 0.707
[5,     1] loss: 0.515	Dev FI: 0.714
[5,     2] loss: 0.503	Dev FI: 0.716
[6,     1] loss: 0.511	Dev FI: 0.714
[6,     2] loss: 0.489	Dev FI: 0.710
[7,     1] loss: 0.508	Dev FI: 0.711
[7,     2] loss: 0.474	Dev FI: 0.713
[8,     1] loss: 0.507	Dev FI: 0.716
[8,     2] loss: 0.462	Dev FI: 0.723
[9,     1] loss: 0.504	Dev FI: 0.727
[9,     2] loss: 0.451	Dev FI: 0.728
[10,     1] loss: 0.504	Dev FI: 0.722
[10,     2] loss: 0.443	Dev FI: 0.724
[11,     1] loss: 0.504	Dev FI: 0.725
[11,     2] loss: 0.437	Dev FI: 0.726
[12,     1] loss: 0.503	Dev FI: 0.729
[12,     2] loss: 0.429	Dev FI: 0.728
[13,     1] loss: 0.503	Dev FI: 0.729
[13,     2] loss: 0.423	Dev FI: 0.732
[14,     1] loss: 0.501	Dev FI

## Dev Performance and Error Analysis

In [67]:
# dev[0]
ys, y_stars = net.get_eval_data(dev)
# confmat(ys, y_stars)
f1(ys, y_stars)

twt['pred'] = pd.Series(y_stars, index = dev_idx)


# for error analysis
twt[['keyword', 'location', 'text', 'prepped','vec', 'target','pred']].to_csv('error_analysis.csv')

## Create Test Submissions

In [79]:
# create vector of mean of all word vectors
mean_vec = np.zeros((300,1))
for vec_targ in train:
    mean_vec = np.add(mean_vec, vec_targ[0])
mean_vec = mean_vec/len(train)

# read in test data and preprocess tweets
test = pd.read_csv('test.csv')
text = list(test["text"])
proc_text = [processing(i) for i in text]
targets = np.zeros((len(test), 1))
print('{} tweets read from test.csv'.format(test.shape[0]))

test_data = []
counter = 0

# replace empty vectors with mean_vec

# test_data = [(tweet_vec(x, word2vec), y) if for x, y in zip(proc_test, targets)]
for x, y in zip(proc_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        test_data.append((vec, y))
    else:
        test_data.append((mean_vec, y))
        counter += 0

print('{} tweets processed in the test set.'.format(len(test_data)))
print('{} empty tweets ({}%) replaced with mean vector'.format(counter, 100*counter/len(proc_text)))

# print(test_data[0])
testloader = torch.utils.data.DataLoader(test_data, batch_size = 1)

y_stars = []

for i, data in enumerate(testloader, 0):
    x, y = data
    #print(x.dtype)
    #x_dub = torch.Tensor(x, dtype = 'double')
    #print(x.dtype)
    output = net.forward(x.float()).detach().numpy()[0]
    y_star = np.argmax(output)
    #print(y_star)
    y_stars.append(y_star)

# create columns for submission data
id = test['id'].to_numpy()
target = y_stars

# create df with submission data and write to csv
submission = pd.DataFrame({'id': id, 'target': target})
submission.set_index('id').to_csv('submission.csv')

# also save analysis csv for reviewing decisions
test['prepped'] = proc_text
test['pred'] = target
test[['keyword', 'location', 'text', 'prepped','pred']].to_csv('test_error_analysis.csv')

3263 tweets read from test.csv
3263 tweets processed in the test set.
0 empty tweets (0.0%) replaced with mean vector


In [9]:
submission = submission.set_index('id')
submission.to_csv('submission_3_19.csv')

NameError: name 'submission' is not defined

## 2. Naive Bayes with Bag of Words

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# create raw bow text and labels, shuffle
bow = twt['text'].to_numpy()
labels = twt['target'].to_numpy()

shuffle_idx = [x for x in range(len(bow))]
random.shuffle(shuffle_idx)

bow = bow[shuffle_idx]
labels = labels[shuffle_idx]

# train bag-of-words
count_vect = CountVectorizer()
x_bow = count_vect.fit(bow)

# create train dev split
train_bow = bow[:round(train_p*len(bow))]
dev_bow = bow[round(train_p*len(bow)):]
train_bow = count_vect.transform(train_bow)
dev_bow = count_vect.transform(dev_bow)

train_bow_labels = labels[:round(train_p*len(bow))]
dev_bow_lables = labels[round(train_p*len(bow)):]

print('Created {} bag-of-words vectorizations.'.format(train_bow.shape[0] + dev_bow.shape[0]))
clf = MultinomialNB().fit(train_bow, train_bow_labels)
print('Naive Bayes training complete.')

# get dev performances
ys = clf.predict(dev_bow)

# dev performance
print('Naive Bayes dev performance {}'.format(f1(ys, dev_bow_lables)))

NameError: name 'train_p' is not defined

## Missingness Exploration

We suspect that location data can be useful for predicting disasters. However, we have a sizable proportion of the data for which the location data is missing. In order to address this issue, we needed to first identify whether the data is missing completely at random, missing at random, or not missing at random. For the data to be MCAR, the mechanism of missingness of the data must be independent of all other observed features of the data. These other observed features, in our case, include the text of the tweet. Reviewing the Twitter policy on tweet-level location data, we found that twitter users must opt in to location services to embed location data in there tweets, as well as intentionally include location data on each desired tweet. This policy gives us an insight into some mechanisms of missingness - users forget that they can include location data, or the intentionally elect not to for some tweets. This second mechanism would preclude categorization as missing completely at random. Furthermore, as users of social media, we have firsthand experience with the relationship between the location from which a tweet was sent, and the inclusion of location data. Twitter users may want to inform followers that they are tweeting from an impressive or otherwise unusual location - a famous concert venue, or a historic city, or even the site of a terrible natural disaster - indicating to us a relationship between the location to be included and the likelihood of inclusion. Being Missing Not a Random, our options for mitigating location data missingness are few. We could drop the data altogether, but decided that appending it to the contents of the tweet would be an acceptable approach that prevents information loss.
