In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix as confmat
from sklearn.metrics import f1_score as f1
# from sklearn.metrics import plot_confusion_matrix as plot_confmat

import re
import string
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import gensim
#load dataset and preprocess
twt = pd.read_csv('train.csv')
twt = twt.set_index('id')
twt.shape
print(twt.info())
twt.loc[~twt['keyword'].isna()]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(1), object(3)
memory usage: 297.4+ KB
None


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


## 0. Data Preprocessing

### 0.1 Data Cleaning

In [9]:
stop_words = stopwords.words('english')
porter = PorterStemmer()
def processing (sentence):
    result = sentence.lower() #Lower case 
    result = re.sub(r'\d+', '', result) #Removing numbers
    result = result.translate(str.maketrans('', '', string.punctuation)) #Remove weird characters
    result = result.strip() #Eliminate blanks from begining and end of setences
    result = result.split() #Separate into words
    result = [w for w in result if not w in stop_words] #Eliminate stop_words
    result = [porter.stem(word) for word in result] #Stem Words
    return (result)

text = list(twt["text"])
len(text)

prep_text = [processing(i) for i in text]
print('{} tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.'.format(len(prep_text)))
twt['prepped'] = prep_text

7613 tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.


In [3]:
# twt.loc[twt['keyword'].any()),]
twt.dropna()

Unnamed: 0_level_0,keyword,location,text,target,prepped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,"[bbcmtd, wholesal, market, ablaz, httptcolhyxe..."
49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,"[alway, tri, bring, heavi, metal, rt, httptcoy..."
50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,"[africanbaz, break, newsnigeria, flag, set, ab..."
52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,"[cri, set, ablaz]"
53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,"[plu, side, look, sky, last, night, ablaz, htt..."
...,...,...,...,...,...
10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0,"[bright, side, wreck, httptcoueatxrhi]"
10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0,"[widda, he, gone, relax, thought, wife, wreck,..."
10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0,"[three, day, work, theyv, pretti, much, wreck,..."
10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0,"[fx, forex, trade, cramer, iger, word, wreck, ..."


### 0.2 Embedding

In [10]:
# word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

def tweet_vec (tweet, word2vec):
    word_vecs = [word2vec.get_vector(w) for w in tweet if w in word2vec.vocab]
#     print(tweet)
#     print('Number of words: {}'.format(len(word_vecs)))
    if len(word_vecs) >= 1:
        return np.stack(word_vecs).mean(0)
    else:
        return None

final_data = []
targets = twt['target'].to_numpy()

# prep_text[0]
# tweet_vec(prep_text[0], word2vec)
'''
for i in prep_text:
    vec = tweet_vec(i,word2vec)
    if vec is not None:   
        final_data.append(vec)
'''      
number_empty = 0
for x, y in zip(prep_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        final_data.append((vec, y))
    else:
        number_empty += 1
        
train_p = 0.70
random.shuffle(final_data)

# train is final data 
train = final_data[:round(train_p*len(final_data))]
dev = final_data[round(train_p*len(final_data)):]
print('Preprocessing complete.\n{} tweets vectorized for the training set.'.format(len(train)))
print('{} tweets vectorized for the dev set.\n{} tweets were found to be empty.'.format(len(dev), number_empty))

Preprocessing complete.
5311 tweets vectorized for the training set.
2276 tweets vectorized for the dev set.
26 tweets were found to be empty.


In [5]:
len(train)

5311

## 1. NN construction

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(300,300)  # 6*6 from image dimension
        self.fc2 = nn.Linear(300, 2)
        self.softmax =  nn.Softmax(dim = 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
    def get_eval_data(self, data):
        dataloader = torch.utils.data.DataLoader(data, batch_size = 1)

        y_stars = []
        ys = [vec_targ[1] for vec_targ in data]

        for i, data in enumerate(dataloader, 0):
            x, _ = data
            # print(x)
            output = self.forward(x).detach().numpy()[0]
            y_star = np.argmax(output)
            #print(y_star)
            y_stars.append(y_star)
        
        return ys, y_stars

    
trainloader = torch.utils.data.DataLoader(train, batch_size = 5000)
net = Net()
# print(net.parameters())
criterion = nn.CrossEntropyLoss()
# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=0.01)
for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
#         print(inputs)
#         print(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
#         print(outputs)
        loss = criterion(outputs, labels)
#         print(loss)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 1 == 0: # print every 2000 mini-batches
            ys, y_stars = net.get_eval_data(dev)
            print('[%d, %5d] loss: %.3f\tDev FI: %.3f' % (epoch + 1, i + 1, running_loss, f1(ys, y_stars)))
            running_loss = 0.0

print('Finished Training')
print(net.parameters())

[1,     1] loss: 0.693	Dev FI: 0.004
[1,     2] loss: 0.658	Dev FI: 0.287
[2,     1] loss: 0.634	Dev FI: 0.696
[2,     2] loss: 0.595	Dev FI: 0.676
[3,     1] loss: 0.579	Dev FI: 0.669
[3,     2] loss: 0.541	Dev FI: 0.677
[4,     1] loss: 0.553	Dev FI: 0.697
[4,     2] loss: 0.517	Dev FI: 0.704
[5,     1] loss: 0.545	Dev FI: 0.703
[5,     2] loss: 0.501	Dev FI: 0.700
[6,     1] loss: 0.539	Dev FI: 0.697
[6,     2] loss: 0.488	Dev FI: 0.702
[7,     1] loss: 0.535	Dev FI: 0.708
[7,     2] loss: 0.471	Dev FI: 0.718
[8,     1] loss: 0.534	Dev FI: 0.719
[8,     2] loss: 0.459	Dev FI: 0.712
[9,     1] loss: 0.531	Dev FI: 0.703
[9,     2] loss: 0.454	Dev FI: 0.709
[10,     1] loss: 0.529	Dev FI: 0.712
[10,     2] loss: 0.446	Dev FI: 0.713
Finished Training
<generator object Module.parameters at 0x000002218C40CC48>


## Dev Performance

In [36]:
# dev[0]
ys, y_stars = net.get_eval_data(dev)
# confmat(ys, y_stars)
f1(ys, y_stars)

0.7135788894997251

## Create Test Submissions

In [26]:
# create vector of mean of all word vectors
mean_vec = np.zeros((300,1))
for vec_targ in train:
    mean_vec = np.add(mean_vec, vec_targ[0])
mean_vec = mean_vec/len(final_data)

# read in test data and preprocess tweets
test = pd.read_csv('test.csv')
text = list(test["text"])
proc_text = [processing(i) for i in text]
targets = np.zeros((len(test), 1))
print('{} tweets read from test.csv'.format(test.shape[0]))

test_data = []
counter = 0
# replace empty vectors with mean_vec
for x, y in zip(prep_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        test_data.append((vec, y))
    else:
        test_data.append((mean_vec, y))
        counter += 0
        
print('{} tweets processed in the test set.'.format(len(test_data)))
print('{} empty tweets replaced with mean vector'.format(counter))

testloader = torch.utils.data.DataLoader(test_data, batch_size = 1)

y_stars = []

for i, data in enumerate(testloader, 0):
    x, y = data
    #print(x.dtype)
    #x_dub = torch.Tensor(x, dtype = 'double')
    #print(x.dtype)
    output = net.forward(x.float()).detach().numpy()[0]
    y_star = np.argmax(output)
    #print(y_star)
    y_stars.append(y_star)

# create columns for submission data
id = test['id'].to_numpy()
target = y_stars

# create df with submission data and write to csv
submission = pd.DataFrame({'id': id, 'target': target})

3263 tweets read from test.csv
3263 tweets processed in the test set.
0 empty tweets replaced with mean vector




In [30]:
submission = submission.set_index('id')
submission.to_csv('submission_3_19.csv')

## 2. Naive Bayes with Bag of Words

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# create raw bow text and labels, shuffle
bow = twt['text'].to_numpy()
labels = twt['target'].to_numpy()

shuffle_idx = [x for x in range(len(bow))]
random.shuffle(shuffle_idx)

bow = bow[shuffle_idx]
labels = labels[shuffle_idx]

# train bag-of-words
count_vect = CountVectorizer()
x_bow = count_vect.fit(bow)

# create train dev split
train_bow = bow[:round(train_p*len(bow))]
dev_bow = bow[round(train_p*len(bow)):]
train_bow = count_vect.transform(train_bow)
dev_bow = count_vect.transform(dev_bow)

train_bow_labels = labels[:round(train_p*len(bow))]
dev_bow_lables = labels[round(train_p*len(bow)):]

print('Created {} bag-of-words vectorizations.'.format(train_bow.shape[0] + dev_bow.shape[0]))
clf = MultinomialNB().fit(train_bow, train_bow_labels)
print('Naive Bayes training complete.')

# get dev performances
ys = clf.predict(dev_bow)

# dev performance
print('Naive Bayes dev performance {}'.format(f1(ys, dev_bow_lables)))

Created 7613 bag-of-words vectorizations.
Naive Bayes training complete.
Naive Bayes dev performance 0.7468085106382979
