In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix as confmat
# from sklearn.metrics import plot_confusion_matrix as plot_confmat

import re
import string
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import gensim

In [2]:
#load dataset and preprocess
twt = pd.read_csv('train.csv')
twt = twt.set_index('id')
twt.shape

(7613, 4)

In [3]:
print(twt.info())
twt.loc[~twt['keyword'].isna()]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(1), object(3)
memory usage: 297.4+ KB
None


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


## 0. Data Preprocessing

### 0.1 Data Cleaning

In [5]:
stop_words = stopwords.words('english')
porter = PorterStemmer()

In [6]:
def processing (sentence):
    result = sentence.lower() #Lower case 
    result = re.sub(r'\d+', '', result) #Removing numbers
    result = result.translate(str.maketrans('', '', string.punctuation)) #Remove weird characters
    result = result.strip() #Eliminate blanks from begining and end of setences
    result = result.split() #Separate into words
    result = [w for w in result if not w in stop_words] #Eliminate stop_words
    result = [porter.stem(word) for word in result] #Stem Words
    return (result)

In [7]:
text = list(twt["text"])
len(text)

7613

### 0.2 Embedding

In [8]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [9]:
def tweet_vec (tweet, word2vec):
    word_vecs = [word2vec.get_vector(w) for w in tweet if w in word2vec.vocab]
#     print(tweet)
#     print('Number of words: {}'.format(len(word_vecs)))
    if len(word_vecs) >= 1:
        return np.stack(word_vecs).mean(0)
    else:
        return None

In [10]:
prep_text = [processing(i) for i in text]
print('{} tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.'.format(len(prep_text)))

7613 tweets lowered, tokenized, alphanumerized, stop-stripped, and stemmed.


In [11]:
final_data = []
targets = twt['target'].to_numpy()

# prep_text[0]
# tweet_vec(prep_text[0], word2vec)
'''
for i in prep_text:
    vec = tweet_vec(i,word2vec)
    if vec is not None:   
        final_data.append(vec)
'''      
number_empty = 0
for x, y in zip(prep_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        final_data.append((vec, y))
    else:
        number_empty += 1
        
train_p = 0.70
random.shuffle(final_data)

# train is final data 
train = final_data[:round(train_p*len(final_data))]
dev = final_data[round(train_p*len(final_data)):]
print('Preprocessing complete.\n{} tweets vectorized for the training set.'.format(len(train)))
print('{} tweets vectorized for the dev set.\n{} tweets were found to be empty.'.format(len(dev), number_empty))

Preprocessing complete.
5311 tweets vectorized for the training set.
2276 tweets vectorized for the dev set.
26 tweets were found to be empty.


## 1. NN construction

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(300,300)  # 6*6 from image dimension
        self.fc2 = nn.Linear(300, 2)
        self.softmax =  nn.Softmax()

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
trainloader = torch.utils.data.DataLoader(train, batch_size = 1000)
net = Net()

# print(net.parameters())
criterion = nn.CrossEntropyLoss()
# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=0.01)
for epoch in range(10):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
#         print(inputs)
#         print(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
#         print(outputs)
        loss = criterion(outputs, labels)
#         print(loss)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 1 == 0: # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss))
            running_loss = 0.0

print('Finished Training')
print(net.parameters())
    



[1,     1] loss: 0.693
[1,     2] loss: 0.647
[1,     3] loss: 0.620
[1,     4] loss: 0.585
[1,     5] loss: 0.560
[1,     6] loss: 0.541
[2,     1] loss: 0.545
[2,     2] loss: 0.556
[2,     3] loss: 0.544
[2,     4] loss: 0.532
[2,     5] loss: 0.523
[2,     6] loss: 0.516
[3,     1] loss: 0.534
[3,     2] loss: 0.543
[3,     3] loss: 0.539
[3,     4] loss: 0.523
[3,     5] loss: 0.521
[3,     6] loss: 0.488
[4,     1] loss: 0.531
[4,     2] loss: 0.526
[4,     3] loss: 0.525
[4,     4] loss: 0.514
[4,     5] loss: 0.506
[4,     6] loss: 0.480
[5,     1] loss: 0.526
[5,     2] loss: 0.528
[5,     3] loss: 0.523
[5,     4] loss: 0.510
[5,     5] loss: 0.498
[5,     6] loss: 0.473
[6,     1] loss: 0.527
[6,     2] loss: 0.522
[6,     3] loss: 0.514
[6,     4] loss: 0.504
[6,     5] loss: 0.496
[6,     6] loss: 0.457
[7,     1] loss: 0.521
[7,     2] loss: 0.519
[7,     3] loss: 0.511
[7,     4] loss: 0.499
[7,     5] loss: 0.489
[7,     6] loss: 0.452
[8,     1] loss: 0.513
[8,     2] 

## Dev Performance

In [13]:
devloader = torch.utils.data.DataLoader(dev, batch_size = 1)

y_stars = []
ys = [vec_targ[1] for vec_targ in dev]

for i, data in enumerate(devloader, 0):
    x, y = data
    # print(x)
    output = net.forward(x).detach().numpy()[0]
    y_star = np.argmax(output)
    #print(y_star)
    y_stars.append(y_star)

confmat(ys, y_stars)



array([[1117,  196],
       [ 315,  648]], dtype=int64)

## Create Test Submissions

In [26]:
# create vector of mean of all word vectors
mean_vec = np.zeros((300,1))
for vec_targ in train:
    mean_vec = np.add(mean_vec, vec_targ[0])
mean_vec = mean_vec/len(final_data)

# read in test data and preprocess tweets
test = pd.read_csv('test.csv')
text = list(test["text"])
proc_text = [processing(i) for i in text]
targets = np.zeros((len(test), 1))
print('{} tweets read from test.csv'.format(test.shape[0]))

test_data = []
counter = 0
# replace empty vectors with mean_vec
for x, y in zip(prep_text, targets):
    vec = tweet_vec(x, word2vec)
    if vec is not None:
        test_data.append((vec, y))
    else:
        test_data.append((mean_vec, y))
        counter += 0
        
print('{} tweets processed in the test set.'.format(len(test_data)))
print('{} empty tweets replaced with mean vector'.format(counter))

testloader = torch.utils.data.DataLoader(test_data, batch_size = 1)

y_stars = []

for i, data in enumerate(testloader, 0):
    x, y = data
    #print(x.dtype)
    #x_dub = torch.Tensor(x, dtype = 'double')
    #print(x.dtype)
    output = net.forward(x.float()).detach().numpy()[0]
    y_star = np.argmax(output)
    #print(y_star)
    y_stars.append(y_star)

# create columns for submission data
id = test['id'].to_numpy()
target = y_stars

# create df with submission data and write to csv
submission = pd.DataFrame({'id': id, 'target': target})

3263 tweets read from test.csv
3263 tweets processed in the test set.
0 empty tweets replaced with mean vector




In [30]:
submission = submission.set_index('id')
submission.to_csv('submission_3_19.csv')