In [1]:
import itertools
import os
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from spacy.symbols import ORTH
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

## Load file

In [2]:
# Load the tweet dataset; later cells read its 'text' and 'label' columns.
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,time,date,Date,Open,High,Low,Close,Adj Close,Volume,diff,label
0,Twitter for iPhone,I will be announcing my Second Term Presidenti...,05-31-2019 20:35:41,35248,128039,2019-05-31 20:35:41,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
1,Twitter Media Studio,GREAT NEWS! #MAGA https://t.co/91Yk8B11bP,05-31-2019 20:02:16,20493,75339,2019-05-31 20:02:16,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
2,Twitter for iPhone,As we celebrate LGBT Pride Month and recognize...,05-31-2019 19:12:32,28936,136614,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
3,Twitter for iPhone,....on the basis of their sexual orientation. ...,05-31-2019 19:12:32,20416,105421,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
4,Twitter for iPhone,.@SeanHannity is having a DEEP STATE SHOW toni...,05-31-2019 18:11:25,18257,65602,2019-05-31 18:11:25,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0


## Tokenization

In [4]:
# first time run this
#!python3 -m spacy download en

In [5]:
# Matches HTML line-break tags such as <br>, <br/> or < BR />, case-insensitively.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return x with every HTML <br> tag replaced by a newline."""
    return re_br.sub("\n", x)


# Only the tokenizer of this pipeline is used below.
# NOTE(review): the 'en' shortcut name was removed in spaCy 3 — confirm the installed version.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize x with spaCy (after <br> replacement) and lowercase each token."""
    cleaned = sub_br(x)
    return [token.text.lower() for token in my_tok.tokenizer(cleaned)]

In [6]:
x = df.loc[1, 'text']
spacy_tok(x)

['great', 'news', '!', '#', 'maga', 'https://t.co/91yk8b11bp']

In [7]:
# Tokenize every tweet; this adds a 'words' column (list of lowercased tokens) to df in place.
df['words'] = df['text'].apply(spacy_tok)

In [8]:
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,time,date,Date,Open,High,Low,Close,Adj Close,Volume,diff,label,words
0,Twitter for iPhone,I will be announcing my Second Term Presidenti...,05-31-2019 20:35:41,35248,128039,2019-05-31 20:35:41,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[i, will, be, announcing, my, second, term, pr..."
1,Twitter Media Studio,GREAT NEWS! #MAGA https://t.co/91Yk8B11bP,05-31-2019 20:02:16,20493,75339,2019-05-31 20:02:16,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[great, news, !, #, maga, https://t.co/91yk8b1..."
2,Twitter for iPhone,As we celebrate LGBT Pride Month and recognize...,05-31-2019 19:12:32,28936,136614,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[as, we, celebrate, lgbt, pride, month, and, r..."
3,Twitter for iPhone,....on the basis of their sexual orientation. ...,05-31-2019 19:12:32,20416,105421,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[...., on, the, basis, of, their, sexual, orie..."
4,Twitter for iPhone,.@SeanHannity is having a DEEP STATE SHOW toni...,05-31-2019 18:11:25,18257,65602,2019-05-31 18:11:25,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[.@seanhannity, is, having, a, deep, state, sh..."


In [9]:
tweet_length = df['words'].apply(len).tolist()

In [10]:
tweet_length[:10]

[45, 6, 45, 30, 36, 51, 61, 55, 49, 34]

In [11]:
np.percentile(tweet_length, 50)

29.0

In [12]:
np.percentile(tweet_length, 80)

51.0

In [13]:
np.percentile(tweet_length, 90)

55.0

+ About 80% of tweets have at most 51 tokens (the 90th percentile is 55), therefore we choose 50 as our padding size; longer tweets are truncated

## Load pre-trained embeddings

In [14]:
def loadGloveModel(gloveFile="/Users/jialiang.shi/data/glove.6B/glove.6B.100d.txt"):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Each non-empty line must hold a token followed by its whitespace-separated
    float components. Blank lines are skipped.

    Args:
        gloveFile: path to the text file. The default is a machine-specific
            absolute path kept for backward compatibility; pass your own path.

    Returns:
        dict mapping each token (str) to a 1-D float64 numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vecs = {}
    # Explicit UTF-8 so parsing does not depend on the platform default encoding;
    # `with` closes the handle even when a line fails to parse.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [15]:
word_vecs = loadGloveModel()

In [16]:
len(word_vecs.keys())

400000

In [17]:
word_vecs[',']

array([-0.10767  ,  0.11053  ,  0.59812  , -0.54361  ,  0.67396  ,
        0.10663  ,  0.038867 ,  0.35481  ,  0.06351  , -0.094189 ,
        0.15786  , -0.81665  ,  0.14172  ,  0.21939  ,  0.58505  ,
       -0.52158  ,  0.22783  , -0.16642  , -0.68228  ,  0.3587   ,
        0.42568  ,  0.19021  ,  0.91963  ,  0.57555  ,  0.46185  ,
        0.42363  , -0.095399 , -0.42749  , -0.16567  , -0.056842 ,
       -0.29595  ,  0.26037  , -0.26606  , -0.070404 , -0.27662  ,
        0.15821  ,  0.69825  ,  0.43081  ,  0.27952  , -0.45437  ,
       -0.33801  , -0.58184  ,  0.22364  , -0.5778   , -0.26862  ,
       -0.20425  ,  0.56394  , -0.58524  , -0.14365  , -0.64218  ,
        0.0054697, -0.35248  ,  0.16162  ,  1.1796   , -0.47674  ,
       -2.7553   , -0.1321   , -0.047729 ,  1.0655   ,  1.1034   ,
       -0.2208   ,  0.18669  ,  0.13177  ,  0.15117  ,  0.7131   ,
       -0.35215  ,  0.91348  ,  0.61783  ,  0.70992  ,  0.23955  ,
       -0.14571  , -0.37859  , -0.045959 , -0.47368  ,  0.2385

## Generate Vocab

In [31]:
df_words = df['words'].tolist()

In [32]:
# Flatten the per-tweet token lists into one long list of tokens.
# chain.from_iterable avoids unpacking every tweet as a separate positional argument;
# itertools is imported in the notebook's top import cell.
ws = list(itertools.chain.from_iterable(df_words))

In [33]:
len(ws)

170020

In [34]:
# Frequency of every token across all tweets.
word_count = Counter(ws)

In [35]:
len(word_count)

11961

In [36]:
# Drop tokens that are both rare (fewer than 5 occurrences) and missing from the
# pre-trained embeddings. Keys are collected first so the Counter is never
# modified while it is being iterated.
rare_and_unknown = [
    token for token, count in word_count.items()
    if count < 5 and token not in word_vecs
]
for token in rare_and_unknown:
    del word_count[token]

In [37]:
len(word_count)

8684

In [38]:
# Index 0 is the padding token "" and index 1 is the unknown-word token "UNK";
# the remaining tokens follow in word_count order.
words = ["", "UNK"] + list(word_count.keys())
vocab2index = {token: position for position, token in enumerate(words)}

In [39]:
len(words)

8686

+ There are 8686 words in our vocabulary

## Pre-trained weights for the embedding layer

In [40]:
def random_word_vector(D=100):
    """Draw a random stand-in embedding of length D.

    Components are sampled uniformly from [-0.25, 0.25) using numpy's global
    random state. The 0.25 bound is chosen so that these vectors have
    (approximately) the same variance as the pre-trained ones.
    """
    bound = 0.25
    return np.random.uniform(low=-bound, high=bound, size=D)

In [41]:
def create_embedding_matrix(word_vecs, vocab2index, words, D=100):
    """Create the embedding matrix for the vocabulary.

    Row i holds the vector for words[i]. Row 0 (the padding token) is left as
    all zeros. Every other word gets its pre-trained vector when available and
    a random vector of length D otherwise.

    Args:
        word_vecs: dict mapping a word to its pre-trained vector of length D.
        vocab2index: unused; kept so existing call sites keep working.
        words: list of vocabulary words, indexed by embedding row.
        D: embedding dimension.

    Returns:
        float32 numpy array of shape (len(words), D).
    """
    V = len(words)
    W = np.zeros((V, D), dtype="float32")
    for i in range(1, V):
        if words[i] in word_vecs:
            W[i] = word_vecs[words[i]]
        else:
            # Pass D through so random rows match the requested width
            # (the helper's own default is 100).
            W[i] = random_word_vector(D)
    return W

In [42]:
embedding_matrix = create_embedding_matrix(word_vecs, vocab2index, words)
embedding_matrix.shape

(8686, 100)

## Dataset

In [58]:
def encode_sentence(sentence, vocab2index, N=50, padding_start=True):
    """Turn a sentence into a fixed-length array of vocabulary indices.

    The sentence is tokenized with spacy_tok; tokens missing from vocab2index
    map to the "UNK" index. At most the first N tokens are kept.

    Args:
        sentence: raw text to encode.
        vocab2index: dict mapping token -> integer index; must contain "UNK".
        N: length of the returned array.
        padding_start: when True the token ids occupy the START of the array
            and the zero padding follows them; when False the ids are
            right-aligned and the zeros come first.

    Returns:
        (ids, length): an int32 array of length N and the number of real
        (non-padding) positions in it.
    """
    tokens = spacy_tok(sentence)
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    n_kept = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=np.int32)
    if padding_start:
        encoded[:n_kept] = token_ids[:n_kept]
    else:
        encoded[N - n_kept:] = token_ids[:n_kept]
    return encoded, n_kept

In [59]:
sample = df.loc[0,'text']
sample

'I will be announcing my Second Term Presidential Run with First Lady Melania Vice President Mike Pence and Second Lady Karen Pence on June 18th in Orlando Florida at the 20000 seat Amway Center. Join us for this Historic Rally! Tickets: https://t.co/1krDP2oQvG'

In [60]:
encode_sentence(sample, vocab2index, N=50)

(array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19,  7, 13, 20, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 36, 37, 38, 39, 40, 41, 42,  1,  0,  0,  0,  0,  0],
       dtype=int32), 45)

In [61]:
class TweetDataset(Dataset):
    """Dataset of encoded tweets and their labels.

    Every row of df['text'] is encoded once, at construction time, with
    encode_sentence and the module-level vocab2index. Labels come from
    df['label'].
    """

    def __init__(self, df, N=50, padding_start=True):
        # Each entry of self.X is the (ids, length) pair returned by encode_sentence.
        self.X = [
            encode_sentence(text, vocab2index, N, padding_start)
            for text in df['text'].tolist()
        ]
        self.y = df['label'].values

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return (ids, length, label) for the sample at position idx."""
        token_ids, length = self.X[idx]
        label = self.y[idx]
        return token_ids, length, label

In [62]:
# Hold out 20% of the rows; random_state is fixed so the split is reproducible.
# train_test_split is imported in the notebook's top import cell.
# y_train / y_test duplicate the 'label' column that df_train / df_test already carry.
df_train, df_test, y_train, y_test = train_test_split(df, df['label'], test_size=0.2, random_state=42)

In [63]:
train_df = df_train.reset_index(drop=True)
test_df = df_test.reset_index(drop=True)

In [64]:
len(train_df)

3869

In [65]:
len(test_df)

968

In [66]:
train_ds = TweetDataset(train_df)
test_ds = TweetDataset(test_df)

In [67]:
train_ds[1]

(array([ 699,  290, 1108,   67,  222,  267,  628,    4, 2560, 1659,  106,
         837,  116,  224,   28,  217,   19,  331,  108,    3,    4,  753,
          59, 2561, 1758, 1804,   59, 2562, 2563,   19,  159,   40,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0], dtype=int32), 32, 0)

In [68]:
batch_size = 200
# Training batches are reshuffled every epoch.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# NOTE: validation batches come from the held-out test split (test_ds); there is no separate validation set.
val_dl = DataLoader(test_ds, batch_size=batch_size)

## GRU model

In [77]:
class GRUModel(torch.nn.Module):
    """Single-layer GRU classifier on top of frozen pre-trained embeddings.

    The model emits one logit per input sequence, computed from the GRU's
    final hidden state.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: size of the GRU hidden state.
        embedding_matrix: numpy array of shape (vocab_size, embedding_dim)
            copied into the embedding layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False  # freeze embeddings
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Compute one logit per sequence.

        Args:
            x: integer tensor of token indices, shape (batch, seq_len), with
                the real tokens first and padding after them.
            s: integer tensor of shape (batch,) with the number of real
                tokens in each row.

        Returns:
            Tensor of shape (batch, 1) with the logits, in the same row order
            as the input batch.
        """
        # pack_padded_sequence rejects zero-length sequences, which an empty
        # tweet produces; treat such rows as a single padding step instead.
        s = s.clamp(min=1)
        # Sort by decreasing length, as pack_padded_sequence expects by default.
        s, sort_index = torch.sort(s, 0, descending=True)
        lengths = s.cpu().tolist()
        x = x[sort_index]
        x = self.embedding(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(x_pack)
        out = self.linear(ht[-1])
        # Undo the sort: row i of `out` belongs at position sort_index[i].
        restore_index = sort_index.unsqueeze(1).to(out.device)
        return torch.zeros_like(out).scatter_(0, restore_index, out)

In [78]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train model with Adam on the module-level train_dl.

    Only parameters with requires_grad=True are optimized. After every epoch
    the model is evaluated on the module-level val_dl through val_metrics, and
    a summary line is printed on epochs whose index satisfies i % 5 == 1.

    Args:
        model: module called as model(x, s) and returning logits of shape (batch, 1).
        epochs: number of passes over train_dl.
        lr: Adam learning rate.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        n_seen = 0
        for x, s, y in train_dl:
            targets = y.float().unsqueeze(1)
            logits = model(x.long(), s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(logits, targets)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch average is per sample.
            batch_n = y.shape[0]
            running_loss += loss.item()*batch_n
            n_seen += batch_n
        val_loss, val_acc = val_metrics(model, val_dl)
        if epoch % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (running_loss/n_seen, val_loss, val_acc))

In [79]:
def val_metrics(model, valid_dl):
    """Evaluate model on valid_dl and return (mean loss, accuracy).

    The model is switched to eval mode and stays in eval mode afterwards.
    Gradient tracking is disabled for the duration of the evaluation, since
    no backward pass is needed.

    Args:
        model: module called as model(x, s) and returning logits of shape (batch, 1).
        valid_dl: iterable of (x, s, y) batches, with y holding 0/1 labels.

    Returns:
        (loss, accuracy): the per-sample mean binary cross-entropy as a float,
        and the fraction of samples whose thresholded logit (> 0) equals the
        label, as a 0-dim tensor.

    Raises:
        ZeroDivisionError: if valid_dl yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in valid_dl:
            x = x.long()
            y = y.float().unsqueeze(1)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # A logit above 0 corresponds to a predicted probability above 0.5.
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [80]:
vocab_size = len(words)
print(vocab_size)
# embedding_dim=100 must match the width of embedding_matrix, which is copied into the
# embedding layer; hidden_dim=50 is the GRU hidden-state size.
model = GRUModel(vocab_size, 100, 50, embedding_matrix)

8686


In [81]:
train_epocs(model, epochs=40, lr=0.01)

train loss 0.691 val loss 0.696 and val accuracy 0.511
train loss 0.681 val loss 0.702 and val accuracy 0.508
train loss 0.659 val loss 0.713 and val accuracy 0.520
train loss 0.627 val loss 0.741 and val accuracy 0.507
train loss 0.613 val loss 0.749 and val accuracy 0.535
train loss 0.606 val loss 0.752 and val accuracy 0.512
train loss 0.593 val loss 0.757 and val accuracy 0.519
train loss 0.591 val loss 0.778 and val accuracy 0.519
