In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,time,date,Date,Open,High,Low,Close,Adj Close,Volume,diff,label
0,Twitter for iPhone,I will be announcing my Second Term Presidenti...,05-31-2019 20:35:41,35248,128039,2019-05-31 20:35:41,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
1,Twitter Media Studio,GREAT NEWS! #MAGA https://t.co/91Yk8B11bP,05-31-2019 20:02:16,20493,75339,2019-05-31 20:02:16,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
2,Twitter for iPhone,As we celebrate LGBT Pride Month and recognize...,05-31-2019 19:12:32,28936,136614,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
3,Twitter for iPhone,....on the basis of their sexual orientation. ...,05-31-2019 19:12:32,20416,105421,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0
4,Twitter for iPhone,.@SeanHannity is having a DEEP STATE SHOW toni...,05-31-2019 18:11:25,18257,65602,2019-05-31 18:11:25,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0


## Tokenization

In [4]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text.lower() for tok in my_tok.tokenizer(sub_br(x))]

In [5]:
x = df.loc[1, 'text']
spacy_tok(x)

['great', 'news', '!', '#', 'maga', 'https://t.co/91yk8b11bp']

In [6]:
df['words'] = df['text'].apply(spacy_tok)

In [7]:
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,time,date,Date,Open,High,Low,Close,Adj Close,Volume,diff,label,words
0,Twitter for iPhone,I will be announcing my Second Term Presidenti...,05-31-2019 20:35:41,35248,128039,2019-05-31 20:35:41,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[i, will, be, announcing, my, second, term, pr..."
1,Twitter Media Studio,GREAT NEWS! #MAGA https://t.co/91Yk8B11bP,05-31-2019 20:02:16,20493,75339,2019-05-31 20:02:16,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[great, news, !, #, maga, https://t.co/91yk8b1..."
2,Twitter for iPhone,As we celebrate LGBT Pride Month and recognize...,05-31-2019 19:12:32,28936,136614,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[as, we, celebrate, lgbt, pride, month, and, r..."
3,Twitter for iPhone,....on the basis of their sexual orientation. ...,05-31-2019 19:12:32,20416,105421,2019-05-31 19:12:32,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[...., on, the, basis, of, their, sexual, orie..."
4,Twitter for iPhone,.@SeanHannity is having a DEEP STATE SHOW toni...,05-31-2019 18:11:25,18257,65602,2019-05-31 18:11:25,2019-05-31,2019-05-31,2766.149902,2768.97998,2750.52002,2752.060059,2752.060059,3981020000,-14.089843,0,"[.@seanhannity, is, having, a, deep, state, sh..."


## Generate Vocab

In [8]:
df_words = df['words'].tolist()

In [9]:
import itertools
from collections import defaultdict

# Reshape
ws = list(itertools.chain(*df_words))

In [10]:
len(ws)

170020

In [11]:
word_count = Counter()
word_count.update(ws)

In [12]:
len(word_count.keys())

11961

In [13]:
# delete if occurs < 5 times and it is not in our pretrained embeddings
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]

In [14]:
len(word_count.keys())

2845

In [15]:
vocab2index = {"UNK":0} # init with unknown
words = ["UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [16]:
len(words)

2846

In [17]:
vocab_size = len(words)

## Bag of words representation

In [18]:
def bow(x, vocab2index):
    enc = np.zeros(len(vocab2index.keys()))
    words = set(x)
    for word in words:
        enc[vocab2index.get(word, 0)] = 1 # 0 if the UNK index
    return enc

In [19]:
maga = df.loc[1, 'words']
maga

['great', 'news', '!', '#', 'maga', 'https://t.co/91yk8b11bp']

In [20]:
bow(maga, vocab2index)

array([1., 0., 0., ..., 0., 0., 0.])

## Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

df_train, df_test, y_train, y_test = train_test_split(df, df['label'], test_size=0.2, random_state=42)

In [22]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [23]:
len(df_train)

3869

In [24]:
len(df_test)

968

## Dataset

In [25]:
df.columns

Index(['source', 'text', 'created_at', 'retweet_count', 'favorite_count',
       'time', 'date', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close',
       'Volume', 'diff', 'label', 'words'],
      dtype='object')

In [26]:
# standarize star rating
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [27]:
df_cont = df[['retweet_count', 'favorite_count']]
scaler.fit(df_cont.values)



StandardScaler(copy=True, with_mean=True, with_std=True)

In [28]:
class BOW(Dataset):
    def __init__(self, df, vocab2index, scaler):
        self.words = df['words'].tolist()
        self.df_cont = df[['retweet_count', 'favorite_count']]
        self.df_cont = scaler.transform(self.df_cont.values)
        self.y = df['label']
        self.vocab2index = vocab2index
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.words[idx]
        x = bow(x, self.vocab2index)
        return x, self.df_cont[idx], self.y[idx]

In [29]:
train_ds = BOW(df_train, vocab2index, scaler)
test_ds = BOW(df_test, vocab2index, scaler)



In [30]:
train_ds[0]

(array([0., 0., 0., ..., 0., 0., 0.]), array([-0.43412969, -0.50554216]), 1)

In [31]:
batch_size = 500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size)

## Model

In [123]:
class BowModel(nn.Module):
    def __init__(self, vocab_size):
        super(BowModel, self).__init__()
        self.input_size = vocab_size + 2
        self.linear1 = nn.Linear(self.input_size, 50)
        self.linear2 = nn.Linear(50, 1)  # binary
        self.bn = nn.BatchNorm1d(50)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x_encoding, x_cont):
        x = torch.cat((x_encoding, x_cont), 1) 
        x = self.linear1(x)
        x = self.dropout(x)
        x = F.relu(self.bn(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x  

In [124]:
x1, x2, y = next(iter(train_dl))
x1 = x1.float()
x2 = x2.float()
y = y.float()

In [125]:
model = BowModel(vocab_size)
y_hat = model(x1, x2)

In [126]:
F.binary_cross_entropy_with_logits(y_hat, y.unsqueeze(1))

tensor(0.7001, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [127]:
def val_metrics(model, test_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x1, x2, y in test_dl:
        x1 = x1.float()
        x2 = x2.float()
        y = y.float().unsqueeze(1)
        y_hat = model(x1, x2)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, (correct/total).item()

In [128]:
val_metrics(model, test_dl)

(0.6932092845932512, 0.4886363744735718)

## Train Loop

In [129]:
def train_epocs(model, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        loss_sum = 0
        for x1, x2, y in train_dl:
            y_hat = model(x1.float(), x2.float())
            y = y.float().unsqueeze(1)
            loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += y.shape[0]
            loss_sum += loss.item()*y.shape[0]
        val_loss, val_acc = val_metrics(model, test_dl)
        print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss_sum/total, val_loss, val_acc))

In [130]:
train_epocs(model, lr = 0.005, epochs = 10)

train loss 0.698 val loss 0.693 and accuracy 0.518
train loss 0.677 val loss 0.693 and accuracy 0.524
train loss 0.659 val loss 0.695 and accuracy 0.513
train loss 0.632 val loss 0.696 and accuracy 0.503
train loss 0.593 val loss 0.701 and accuracy 0.505
train loss 0.553 val loss 0.715 and accuracy 0.508
train loss 0.502 val loss 0.739 and accuracy 0.505
train loss 0.467 val loss 0.758 and accuracy 0.513
train loss 0.439 val loss 0.813 and accuracy 0.518
train loss 0.410 val loss 0.838 and accuracy 0.502


In [131]:
train_epocs(model, lr = 0.001, epochs = 5)

train loss 0.400 val loss 0.818 and accuracy 0.514
train loss 0.381 val loss 0.823 and accuracy 0.514
train loss 0.376 val loss 0.832 and accuracy 0.515
train loss 0.348 val loss 0.841 and accuracy 0.520
train loss 0.353 val loss 0.866 and accuracy 0.521
