In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import re
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from IPython import display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

%matplotlib inline

# Use the first CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [2]:
# Path of the training CSV that is read with pandas in the following cell.
TRAIN_FILE = 'train-balanced-sarcasm.csv'

In [3]:
# Load the whole training table into memory and preview its first rows.
train_df = pd.read_csv(TRAIN_FILE)
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
# (rows, columns) of the table as loaded, before any rows are dropped.
train_df.shape

(1010826, 10)

In [5]:
# Per-column dtypes and non-null counts; used to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 10 columns):
label             1010826 non-null int64
comment           1010773 non-null object
author            1010826 non-null object
subreddit         1010826 non-null object
score             1010826 non-null int64
ups               1010826 non-null int64
downs             1010826 non-null int64
date              1010826 non-null object
created_utc       1010826 non-null object
parent_comment    1010826 non-null object
dtypes: int64(4), object(6)
memory usage: 77.1+ MB


In [6]:
# Keep only the rows that actually have a comment text.
train_df = train_df.dropna(subset=['comment'])

In [7]:
# Class balance check: number of rows per value of the `label` column.
train_df['label'].value_counts()

0    505405
1    505368
Name: label, dtype: int64

In [8]:
# Split the raw comment strings and their labels into train and validation parts.
# Only the seed is fixed here; the split proportion is left at scikit-learn's default.
train_texts, valid_texts, y_train, y_valid = train_test_split(train_df['comment'].values, train_df['label'].values, random_state=17)

In [27]:
def preprocessing(texts):
    """Lower-case every text and surround each punctuation-like character with spaces.

    A character is padded when it is neither a literal space nor a regex
    word character (``\\w``), so a later whitespace split yields it as its
    own token.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of ``str``, one processed string per input text, in input order.
    """
    processed = []
    for text in texts:
        lowered = str.lower(text)
        processed.append(re.sub(r"([^ \w])", r" \1 ", lowered))
    return processed

def tokenization(texts):
    """Split every text on runs of whitespace.

    Args:
        texts: iterable of strings.

    Returns:
        list of token lists, one per input text, in input order.
    """
    token_lists = []
    for text in texts:
        token_lists.append(text.split())
    return token_lists

def build_vocabulary(data):
    """Assign a consecutive integer id to every distinct token.

    Ids are handed out in order of first appearance, starting at 0.

    Args:
        data: iterable of token sequences (e.g. a list of token lists).

    Returns:
        dict mapping token -> id.
    """
    vocab = dict()
    for tokens in data:
        for token in tokens:
            # Explicit membership test instead of a bare ``except``, which
            # would also have swallowed KeyboardInterrupt and real errors.
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def build_embeddings_glove(file_path, vocab, d=300):
    """Read a GloVe-style text file and keep the vectors of words found in ``vocab``.

    Every line is expected to be ``word v1 v2 ... vd`` separated by whitespace.
    Lines that are blank, do not have exactly ``d`` components, or whose
    components cannot be parsed as floats are skipped.

    Args:
        file_path: path to the UTF-8 encoded embeddings text file.
        vocab: container supporting ``in`` (e.g. the dict from
            ``build_vocabulary``); only its membership test is used.
        d: expected number of components per vector.

    Returns:
        dict mapping word -> float32 vector of length ``d``, plus the key
        ``'UNK'`` holding the mean of all kept vectors (a zero vector when no
        word was kept).
    """
    emb_dict = dict()
    vector_sum = np.zeros(d)
    with open(file_path, 'r', encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Reject blank or wrong-width rows up front so a malformed vector
            # can never be stored in the dictionary.
            if len(values) != d + 1:
                continue
            word = values[0]
            if word not in vocab:
                continue
            try:
                vector = np.asarray(values[1:], "float32")
            except ValueError:
                # Non-numeric component: skip this row only.
                continue
            emb_dict[word] = vector
            vector_sum += vector
    # Mean of the kept vectors; avoid 0/0 (NaN) when nothing matched.
    emb_dict['UNK'] = vector_sum / len(emb_dict) if emb_dict else vector_sum
    return emb_dict

def build_w2v_dict(file_path, vocab, d=300):
    """Load binary word2vec vectors and keep those of words found in ``vocab``.

    Args:
        file_path: path to a word2vec file in binary format.
        vocab: iterable of words (e.g. the dict from ``build_vocabulary``).
        d: expected vector size; must equal the loaded model's vector size.

    Returns:
        dict mapping word -> vector, plus the key ``'UNK'`` holding the mean
        of all kept vectors (a zero vector when no word was kept).

    Raises:
        ValueError: if ``d`` differs from the model's vector size.
    """
    emb_dict = dict()
    vector_sum = np.zeros(d)
    w2v_model = KeyedVectors.load_word2vec_format(file_path, binary=True)
    # Fail loudly on a size mismatch; previously the resulting shape error was
    # swallowed per word and 'UNK' silently came out wrong.
    if w2v_model.vector_size != d:
        raise ValueError(
            "d={} does not match the word2vec vector size {}".format(d, w2v_model.vector_size)
        )
    for word in vocab:
        try:
            vector = w2v_model.get_vector(word)
        except KeyError:
            # Word is not part of the pretrained model.
            continue
        emb_dict[word] = vector
        vector_sum += vector
    # Mean of the kept vectors; avoid 0/0 (NaN) when nothing matched.
    emb_dict['UNK'] = vector_sum / len(emb_dict) if emb_dict else vector_sum
    return emb_dict

def build_emb_matrix_lr(data, emb_dict):
    """Represent every document by the mean of its token embeddings.

    Tokens missing from ``emb_dict`` contribute the ``'UNK'`` vector and are
    counted as unknown. A document without tokens is represented by a zero
    vector instead of NaN.

    Args:
        data: sequence of token lists.
        emb_dict: dict mapping token -> vector; must contain the key ``'UNK'``.

    Returns:
        tuple ``(X, unk_rate)`` where ``X`` is an array with one row per
        document and ``unk_rate`` is unknown tokens / all tokens (``0.0``
        when there are no tokens at all).
    """
    unk_vector = emb_dict['UNK']
    dim = len(unk_vector)
    rows = []
    cnt_unk = 0
    cnt_total = 0
    for tokens in data:
        sentence_emb = np.zeros(dim)
        for token in tokens:
            vector = emb_dict.get(token)
            if vector is None:
                cnt_unk += 1
                vector = unk_vector
            sentence_emb += vector
        n_tokens = len(tokens)
        cnt_total += n_tokens
        # Guard the mean: dividing by zero tokens would produce a NaN row.
        if n_tokens:
            sentence_emb /= n_tokens
        rows.append(sentence_emb)
    unk_rate = cnt_unk / cnt_total if cnt_total else 0.0
    return np.array(rows), unk_rate

def build_emb_dict_nn(file_path, vocab, d=300):
    """Build the GloVe embedding dictionary for the neural models.

    Same as ``build_embeddings_glove`` (vectors of in-vocabulary words plus
    an ``'UNK'`` entry), with an additional all-zero ``'PAD'`` vector.

    Args:
        file_path: path to the GloVe-style text file.
        vocab: vocabulary used to filter the file's words.
        d: embedding dimensionality.

    Returns:
        dict mapping word -> vector, including the keys ``'UNK'`` and ``'PAD'``.
    """
    # Reuse the file parser instead of keeping a second copy of it; 'PAD' is
    # added afterwards so it does not take part in the 'UNK' mean.
    emb_dict = build_embeddings_glove(file_path, vocab, d=d)
    emb_dict['PAD'] = np.zeros(d)
    return emb_dict

def build_emb_matrix_nn(file_path, vocab, d=300):
    """Build the GloVe embedding matrix and the matching word -> row index map.

    Row 0 is ``'UNK'``, row 1 is ``'PAD'``; the remaining words follow in
    sorted order.

    Args:
        file_path: path to the GloVe-style text file.
        vocab: vocabulary used to filter the file's words.
        d: embedding dimensionality.

    Returns:
        tuple ``(emb_matrix, word2idx)``: a ``torch`` tensor with one row per
        entry of the embedding dictionary, and the dict giving each word's row.
    """
    emb_dict = build_emb_dict_nn(file_path, vocab, d=d)
    special = ['UNK', 'PAD']
    regular = sorted(word for word in emb_dict if word not in special)
    # Special tokens first, so 'UNK' -> 0 and 'PAD' -> 1.
    word2idx = {word: idx for idx, word in enumerate(special + regular)}
    emb_matrix = np.zeros((len(emb_dict), d))
    for word, idx in word2idx.items():
        emb_matrix[idx] = emb_dict[word]
    return torch.tensor(emb_matrix), word2idx

def build_w2v_dict_nn(file_path, vocab, d=300):
    """Build the word2vec embedding dictionary for the neural models.

    Same as ``build_w2v_dict`` (vectors of in-vocabulary words plus an
    ``'UNK'`` entry), with an additional all-zero ``'PAD'`` vector.

    Args:
        file_path: path to a word2vec file in binary format.
        vocab: vocabulary used to select words from the model.
        d: embedding dimensionality.

    Returns:
        dict mapping word -> vector, including the keys ``'UNK'`` and ``'PAD'``.
    """
    # Reuse the word2vec loader instead of keeping a second copy of it; 'PAD'
    # is added afterwards so it does not take part in the 'UNK' mean.
    emb_dict = build_w2v_dict(file_path, vocab, d=d)
    emb_dict['PAD'] = np.zeros(d)
    return emb_dict

def build_emb_matrix_nn_w2v(file_path, vocab, d=300):
    """Build the word2vec embedding matrix and the matching word -> row index map.

    Row 0 is ``'UNK'``, row 1 is ``'PAD'``; the remaining words follow in
    sorted order.

    Args:
        file_path: path to a word2vec file in binary format.
        vocab: vocabulary used to select words from the model.
        d: embedding dimensionality.

    Returns:
        tuple ``(emb_matrix, word2idx)``: a ``torch`` tensor with one row per
        entry of the embedding dictionary, and the dict giving each word's row.
    """
    emb_dict = build_w2v_dict_nn(file_path, vocab, d=d)
    word2idx = {'UNK': 0, 'PAD': 1}
    ordinary_words = set(emb_dict.keys()).difference(['UNK', 'PAD'])
    for word in sorted(ordinary_words):
        word2idx[word] = len(word2idx)
    # Copy every vector into the row assigned to its word.
    emb_matrix = np.zeros((len(emb_dict), d))
    for word in word2idx:
        emb_matrix[word2idx[word]] = emb_dict[word]
    emb_matrix = torch.tensor(emb_matrix)
    return emb_matrix, word2idx

class LR_Doc2Vec:
    """Logistic regression on top of vectors inferred by a Doc2Vec model.

    Every public method first turns the tokenised documents into vectors via
    ``doc2vec_model.infer_vector`` and then forwards them to the wrapped
    ``LogisticRegression``.
    """

    def __init__(self, doc2vec_model, C=1.0):
        """Store the Doc2Vec model and create the classifier with strength ``C``."""
        super().__init__()
        self.doc2vec_model = doc2vec_model
        self.C = C
        self.lr = LogisticRegression(C=C, random_state=13)

    def load_embeddings(self, X):
        """Return an array with one inferred vector per document in ``X``."""
        return np.array([self.doc2vec_model.infer_vector(doc) for doc in X])

    def fit(self, X_train, y_train):
        """Fit the classifier on the inferred training vectors; returns ``self``."""
        self.lr.fit(self.load_embeddings(X_train), y_train)
        return self

    def predict(self, X_test):
        """Return the classifier's predicted labels for ``X_test``."""
        return self.lr.predict(self.load_embeddings(X_test))

    def predict_proba(self, X_test):
        """Return the classifier's class probabilities for ``X_test``."""
        return self.lr.predict_proba(self.load_embeddings(X_test))

class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over pretrained embeddings.

    Args:
        emb_matrix: 2-D tensor of pretrained embeddings, one row per token id.
        hidden_size: hidden size of each LSTM direction.
        output_size: number of output classes.
        freeze_emb: when True the embedding weights are not trained; when
            False they receive gradients.
    """

    def __init__(self, emb_matrix, hidden_size=64, output_size=2, freeze_emb=True):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        # from_pretrained freezes the weights by default, so the flag has to be
        # passed through explicitly for freeze_emb=False to have any effect.
        self.embedding = nn.Embedding.from_pretrained(emb_matrix, freeze=freeze_emb)
        self.lstm = nn.LSTM(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True
        )
        self.fc = nn.Linear(2 * hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            tensor of shape (batch, output_size); each row sums to 1.
        """
        x_emb = self.embedding(x)
        # The embedding may be stored in another float dtype; cast to float32
        # before the LSTM.
        # lstm_out: (batch, seq_len, num_directions * hidden_size)
        lstm_out, _ = self.lstm(x_emb.float())
        # (batch, seq_len, num_directions, hidden_size)
        lstm_out = lstm_out.view(lstm_out.shape[0], lstm_out.shape[1], -1, self.hidden_size)
        # lstm_out[:, :, 0, :] -- output of the forward LSTM
        # lstm_out[:, :, 1, :] -- output of the backward LSTM
        # Take the forward direction at the last position and the backward
        # direction at the first position. Positions are absolute, so any
        # padding at the end of a row is part of what the forward pass has seen.
        x_fc = torch.cat((lstm_out[:, -1, 0, :], lstm_out[:, 0, 1, :]), dim=1)
        fc_out = self.fc(x_fc)
        out = self.softmax(fc_out)
        return out

def as_matrix(documents, word2idx, max_len=None):
    """Convert tokenised documents into a right-padded matrix of token ids.

    Unknown tokens map to the ``'UNK'`` index and unused positions are filled
    with the ``'PAD'`` index. Both indices are read from ``word2idx`` and
    fall back to 0 and 1 respectively when the keys are absent.

    Args:
        documents: sequence of token lists.
        word2idx: dict mapping token -> integer id.
        max_len: optional cap on the number of columns; longer documents are
            truncated. The width never exceeds the longest document.

    Returns:
        ``np.int64`` array of shape (len(documents), width); an empty
        ``documents`` gives shape (0, 0).
    """
    unk_idx = word2idx.get('UNK', 0)
    pad_idx = word2idx.get('PAD', 1)
    # default=0 keeps an empty batch from raising inside max().
    max_doc_len = max(map(len, documents), default=0)
    if max_len is None:
        max_len = max_doc_len
    else:
        max_len = min(max_doc_len, max_len)
    matrix = np.full((len(documents), max_len), pad_idx, dtype=np.int64)
    for i, doc in enumerate(documents):
        row_ix = [word2idx.get(word, unk_idx) for word in doc[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    return matrix

def predict_bilstm(model, dev_data, word2idx, max_len=300, device=device, batch_size=16):
    """Score documents with ``model`` in mini-batches, without tracking gradients.

    Args:
        model: callable returning, for a batch of token ids, a 2-D tensor
            whose column 1 is used as the score.
        dev_data: sequence of token lists.
        word2idx: dict mapping token -> integer id, passed to ``as_matrix``.
        max_len: cap on the sequence length, passed to ``as_matrix``.
        device: device the input batches are moved to.
        batch_size: number of documents per forward pass.

    Returns:
        1-D float ``numpy`` array with one score per document.
    """
    n_docs = len(dev_data)
    y_pred = np.zeros(n_docs, dtype=float)
    with torch.no_grad():
        for start in range(0, n_docs, batch_size):
            stop = start + batch_size
            batch = as_matrix(dev_data[start:stop], word2idx, max_len)
            batch = torch.tensor(batch).long().to(device)
            # Column 1 of the model output is kept as the prediction.
            scores = model(batch)[:, 1]
            y_pred[start:stop] = scores.cpu().detach().numpy()
    return y_pred

def set_random_seeds(seed_value=13, device='cpu'):
    '''Seed NumPy, PyTorch and ``random``; on non-CPU devices also seed CUDA and pin cuDNN.

    source https://forums.fast.ai/t/solved-reproducibility-where-is-the-randomness-coming-in/31628/5

    Args:
        seed_value: seed passed to every generator.
        device: device name as a string (e.g. 'cpu', 'cuda:0') or a
            ``torch.device``.
    '''
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    random.seed(seed_value)
    # Compare the device *type* via its string form: a torch.device never
    # compares equal to the string 'cpu', so the direct comparison treated
    # torch.device('cpu') as a GPU.
    device_type = str(device).split(':')[0]
    if device_type != 'cpu':
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

### LR + TF-IDF

In [10]:
%%time

# TF-IDF features over 1- to 3-grams. The vectorizer is fitted on the training
# texts only and then applied unchanged to the validation texts.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), max_features=50000, min_df=5)
train_texts_tfidf = tfidf_vec.fit_transform(train_texts)
valid_texts_tfidf = tfidf_vec.transform(valid_texts)

Wall time: 53 s


In [11]:
%%time

# Logistic regression on the TF-IDF features; the printed score is the ROC AUC
# of probability column 1 on the validation split.
lr = LogisticRegression(solver='lbfgs', max_iter=500, random_state=13)
lr.fit(train_texts_tfidf, y_train)
y_pred_lr_tfidf = lr.predict_proba(valid_texts_tfidf)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_tfidf))

0.7942782786624244
Wall time: 28.4 s


### LR + CountVectorizer

In [12]:
%%time

# Raw n-gram counts over 1- to 3-grams. The vectorizer is fitted on the
# training texts only and then applied unchanged to the validation texts.
cnt_vec = CountVectorizer(ngram_range=(1, 3), max_features=40000, min_df=1)
train_texts_cnt = cnt_vec.fit_transform(train_texts)
valid_texts_cnt = cnt_vec.transform(valid_texts)

Wall time: 50.4 s


In [13]:
%%time

# Same classifier settings as for TF-IDF, now on the count features; `lr` is
# rebound to a fresh model here.
lr = LogisticRegression(solver='lbfgs', max_iter=500, random_state=13)
lr.fit(train_texts_cnt, y_train)
y_pred_lr_cnt = lr.predict_proba(valid_texts_cnt)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_cnt))

0.7877326249300394
Wall time: 55.2 s


### LR + GloVe

In [14]:
%%time

# Lower-case, separate punctuation and split both splits into token lists.
train_tokens = tokenization(preprocessing(train_texts))
valid_tokens = tokenization(preprocessing(valid_texts))

# The vocabulary is built from the training tokens only.
vocab = build_vocabulary(train_tokens)
print("Vocabulary size:", len(vocab))

Vocabulary size: 143374
Wall time: 12.2 s


In [15]:
# Keep only the GloVe vectors of in-vocabulary words; the printed count
# includes the extra 'UNK' entry added by the builder.
emb_dict = build_embeddings_glove('glove.6B.300d.txt', vocab)
print('Unique vectors in embeddings dictionary:', len(emb_dict))

# One mean-embedding row per comment, plus the share of tokens that had no
# vector of their own.
train_emb_matrix, train_unk = build_emb_matrix_lr(train_tokens, emb_dict)
valid_emb_matrix, valid_unk = build_emb_matrix_lr(valid_tokens, emb_dict)
print('Train embedding matrix shape:', train_emb_matrix.shape)
print('Train: {:.2f}% unknown words'.format(train_unk * 100))
print('Valid embedding matrix shape:', valid_emb_matrix.shape)
print('Valid: {:.2f}% unknown words'.format(valid_unk * 100))

Unique vectors in embeddings dictionary: 78684
Train embedding matrix shape: (758079, 300)
Train: 1.34% unknown words
Valid embedding matrix shape: (252694, 300)
Valid: 1.59% unknown words


In [16]:
%%time

# Logistic regression on the mean GloVe vectors. Unlike the other LR cells this
# one uses C=5 and the 'sag' solver.
lr = LogisticRegression(C=5, solver='sag', max_iter=500, random_state=13)
lr.fit(train_emb_matrix, y_train)
y_pred_lr_glove = lr.predict_proba(valid_emb_matrix)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_glove))

0.6748553399754051
Wall time: 46.7 s


### LR + Word2Vec

In [17]:
# Rebinds `emb_dict` (previously GloVe) to the word2vec vectors of
# in-vocabulary words; the printed count includes the extra 'UNK' entry.
emb_dict = build_w2v_dict('googlenews/GoogleNews-vectors-negative300.bin', vocab)
print('Unique vectors in embeddings dictionary:', len(emb_dict))

# Rebinds the train/valid matrices as well: one mean-embedding row per comment,
# plus the share of tokens that had no vector of their own.
train_emb_matrix, train_unk = build_emb_matrix_lr(train_tokens, emb_dict)
valid_emb_matrix, valid_unk = build_emb_matrix_lr(valid_tokens, emb_dict)
print('Train embedding matrix shape:', train_emb_matrix.shape)
print('Train: {:.2f}% unknown words'.format(train_unk * 100))
print('Valid embedding matrix shape:', valid_emb_matrix.shape)
print('Valid: {:.2f}% unknown words'.format(valid_unk * 100))

Unique vectors in embeddings dictionary: 62373
Train embedding matrix shape: (758079, 300)
Train: 23.67% unknown words
Valid embedding matrix shape: (252694, 300)
Valid: 23.71% unknown words


In [18]:
%%time

# Logistic regression on the mean word2vec vectors built in the previous cell.
lr = LogisticRegression(solver='lbfgs', max_iter=500, random_state=13)
lr.fit(train_emb_matrix, y_train)
y_pred_lr_w2v = lr.predict_proba(valid_emb_matrix)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_w2v))

0.673213417887968
Wall time: 24.7 s


### LR + Doc2Vec

In [19]:
%%time

# Tag every training comment with its position in `train_tokens` and train a
# 300-dimensional Doc2Vec model on those documents for 5 epochs.
# NOTE(review): no seed is passed to Doc2Vec here, so results presumably vary
# between runs -- confirm against the gensim documentation.
train_docs = [TaggedDocument(d, [i]) for (i, d) in enumerate(train_tokens)]
doc2vec_model = Doc2Vec(vector_size=300, min_count=1, epochs=5)
doc2vec_model.build_vocab(train_docs)
doc2vec_model.train(train_docs, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

Wall time: 3min 8s


In [20]:
%%time

# Infer a vector for every training comment with the trained Doc2Vec model and
# fit logistic regression on those vectors.
lr_doc2vec = LR_Doc2Vec(doc2vec_model)
lr_doc2vec.fit(train_tokens, y_train)

Wall time: 3min 51s


In [21]:
%%time

# Vectors for the validation comments are inferred inside predict_proba; the
# printed score is the ROC AUC of probability column 1.
y_pred_lr_doc2vec = lr_doc2vec.predict_proba(valid_tokens)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_doc2vec))

0.6774603171532291
Wall time: 49.2 s


### BiLSTM + GloVe

In [25]:
%%time

# Embedding matrix and word -> row map for the BiLSTM; the row count includes
# the special 'UNK' and 'PAD' rows.
emb_matrix, word2idx = build_emb_matrix_nn('glove.6B.300d.txt', vocab)
print('Unique vectors in embedding matrix:', len(emb_matrix))

Vocabulary size: 143374
Unique vectors in embedding matrix: 78685
Wall time: 19.9 s


In [26]:
%%time

# Weights are loaded from disk; no training happens in this notebook.
# NOTE(review): hidden_size and the embedding matrix presumably have to match
# what the checkpoint was trained with -- confirm against the training code.
BILSTM_STATE_DICT = 'bilstm_glove.pt'
hidden_size = 128

bilstm = BiLSTM(emb_matrix, hidden_size).to(device)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
bilstm.load_state_dict(torch.load(BILSTM_STATE_DICT, map_location=device))
bilstm.eval()

y_pred_bilstm_glove = predict_bilstm(bilstm, valid_tokens, word2idx)
print(roc_auc_score(y_valid, y_pred_bilstm_glove))

0.8097815263231354
Wall time: 35.4 s


### BiLSTM + Word2Vec

In [28]:
%%time

# Rebinds `emb_matrix` and `word2idx` (previously GloVe) to the word2vec
# versions; the row count includes the special 'UNK' and 'PAD' rows.
emb_matrix, word2idx = build_emb_matrix_nn_w2v('googlenews/GoogleNews-vectors-negative300.bin', vocab)
print('Unique vectors in embedding matrix:', len(emb_matrix))

Unique vectors in embedding matrix: 62374
Wall time: 1min 4s


In [29]:
%%time

# Same evaluation as for GloVe, with the word2vec embedding matrix and its own
# checkpoint; `BILSTM_STATE_DICT` and `bilstm` are rebound here.
# NOTE(review): hidden_size and the embedding matrix presumably have to match
# what the checkpoint was trained with -- confirm against the training code.
BILSTM_STATE_DICT = 'bilstm_w2v.pt'
hidden_size = 128

bilstm = BiLSTM(emb_matrix, hidden_size).to(device)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
bilstm.load_state_dict(torch.load(BILSTM_STATE_DICT, map_location=device))
bilstm.eval()

y_pred_bilstm_w2v = predict_bilstm(bilstm, valid_tokens, word2idx)
print(roc_auc_score(y_valid, y_pred_bilstm_w2v))

0.7942717703722182
Wall time: 33.4 s
