In [1]:
import numpy as np
import os
import pandas as pd
import re
from collections import Counter
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Path of the training CSV; passed to pd.read_csv when the data is loaded.
TRAIN_FILE = 'train-balanced-sarcasm.csv'

In [3]:
# Read the CSV named by TRAIN_FILE into a DataFrame and preview its first rows.
train_df = pd.read_csv(TRAIN_FILE)
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
# (rows, columns) of the loaded frame.
train_df.shape

(1010826, 10)

In [5]:
# Summarise column dtypes and non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 10 columns):
label             1010826 non-null int64
comment           1010773 non-null object
author            1010826 non-null object
subreddit         1010826 non-null object
score             1010826 non-null int64
ups               1010826 non-null int64
downs             1010826 non-null int64
date              1010826 non-null object
created_utc       1010826 non-null object
parent_comment    1010826 non-null object
dtypes: int64(4), object(6)
memory usage: 77.1+ MB


In [6]:
# Drop rows whose `comment` is missing. Reassigning (rather than inplace=True)
# keeps the step an explicit "new frame from old frame" expression, so the
# frame displayed by earlier cells is not silently mutated behind their output.
train_df = train_df.dropna(subset=['comment'])

In [7]:
# Number of rows per value of the `label` column (class balance check).
train_df['label'].value_counts()

0    505405
1    505368
Name: label, dtype: int64

In [8]:
# Split comments and labels into train / validation parts. No test_size is
# passed, so scikit-learn's default split proportion applies; random_state=17
# makes the shuffle reproducible.
train_texts, valid_texts, y_train, y_valid = train_test_split(train_df['comment'], train_df['label'], random_state=17)

In [9]:
def preprocessing(texts):
    """Lower-case each text and isolate its punctuation.

    Every character that is neither a space nor a word character (``\\w``)
    is surrounded by one space on each side, so that a later whitespace
    split yields it as a separate token.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of str: one normalised string per input text, in input order.
    """
    cleaned = []
    for text in texts:
        lowered = str.lower(text)
        cleaned.append(re.sub(r"([^ \w])", r" \1 ", lowered))
    return cleaned

def tokenization(texts):
    """Split each text on runs of whitespace.

    Args:
        texts: iterable of strings.

    Returns:
        list of list of str: the tokens of each text, in input order. A text
        with no non-whitespace characters yields an empty list.
    """
    token_lists = []
    for text in texts:
        token_lists.append(text.split())
    return token_lists

def build_vocabulary(data):
    """Assign an integer id to every distinct token.

    Ids are handed out in order of first appearance, starting at 0.

    Args:
        data: iterable of token sequences (e.g. a list of lists of str).

    Returns:
        dict: token -> id.
    """
    vocab = {}
    for tokens in data:
        for token in tokens:
            # Explicit membership test instead of a bare `except:` around a
            # lookup, which would also swallow KeyboardInterrupt and
            # unrelated errors.
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def build_embeddings(file_path, vocab, d=300):
    """Load the vectors of in-vocabulary words from a text embedding file.

    Each useful line of the file holds a word followed by ``d``
    whitespace-separated numeric components. Lines that are blank, have a
    different number of components, or contain a non-numeric component are
    skipped, as are words absent from ``vocab``.

    Args:
        file_path: path of the UTF-8 text file with the embeddings.
        vocab: container supporting ``in`` with the words to keep.
        d: number of components per vector.

    Returns:
        dict: word -> ``float32`` array of shape ``(d,)``. The extra key
        ``'UNK'`` holds the mean of the loaded vectors, or a zero vector
        when no word of ``vocab`` was found in the file.
    """
    emb_dict = {}
    unk_sum = np.zeros(d)
    with open(file_path, 'r', encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Reject blank and wrong-width rows before touching any state.
            # Without this check a short row would be stored in emb_dict and
            # could even be broadcast into the running sum.
            if len(values) != d + 1:
                continue
            word = values[0]
            if word not in vocab:
                continue
            try:
                vector = np.asarray(values[1:], "float32")
            except ValueError:
                # A component is not a number: treat the row as malformed.
                continue
            emb_dict[word] = vector
            unk_sum += vector
    # Computed before the 'UNK' key is inserted, so it counts real words only.
    n_loaded = len(emb_dict)
    # Guard the mean: dividing by zero would make 'UNK' an all-NaN vector.
    emb_dict['UNK'] = unk_sum / n_loaded if n_loaded else unk_sum
    return emb_dict

def build_emb_matrix(data, emb_dict):
    """Represent each token sequence by the mean of its token vectors.

    Tokens missing from ``emb_dict`` contribute the ``'UNK'`` vector and are
    counted as unknown. A sequence with no tokens is represented by a zero
    vector instead of the NaN row that a division by zero would produce.

    Args:
        data: iterable of token lists.
        emb_dict: dict mapping token -> vector; must contain the key
            ``'UNK'``, whose length defines the vector size.

    Returns:
        tuple: ``(X, unk_share)`` where ``X`` is an array with one row per
        sequence and ``unk_share`` is the fraction of all tokens that were
        unknown (``0.0`` when there are no tokens at all).
    """
    unk_vector = emb_dict['UNK']
    dim = len(unk_vector)
    X = []
    cnt_unk = 0
    cnt_total = 0
    for d in data:
        sentence_emb = np.zeros(dim)
        for w in d:
            # Plain dict lookup instead of a bare `except:` around indexing.
            vector = emb_dict.get(w)
            if vector is None:
                cnt_unk += 1
                vector = unk_vector
            sentence_emb += vector
        cnt_total += len(d)
        # Only average non-empty sequences; an empty one stays all zeros.
        if len(d) > 0:
            sentence_emb = sentence_emb / len(d)
        X.append(sentence_emb)
    unk_share = cnt_unk / cnt_total if cnt_total else 0.0
    return np.array(X), unk_share

In [10]:
%%time

# Normalise and tokenise both splits with the helpers defined above.
train_tokens = tokenization(preprocessing(train_texts))
valid_tokens = tokenization(preprocessing(valid_texts))

# The vocabulary is built from the training tokens only.
vocab = build_vocabulary(train_tokens)
print("Vocabulary size:", len(vocab))

# Keep the GloVe vectors of vocabulary words; the dictionary also gets an
# 'UNK' entry, so its size is the number of matched words plus one.
emb_dict = build_embeddings('glove.6B.300d.txt', vocab)
print('Unique vectors in embeddings dictionary:', len(emb_dict))

# One averaged vector per comment, plus the share of tokens that had no
# vector of their own and fell back to 'UNK'.
train_emb_matrix, train_unk = build_emb_matrix(train_tokens, emb_dict)
valid_emb_matrix, valid_unk = build_emb_matrix(valid_tokens, emb_dict)
print('Train embedding matrix shape:', train_emb_matrix.shape)
print('Train: {:.2f}% unknown words'.format(train_unk * 100))
print('Valid embedding matrix shape:', valid_emb_matrix.shape)
print('Valid: {:.2f}% unknown words'.format(valid_unk * 100))

Vocabulary size: 143374
Unique vectors in embeddings dictionary: 78684
Train embedding matrix shape: (758079, 300)
Train: 1.34% unknown words
Valid embedding matrix shape: (252694, 300)
Valid: 1.59% unknown words
Wall time: 50.9 s


In [11]:
%%time

# Fit a logistic regression on the averaged-embedding features of the
# training split; random_state=13 fixes the solver's randomness.
lr = LogisticRegression(C=5, solver='sag', max_iter=500, random_state=13)
lr.fit(train_emb_matrix, y_train)
# Column 1 of predict_proba is used as the score for label 1 (labels are 0/1).
y_pred_lr_glove = lr.predict_proba(valid_emb_matrix)[:, 1]
# Validation ROC AUC of the GloVe + logistic regression baseline.
print(roc_auc_score(y_valid, y_pred_lr_glove))

0.6748553399754051
Wall time: 47.8 s
