In [1]:
import numpy as np
import os
import pandas as pd
import re
from collections import Counter
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# CSV with the labelled comments; a bare filename, so it is resolved against
# the notebook's working directory.
TRAIN_FILE = 'train-balanced-sarcasm.csv'

In [3]:
# Read the whole CSV into memory, then preview the first rows.
train_df = pd.read_csv(TRAIN_FILE)
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
# (n_rows, n_columns) of the frame as loaded, before any rows are dropped.
train_df.shape

(1010826, 10)

In [5]:
# Per-column dtypes and non-null counts, used to spot missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 10 columns):
label             1010826 non-null int64
comment           1010773 non-null object
author            1010826 non-null object
subreddit         1010826 non-null object
score             1010826 non-null int64
ups               1010826 non-null int64
downs             1010826 non-null int64
date              1010826 non-null object
created_utc       1010826 non-null object
parent_comment    1010826 non-null object
dtypes: int64(4), object(6)
memory usage: 77.1+ MB


In [6]:
# `comment` is the only column with missing values in the info() output above.
# Drop those rows: preprocessing() calls str.lower on every comment and would
# raise on a NaN. Note this mutates train_df in place.
train_df.dropna(subset=['comment'], inplace=True)

In [7]:
# Class balance check after dropping the rows with a missing comment.
train_df['label'].value_counts()

0    505405
1    505368
Name: label, dtype: int64

In [8]:
# Hold-out split of comment texts and labels. No test_size/train_size is given,
# so scikit-learn's default proportion applies; random_state pins the shuffle.
train_texts, valid_texts, y_train, y_valid = train_test_split(train_df['comment'], train_df['label'], random_state=17)

In [9]:
def preprocessing(texts):
    """Lowercase each text and pad every non-word, non-space character with spaces.

    Each character that is neither a literal space nor a regex word character
    is replaced by itself surrounded by single spaces, so that a later
    whitespace split yields it as a separate token.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of ``str``: one cleaned string per input text, in input order.
    """
    cleaned = []
    for text in texts:
        lowered = str.lower(text)
        cleaned.append(re.sub(r"([^ \w])", r" \1 ", lowered))
    return cleaned

def tokenization(texts):
    """Split each text into a list of tokens on runs of whitespace.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of list of ``str``: one token list per input text, in input
        order. An empty or whitespace-only text gives an empty list.
    """
    token_lists = []
    for text in texts:
        token_lists.append(text.split())
    return token_lists

class LR_Doc2Vec:
    """Logistic regression classifier on top of Doc2Vec-inferred vectors.

    Every document passed to ``fit``, ``predict`` or ``predict_proba`` is first
    converted to a vector with ``doc2vec_model.infer_vector``; the stacked
    vectors are then handed to a scikit-learn ``LogisticRegression``.
    """

    def __init__(self, doc2vec_model, C=1.0):
        """Store the embedding model and build the unfitted classifier.

        Args:
            doc2vec_model: object exposing ``infer_vector(tokens)``.
            C: value forwarded to ``LogisticRegression(C=...)``.
        """
        super().__init__()
        self.C = C
        self.doc2vec_model = doc2vec_model
        self.lr = LogisticRegression(C=C, random_state=13)

    def load_embeddings(self, X):
        """Infer one vector per document and stack them into a NumPy array.

        Args:
            X: iterable of documents, each in the form accepted by
                ``doc2vec_model.infer_vector`` (token lists in this notebook).

        Returns:
            ``np.ndarray`` built from the inferred vectors, in input order.
        """
        infer = self.doc2vec_model.infer_vector
        return np.array([infer(document) for document in X])

    def fit(self, X_train, y_train):
        """Embed the training documents and fit the logistic regression.

        Returns:
            ``self``, so that calls can be chained.
        """
        self.lr.fit(self.load_embeddings(X_train), y_train)
        return self

    def predict(self, X_test):
        """Embed the documents and return the classifier's ``predict`` output."""
        return self.lr.predict(self.load_embeddings(X_test))

    def predict_proba(self, X_test):
        """Embed the documents and return the classifier's ``predict_proba`` output."""
        return self.lr.predict_proba(self.load_embeddings(X_test))

In [10]:
%%time

# Run the same clean-then-split pipeline over the training and validation
# texts (training first, validation second).
train_tokens, valid_tokens = (
    tokenization(preprocessing(split_texts))
    for split_texts in (train_texts, valid_texts)
)

Wall time: 6.18 s


In [27]:
%%time

# One TaggedDocument per training comment, tagged with its position in
# train_tokens.
train_docs = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(train_tokens)
]

# Build the vocabulary and train as explicit steps on the same corpus.
# NOTE(review): `workers` is left at the library default; multi-threaded
# training is presumably not bit-for-bit reproducible — confirm whether exact
# reruns matter here.
doc2vec_model = Doc2Vec(vector_size=300, min_count=1, epochs=5)
doc2vec_model.build_vocab(train_docs)
doc2vec_model.train(
    train_docs,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

Wall time: 2min 37s


In [28]:
%%time

# Wrap the trained Doc2Vec model (C left at its default of 1.0) and fit on the
# training tokens. fit() first infers a vector for every training document via
# infer_vector, then fits the logistic regression on those vectors.
lr_doc2vec = LR_Doc2Vec(doc2vec_model)
lr_doc2vec.fit(train_tokens, y_train)

Wall time: 3min 39s


In [29]:
%%time

# Keep the second predict_proba column (presumably P(label == 1) — confirm
# against lr.classes_) and score it against the held-out labels with ROC AUC.
y_pred_lr_doc2vec = lr_doc2vec.predict_proba(valid_tokens)[:, 1]
print(roc_auc_score(y_valid, y_pred_lr_doc2vec))

0.6771911698751345
Wall time: 45.4 s
