In [1]:
import pickle
import gensim
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
import time
import preprocess
from itertools import product

In [2]:
# Read the three competition CSVs (training set, test set, sample submission)
# in a single unpacking; the paths are relative to the notebook's working directory.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

In [3]:
# Load the preprocessed corpus from the project-local `preprocess` module.
# Later cells read data['train_tokens'] and data['test_tokens'] from it.
data = preprocess.load()

In [17]:
# Wrap each training token list in a TaggedDocument whose single tag encodes
# its position in data['train_tokens'] ('TRAIN_SENT <position>').
docs = [
    gensim.models.doc2vec.TaggedDocument(words, ['TRAIN_SENT %s' % position])
    for position, words in enumerate(data['train_tokens'])
]

In [18]:
# Build a Doc2Vec model from the tagged training documents; no hyper-parameters
# are passed, so the library defaults apply.
# NOTE(review): no seed is set here, so results may differ between runs — confirm
# whether reproducibility matters for the submissions below.
model = gensim.models.doc2vec.Doc2Vec(docs)
# Persist the model so later sessions can reload it instead of rebuilding.
model.save('data/doc2vec')

In [24]:
# Sanity check on a single example: infer a vector for the first test document
# and fetch the 10 closest stored document vectors.
v = model.infer_vector(data['test_tokens'][0])
# `sims` is consumed by later cells as (tag, score) pairs.
sims = model.docvecs.most_similar([v], topn=10)

In [61]:
# Nearest-neighbour baseline, "hits >= 1" variant: a label is predicted as 1
# when at least one of the `topn` most similar training documents carries it,
# otherwise 0.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
topn = 10  # number of neighbouring training documents consulted per test document

# Extract the label matrix once; slicing the DataFrame for every neighbour
# inside the loop is loop-invariant work and very slow.
train_labels = train[label_cols].values

preds = np.zeros((len(test), len(label_cols)))
for k, doc in enumerate(data['test_tokens']):
    v = model.infer_vector(doc)
    sims = model.docvecs.most_similar([v], topn=topn)
    # Tags have the form 'TRAIN_SENT <position>'; recover the positional row
    # number of each neighbour (the score is not needed for this variant).
    neighbour_rows = [int(tag.split()[1]) for tag, _ in sims]
    # Per-label count of neighbours that carry the label.
    hits = train_labels[neighbour_rows].sum(axis=0)
    preds[k, :] = np.where(hits >= 1, 1, 0)

# Assemble the submission: ids from the sample submission next to the predictions.
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission/submission-doc2vec-0or1.csv', index=False)

In [65]:
# Nearest-neighbour baseline, "percentage" variant: each label's score is the
# fraction of the `topn` most similar training documents that carry the label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
topn = 10  # neighbours per test document; also the divisor for the fraction below

# Extract the label matrix once; slicing the DataFrame for every neighbour
# inside the loop is loop-invariant work and very slow.
train_labels = train[label_cols].values

preds = np.zeros((len(test), len(label_cols)))
for k, doc in enumerate(data['test_tokens']):
    v = model.infer_vector(doc)
    sims = model.docvecs.most_similar([v], topn=topn)
    # Tags have the form 'TRAIN_SENT <position>'; recover the positional row
    # number of each neighbour (the score is not needed for this variant).
    neighbour_rows = [int(tag.split()[1]) for tag, _ in sims]
    # Per-label count of neighbours that carry the label.
    hits = train_labels[neighbour_rows].sum(axis=0)
    # Divide by the same `topn` that was requested, using an explicit float so
    # the fraction is never truncated by integer division.
    preds[k, :] = hits / float(topn)

# Assemble the submission: ids from the sample submission next to the predictions.
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission/submission-doc2vec-percentage.csv', index=False)

In [21]:
# Display the embedding stored for the first training document.
# NOTE(review): `train_embedded` is assigned in a cell that appears further down
# in this notebook, so this cell fails with NameError on a fresh top-to-bottom run.
train_embedded[0]

array([ 0.03369045, -0.02747776,  0.03844769,  0.05061621,  0.10103497,
       -0.04662947,  0.06483382, -0.05166072,  0.11619633, -0.06861715,
       -0.07906462, -0.05022816,  0.08388603, -0.08241028,  0.0599994 ,
       -0.05640804,  0.00133157, -0.05157386,  0.09211881, -0.00694055,
        0.08366387,  0.11108589, -0.01944638, -0.00126969,  0.00251589,
       -0.17428024, -0.00572306,  0.03241219, -0.01736025, -0.07372978,
       -0.00986082,  0.03882933,  0.02748811, -0.04585944,  0.10010006,
        0.03222277,  0.0610371 ,  0.11278611, -0.01364839, -0.02170032,
       -0.00025584, -0.04807695,  0.09582652, -0.10436545, -0.05588139,
       -0.00790225, -0.05490515, -0.04990567, -0.01148711,  0.06806511,
       -0.07195638,  0.04679437, -0.02065961,  0.02318708,  0.02759757,
       -0.02615554, -0.10299507,  0.00969942,  0.02318759, -0.03136772,
       -0.07193238,  0.02813243,  0.13022809,  0.04930704, -0.0454647 ,
       -0.02204282, -0.00842689,  0.07592494, -0.02911519,  0.08

In [4]:
# Reload the Doc2Vec model from 'data/doc2vec' (the path the training cell saves to)
# so the embedding cells below can run without retraining.
model = gensim.models.doc2vec.Doc2Vec.load('data/doc2vec')

In [10]:
# Inspect the shape of one stored document vector (i.e. the embedding width).
model.docvecs[0].shape

(100,)

In [12]:
# Embed every test document by inferring a vector from the trained model.
# The matrix width is taken from the model instead of a hard-coded 100, so this
# cell keeps working if the model is rebuilt with a different vector size.
test_embedded = np.zeros((len(test), model.vector_size))
for k, doc in enumerate(data['test_tokens']):
    # Row k holds the inferred vector for the k-th test token list.
    test_embedded[k, :] = model.infer_vector(doc)

In [19]:
# Collect the document vectors stored in the model into one matrix.
# The matrix width is taken from the model instead of a hard-coded 100, so this
# cell keeps working if the model is rebuilt with a different vector size.
train_embedded = np.zeros((len(train), model.vector_size))
for k in range(len(train)):
    # Integer lookup into docvecs; presumably position k corresponds to row k of
    # `train` because documents were tagged in enumeration order — TODO confirm.
    train_embedded[k, :] = model.docvecs[k]

In [22]:
# One independent logistic-regression classifier per label, trained on the
# document embeddings; column `col_idx` of `preds` receives the predicted
# probability of the positive class for that label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((len(test), len(label_cols)))
for col_idx, label in enumerate(label_cols):
    classifier = LogisticRegression().fit(train_embedded, train[label])
    positive_proba = classifier.predict_proba(test_embedded)[:, 1]
    preds[:, col_idx] = positive_proba

In [23]:
# Build the submission table: ids from the sample submission, followed by one
# column of predictions per label, then write it out without the index.
submid = pd.DataFrame({'id': subm["id"]})
prediction_frame = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, prediction_frame], axis=1)
submission.to_csv('submission/submission-doc2vec-lr.csv', index=False)