In [1]:
import nltk
import gensim
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import defaultdict

In [2]:
class MyTokenizer:
    """Scikit-learn style transformer that splits raw documents into word tokens.

    Every document is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``; the tokens of all
    sentences are concatenated in order. The transformer keeps no state.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op kept for pipeline compatibility; returns ``self``."""
        return self

    def _tokenize_document(self, document):
        """Return the flat list of word tokens of a single document."""
        tokens = []
        for sent in nltk.sent_tokenize(document):
            tokens.extend(nltk.word_tokenize(sent))
        return tokens

    def transform(self, X):
        """Tokenize every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        numpy.ndarray
            1-D array of dtype ``object`` with one entry per document; each
            entry is itself a 1-D array holding that document's tokens.
        """
        docs = [np.array(self._tokenize_document(document)) for document in X]
        # Documents have different token counts, so the result is ragged.
        # Calling np.array() on a ragged list raises ValueError on NumPy >= 1.24
        # (and silently yields a 2-D array when all lengths happen to match),
        # so fill a pre-allocated object array element by element instead.
        out = np.empty(len(docs), dtype=object)
        for idx, doc in enumerate(docs):
            out[idx] = doc
        return out

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

In [3]:
import numpy as np
import pandas as pd

# Read the three CSV inputs: training set, test set and sample submission.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
subm = pd.read_csv('data/sample_submission.csv')

In [6]:
def _load_or_tokenize(path, texts):
    """Load cached tokens from ``path``; tokenize ``texts`` and cache them if missing.

    Parameters
    ----------
    path : str
        Location of the ``.npy`` cache file.
    texts : iterable of str
        Raw documents to tokenize when the cache file does not exist.

    Returns
    -------
    numpy.ndarray
        The token array, either read from disk or freshly computed.
    """
    try:
        with open(path, 'rb') as f:
            # The cache is expected to hold an object array (one token array per
            # document, as produced by MyTokenizer). NumPy >= 1.16.3 refuses to
            # load object arrays unless allow_pickle=True is passed.
            # NOTE: unpickling executes arbitrary code -- only load cache files
            # that were produced locally by this notebook.
            return np.load(f, allow_pickle=True)
    except FileNotFoundError:
        # No cache yet: compute the tokens and persist them for the next run so
        # the notebook works from a fresh checkout.
        tokens = MyTokenizer().fit_transform(texts)
        with open(path, 'wb') as f:
            np.save(f, tokens, allow_pickle=True)
        return tokens


train_tokens = _load_or_tokenize('data/train.npy', train['comment_text'])
test_tokens = _load_or_tokenize('data/test.npy', test['comment_text'])

In [30]:
def array_to_list(array):
    """Recursively replace NumPy arrays with plain Python containers.

    Arrays (including arrays nested inside lists, tuples or object arrays)
    become lists; lists stay lists and tuples stay tuples, with their items
    converted the same way. Any other value is returned unchanged.
    """
    # Unwrap arrays first; tolist() on an object array can still hand back
    # arrays, which the container branches below take care of.
    while isinstance(array, np.ndarray):
        array = array.tolist()

    if isinstance(array, list):
        return list(map(array_to_list, array))
    if isinstance(array, tuple):
        return tuple(map(array_to_list, array))
    return array

In [36]:
# Word2Vec constructor arguments and defaults (gensim 3.x names):
#   sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
#   sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
#   cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None,
#   sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=()
# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`.
sg = 1          # 1 selects skip-gram, anything else CBOW (see the model name cell)
size = 300      # dimensionality of the word vectors
window = 10
min_count = 3

# Pick the keyword that the installed gensim understands, so the cell runs on
# both gensim 3.x (`size`) and gensim >= 4.0 (`vector_size`).
_size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = gensim.models.word2vec.Word2Vec(
    array_to_list(train_tokens),
    sg=sg, window=window, min_count=min_count,
    **{_size_kwarg: size}
)

In [37]:
# Encode the training settings in the file name so saved models are
# distinguishable: algorithm, vector size, window and min_count.
model_name = '{0}-sz{1}-win{2}-minc{3}'.format(
    'cbow' if sg != 1 else 'sg',
    size,
    window,
    min_count,
)
model.save('data/w2v-' + model_name + '.model')

In [38]:
class MeanEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of its word vectors.

    ``word2vec`` must support ``word in word2vec``, ``word2vec[word]`` and
    expose a ``vectors`` sequence (the notebook passes ``model.wv``).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Dimensionality is read off the first stored vector; it is needed to
        # build the all-zero fallback for documents without any known word.
        first_vector = word2vec.vectors[0]
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """No-op kept for pipeline compatibility; returns ``self``."""
        return self

    def _embed(self, words):
        """Average the vectors of the known words of one document."""
        known = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known:
            # Empty document or only out-of-vocabulary words -> zero vector.
            known = [np.zeros(self.dim)]
        return np.mean(known, axis=0)

    def transform(self, X):
        """Return an array with one mean embedding per document in ``X``."""
        return np.array([self._embed(words) for words in X])

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

In [40]:
# Turn every tokenized document into the mean of its word vectors, using the
# trained model's keyed vectors; train is processed first, then test.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(model.wv)
train_mean_embedded, test_mean_embedded = [
    mean_embedding_vectorizer.transform(tokens)
    for tokens in (train_tokens, test_tokens)
]

In [45]:
from sklearn.linear_model import LogisticRegression

# Target columns read from `train`; an independent classifier is fit for each.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# One row per test document, one column per label, filled in by the loop below.
preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):  # i: column index into preds, j: label name
    m = LogisticRegression()
    mf = m.fit(train_mean_embedded, train[j])
    # Keep the second column of predict_proba -- presumably the probability of
    # the positive class, assuming the labels are 0/1 (verify against the data).
    preds[:,i] = mf.predict_proba(test_mean_embedded)[:,1]

In [46]:
# Assemble the submission: ids from the sample submission, followed by one
# probability column per label, then write it out without the index.
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat(
    objs=[submid, pd.DataFrame(data=preds, columns=label_cols)],
    axis=1,
)
submission.to_csv(
    'submission/submission-toxicw2v-docmean-lr-{}.csv'.format(model_name),
    index=False,
)