In [19]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import defaultdict

In [3]:
import os

import gensim

# Location of the pre-trained word2vec binary. Set the W2V_PATH environment
# variable to point at a different copy; when it is unset the original
# hard-coded location is used, so existing runs behave as before.
W2V_PATH = os.environ.get(
    "W2V_PATH",
    "/Users/janfan/Downloads/GoogleNews-vectors-negative300-SLIM.bin.gz",
)

# Load the vectors from the word2vec binary format.
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [26]:
class MyTokenizer:
    """Stateless transformer that turns raw documents into word tokens.

    Each document is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``; the tokens of all
    sentences are concatenated in order.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the tokenizer has no fitted state."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Parameters
        ----------
        X : iterable
            Documents to tokenize; each item is passed to
            ``nltk.sent_tokenize``.

        Returns
        -------
        numpy.ndarray
            1-D array of dtype ``object`` with one entry per document. Each
            entry is itself a 1-D array holding that document's tokens.
        """
        tokenized_docs = [
            np.array([
                token
                for sent in nltk.sent_tokenize(document)
                for token in nltk.word_tokenize(sent)
            ])
            for document in X
        ]
        # Documents have different token counts, so the outer container must be
        # an explicit object array. Passing the ragged list straight to
        # np.array() raises on recent NumPy versions and, when all documents
        # happen to have the same length, silently yields a 2-D string array
        # instead of one entry per document.
        result = np.empty(len(tokenized_docs), dtype=object)
        for idx, tokens in enumerate(tokenized_docs):
            result[idx] = tokens
        return result

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)

    
class TfidfEmbeddingVectorizer(object):
    """Embed tokenized documents as IDF-weighted averages of word vectors.

    ``fit`` learns one IDF weight per token from the training documents.
    ``transform`` multiplies each in-vocabulary token's word vector by its
    weight and averages the results over the document's tokens.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``word in word2vec``, ``word2vec[word]`` and exposing
        a ``vectors`` sequence whose first element has the embedding length.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Populated by fit(): token -> IDF weight, with max IDF as the default.
        self.word2weight = None
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.vectors[0])

    def fit(self, X, y=None):
        """Learn per-token IDF weights from already-tokenized documents.

        Parameters
        ----------
        X : iterable of token sequences
            One sequence of tokens per document.
        y : ignored
            Accepted (and optional) for scikit-learn style compatibility.

        Returns
        -------
        self
        """
        # The documents are already token sequences, so both the preprocessor
        # and the tokenizer are identity functions.
        tfidf = TfidfVectorizer(analyzer='word',
                                tokenizer=lambda x: x,
                                preprocessor=lambda x: x,
                                token_pattern=None)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one embedding row per document in ``X``.

        Tokens missing from ``word2vec`` are skipped. A document with no
        usable tokens maps to a zero vector of length ``self.dim``.

        Parameters
        ----------
        X : iterable of token sequences

        Returns
        -------
        numpy.ndarray
            Array built from one averaged vector per document.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        weights = self.word2weight
        if weights is None:
            raise ValueError(
                "TfidfEmbeddingVectorizer is not fitted; call fit() first")
        # Look the fallback weight up once and use .get() below: indexing the
        # defaultdict directly would insert a new key for every token that was
        # not in the fitted vocabulary, mutating fitted state on each call.
        unseen_weight = weights.default_factory()
        empty_doc = [np.zeros(self.dim)]
        return np.array([
            np.mean([self.word2vec[w] * weights.get(w, unseen_weight)
                     for w in words if w in self.word2vec] or empty_doc,
                    axis=0)
            for words in X
        ])

In [4]:
import pandas as pd, numpy as np
# Read the training set, the test set and the sample submission, in that order.
train, test, subm = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [16]:
# Tokenize the comment text of both splits; a fresh tokenizer is used for each
# split, which is harmless because MyTokenizer keeps no fitted state.
train_tokens, test_tokens = (
    MyTokenizer().fit_transform(frame['comment_text'])
    for frame in (train, test)
)

# Persist the token arrays (np.save appends the ".npy" extension).
np.save('data/train', train_tokens)
np.save('data/test', test_tokens)

In [27]:
# Build the vectorizer and learn the per-token IDF weights from the training
# tokens. fit() returns the vectorizer itself, not embeddings, so the result is
# kept under the vectorizer's name instead of being bound to `train_embedded`.
embedding_vectorizer = TfidfEmbeddingVectorizer(w2v).fit(train_tokens, None)

In [28]:
# Embed the training tokens first, then the test tokens, with the fitted
# vectorizer.
train_embedded, test_embedded = (
    embedding_vectorizer.transform(tokens)
    for tokens in (train_tokens, test_tokens)
)

In [29]:
from sklearn.linear_model import LogisticRegression

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
n_labels = len(label_cols)

# One column of predicted probabilities per label, one row per test document.
preds = np.zeros((len(test), n_labels))
for col_idx in range(n_labels):
    label = label_cols[col_idx]
    # An independent classifier is trained for each label column.
    classifier = LogisticRegression().fit(train_embedded, train[label])
    # Keep column 1 of predict_proba, the probability of the second class
    # (presumably label == 1; assumes the label columns are binary 0/1).
    preds[:, col_idx] = classifier.predict_proba(test_embedded)[:, 1]

In [30]:
# Put the ids from the sample submission next to the predicted probabilities
# and write the result out without the index column.
submid = subm[['id']]
pred_frame = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, pred_frame], axis=1)
submission.to_csv('submission/submission-w2v-doctfidf-lr.csv', index=False)