In [1]:
import os
import math
import re

In [2]:
import gensim
import numpy as np

In [3]:
def preprocessor(text):
    """Normalise a raw review into lowercase words followed by its emoticons.

    Processing steps:

    1. Remove anything that looks like a markup tag (``<...>``).
    2. Collect emoticons made of eyes (``:``, ``;``, ``=``), an optional
       ``-`` nose and a mouth (``)``, ``(``, ``D``, ``P``).
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the emoticons (noses removed) after the words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated words, then space-separated emoticons, with no
        leading or trailing whitespace.
    """
    # Raw strings: '\)' and '\W' are regex escapes, not Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    words = re.sub(r'[\W]+', ' ', text.lower()).strip()
    faces = ' '.join(emoticons).replace('-', '')

    # Join with an explicit separator so the first emoticon is never glued
    # onto the last word when the text ends in a word character.
    return ' '.join(part for part in (words, faces) if part)

In [4]:
basepath = './aclImdb'
def read_corpus(train_or_test='train'):
    """Yield ``(preprocessed_text, label)`` pairs for one split of the corpus.

    Files are read from ``<basepath>/<train_or_test>/pos`` and then
    ``<basepath>/<train_or_test>/neg``. Within each directory the files are
    visited in sorted name order so the sample order is the same on every
    run and every machine.

    Parameters
    ----------
    train_or_test : str, default 'train'
        Name of the sub-directory of ``basepath`` to read.

    Yields
    ------
    tuple of (str, str)
        The file content passed through ``preprocessor`` and the label,
        which is the directory name ``'pos'`` or ``'neg'``.
    """
    for label in ('pos', 'neg'):
        label_dir = os.path.join(basepath, train_or_test, label)
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for filename in sorted(os.listdir(label_dir)):
            with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as infile:
                yield (preprocessor(infile.read()), label)

In [5]:
# Read the whole training split and transpose the (text, label) pairs
# into one tuple of texts and one tuple of labels.
X_train, y_train = zip(*read_corpus(train_or_test='train'))
len(y_train)

25000

In [6]:
# Read the whole test split and transpose the (text, label) pairs
# into one tuple of texts and one tuple of labels.
X_test, y_test = zip(*read_corpus(train_or_test='test'))
len(y_test)

25000

In [7]:
# Load word vectors stored in binary word2vec format from the working directory.
# NOTE(review): presumably the pretrained Google News vectors - confirm the file's provenance.
gw2v_model = gensim.models.KeyedVectors.load_word2vec_format('googleWord2Vec.bin', binary=True)

In [24]:
def X2vec(X, w2v_model):
    """Represent each document as the sum of the vectors of its known tokens.

    Parameters
    ----------
    X : iterable of str
        Documents; each one is split on whitespace into tokens.
    w2v_model : mapping-like
        Embedding lookup supporting ``token in w2v_model`` and
        ``w2v_model[token]``. Tokens for which the membership test is false
        are skipped.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. A document with no known
        token gets a zero vector shaped like the other documents' vectors,
        so the result can always be stacked into a 2-D feature matrix.
        If no document contains a known token, the dimensionality cannot be
        inferred and every entry is a zero-length array.
    """
    doc_vectors = []
    for doc in X:
        known = [np.array(w2v_model[tok]) for tok in doc.split() if tok in w2v_model]
        # sum() over an empty sequence would return the scalar 0, which
        # yields a ragged feature list; mark such documents and fill below.
        doc_vectors.append(sum(known) if known else None)

    template = next((vec for vec in doc_vectors if vec is not None), None)
    filler = np.zeros(0) if template is None else np.zeros_like(template)
    return [filler.copy() if vec is None else vec for vec in doc_vectors]

In [9]:
# Embed both splits with the word2vec vectors (train first, then test).
X_train_gw2v, X_test_gw2v = (
    X2vec(docs, gw2v_model) for docs in (X_train, X_test)
)

In [34]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the word2vec document vectors.
# fit() returns the estimator itself, so the fitted model can be bound directly.
gw2v_clf = LogisticRegression(C=0.0003, random_state=0).fit(X_train_gw2v, y_train)
gw2v_clf

LogisticRegression(C=0.0003, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [35]:
# Report accuracy of the word2vec classifier on both splits, one per line.
print(
    'Train accuracy: %.3f' % gw2v_clf.score(X_train_gw2v, y_train),
    'Test accuracy: %.3f' % gw2v_clf.score(X_test_gw2v, y_test),
    sep='\n',
)

Train accuracy: 0.864
Test accuracy: 0.860


In [25]:
import fasttext
# Load a fastText model from the working directory; it is later passed to
# X2vec, which uses `in` and `[]` lookups on it.
# NOTE(review): presumably the pretrained English Wikipedia vectors - confirm the file's provenance.
fw2v_model = fasttext.load_model('wiki.en.bin')

In [26]:
# Embed both splits with the fastText vectors (train first, then test).
X_train_fw2v, X_test_fw2v = (
    X2vec(docs, fw2v_model) for docs in (X_train, X_test)
)

In [48]:
# Fit a logistic regression (default regularisation) on the fastText document vectors.
# fit() returns the estimator itself, so the fitted model can be bound directly.
fw2v_clf = LogisticRegression(random_state=0).fit(X_train_fw2v, y_train)
fw2v_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
# Report accuracy of the fastText classifier on both splits, one per line.
print(
    'Train accuracy: %.3f' % fw2v_clf.score(X_train_fw2v, y_train),
    'Test accuracy: %.3f' % fw2v_clf.score(X_test_fw2v, y_test),
    sep='\n',
)

Train accuracy: 0.854
Test accuracy: 0.839
