#### A try to run doc2vec model from https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [4]:
import os

import pandas as pd
import gensim
from gensim.models import Doc2Vec

from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

In [49]:
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # will hold all docs in original order
for split in ['test', 'train']:
    for label in ['pos', 'neg']:
        path='/home/katia/Documents/MLWorkshops2017/aclImdb/' + split+ '/' + tag + '/'
        for number, file in enumerate(os.listdir(path)):
            file = open(path+file, 'r')
            text = file.read()
            words = gensim.utils.to_unicode(text).split()
            tags = [number]
            sentiment = 1.0 if label == 'pos' else 0.0
            alldocs.append(SentimentDocument(words, tags, split, sentiment))
doc_list = alldocs[:]

In [50]:
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']

In [51]:
sum([doc.sentiment for doc in train_docs])

12500.0

In [52]:
len(doc_list), len(train_docs), len(test_docs)

(50000, 25000, 25000)

In [53]:
import multiprocessing

multiprocessing.cpu_count()

4

In [54]:
# PV-DBOW
train_model = Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=4)

In [63]:
train_model.build_vocab(doc_list)

Helper methods for evaluating error rate.

In [56]:
import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0.1)
    #print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    print(len(train_targets), len(train_regressors)), 
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [57]:
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

In [64]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 5)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())
name="Doc2Vec(dm/c,d100,n5,w5,mc2,t4)"
for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results
    
    # train
    duration = 'na'
    train_model.alpha, train_model.min_alpha = alpha, alpha
    with elapsed_timer() as elapsed:
        train_model.train(doc_list, total_examples=train_model.corpus_count, epochs=train_model.iter)
        duration = '%.1f' % elapsed()
        
    # evaluate
    eval_duration = ''
    with elapsed_timer() as eval_elapsed:
        err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
    eval_duration = '%.1f' % eval_elapsed()
    best_indicator = ' '
    if err <= best_error[name]:
        best_error[name] = err
        best_indicator = '*' 
    print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

    if ((epoch + 1) % 5) == 0 or epoch == 0:
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if infer_err < best_error[name + '_inferred']:
            best_error[name + '_inferred'] = infer_err
            best_indicator = '*'
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

START 2017-07-05 15:43:05.234123
25000 25000
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1
*0.500000 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t4) 101.6s 0.5s
25000 25000
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1
*0.487200 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t4)_inferred 101.6s 5.3s
completed pass 1 at alpha 0.025000
25000 25000
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1
*0.500000 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t4) 108.8s 0.4s
completed pass 2 at alpha 0.020200
25000 25000
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1
*0.500000 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t4) 112.5s 0.5s
completed pass 3 at alpha 0.015400
25000 25000
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1
*0.500000 : 4 p