# Sentiment Analysis Using Doc2Vec

### We use Doc2Vec for sentiment analysis by attempting to classify the Cornell IMDB movie review corpus (http://www.cs.cornell.edu/people/pabo/movie-review-data/).

## Setup

In [None]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

#### Data:
* **test-neg.txt**: 12500 negative movie reviews from the test data
* **test-pos.txt**: 12500 positive movie reviews from the test data
* **train-neg.txt**: 12500 negative movie reviews from the training data
* **train-pos.txt**: 12500 positive movie reviews from the training data
* **train-unsup.txt**: 50000 unlabelled movie reviews

## Feeding Data to Doc2Vec

In [None]:
# Each sentence needs to be labeled in this format:
# [['word1', 'word2', 'word3', 'lastword'], ['label1']]
# This is done with LabeledSentence

In [None]:
class LabeledLineSentence(object):
    """Labeled documents read from one-document-per-line text files.

    ``sources`` maps a file path to the label prefix used for that file.
    Every line of a file becomes one ``LabeledSentence`` whose words are the
    whitespace-split tokens of the line and whose single label is
    ``'<prefix>_<line index>'`` (index starting at 0 within each file).

    Raises:
        ValueError: if two files share the same prefix, since their labels
            would then collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "not loaded yet".
        self.sentences = None

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one LabeledSentence per line, file by file."""
        # Read the instance's own mapping, not a same-named global variable.
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Load every document into ``self.sentences`` and return that list."""
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return the loaded documents as a new list in random order.

        The documents are loaded first if ``to_array()`` has not been called.
        ``self.sentences`` itself is left in its original order.
        """
        if self.sentences is None:
            self.to_array()
        # Permute indices rather than the records themselves: handing the
        # records to numpy would coerce them into an ndarray and discard
        # (or fail to build from) their original type.
        order = numpy.random.permutation(len(self.sentences))
        return [self.sentences[i] for i in order]

In [None]:
# Corpus files mapped to the label prefix given to each of their documents.
# Prefixes have to be distinct: LabeledLineSentence rejects duplicates.
sources = {
    'data/test-neg.txt': 'TEST_NEG',
    'data/test-pos.txt': 'TEST_POS',
    'data/train-neg.txt': 'TRAIN_NEG',
    'data/train-pos.txt': 'TRAIN_POS',
    'data/train-unsup.txt': 'TRAIN_UNS',
}
sentences = LabeledLineSentence(sources)

## Model - Building the Vocabulary Table

In [None]:
# model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
# model.build_vocab(sentences.to_array())

# Active configuration: single worker, 5-word window. The vocabulary is
# built from the corpus after it has been loaded into a list.
model = Doc2Vec(
    min_count=1,
    window=5,
    size=100,
    sample=1e-4,
    negative=5,
    workers=1,
)
model.build_vocab(sentences.to_array())

## Training Doc2Vec

In [None]:
# One training pass; the documents are put in random order before they are
# handed to the trainer.
for epoch in range(1):
    shuffled = sentences.sentences_perm()
    model.train(shuffled)

In [None]:
"""
# This will give better results for words/sentences that occur only once, because they will be trained with different learning rates
model = Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
model.build_vocab(sentences)
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    """

## Inspecting the Model

In [None]:
# Ask the model for the entries it considers most similar to the word 'good'.
model.most_similar('good')
# NOTE(review): the string literal below is presumably output pasted from an
# earlier run and kept for reference — confirm. As a bare string it has no
# effect beyond being the last expression of the cell.
"""
[(u'tekashi', 0.45127424597740173),
 (u'glamorous', 0.4344240427017212),
 (u'spectacular', 0.42718690633773804),
 (u'astounding', 0.42001062631607056),
 (u'valentinov', 0.41705751419067383),
 (u'sweetest', 0.4043062925338745),
 (u'complementary', 0.4039931297302246),
 (u'boyyyyy', 0.39713743329048157),
 (u'macdonaldsland', 0.3965899348258972),
 (u'elven', 0.39042729139328003)]
 """

In [None]:
# Same similarity query as for 'good', this time for the word 'bad'.
model.most_similar('bad')

In [None]:
# Look up the model entry stored under the label 'TRAIN_NEG_0', i.e. prefix
# 'TRAIN_NEG' plus line index 0 — the first line of data/train-neg.txt.
model['TRAIN_NEG_0']

## Save and Load Model

In [None]:
# Write the model to ./imdb.d2v in the current working directory.
model.save('./imdb.d2v')

In [None]:
# Load a model from ./imdb.d2v and rebind `model` to it.
model = Doc2Vec.load('./imdb.d2v')

## Classifying Sentiments

### Training vectors

In [None]:
# Feature matrix (one 100-dimensional row per training review) and labels.
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)

# Rows 0..12499 hold the positive reviews, rows 12500..24999 the negative ones.
for i in range(12500):
    train_arrays[i] = model['TRAIN_POS_' + str(i)]
    train_arrays[12500 + i] = model['TRAIN_NEG_' + str(i)]

# Label positives 1 and negatives 0, matching the row layout above.
train_labels[:12500] = 1
train_labels[12500:] = 0

In [None]:
# Call form of print: runs on both Python 2 and Python 3, whereas the bare
# print statement is a SyntaxError on Python 3.
print(train_arrays)

In [None]:
# Call form of print: runs on both Python 2 and Python 3, whereas the bare
# print statement is a SyntaxError on Python 3.
print(train_labels)

### Testing vectors

In [None]:
# Feature matrix (one 100-dimensional row per test review) and labels.
test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)

# Rows 0..12499 hold the positive reviews, rows 12500..24999 the negative ones.
for i in range(12500):
    test_arrays[i] = model['TEST_POS_' + str(i)]
    test_arrays[12500 + i] = model['TEST_NEG_' + str(i)]

# Label positives 1 and negatives 0, matching the row layout above.
test_labels[:12500] = 1
test_labels[12500:] = 0

### Classification

In [None]:
# Fit a logistic-regression classifier, with its default settings, on the
# training document vectors and their 0/1 sentiment labels.
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

In [None]:
# Score the fitted classifier on the test vectors and labels.
# NOTE(review): presumably mean accuracy — confirm against the scikit-learn docs.
classifier.score(test_arrays, test_labels)