In [159]:
# IMPORTS
import glob
import numpy as np
import subprocess

from __future__ import print_function
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from nltk.stem import PorterStemmer
from scipy.stats import binom


# CONSTANTS
NUM_FOLDS = 10
ACL_PATH = "aclImdb_v1/aclImdb/{}/*"
ACL_FOLDER_PATHS = [ACL_PATH.format("train/unsup"),
                    ACL_PATH.format("train/pos"),
                    ACL_PATH.format("train/neg"),
                    ACL_PATH.format("test/pos"),
                    ACL_PATH.format("test/neg")]
PANG_POS_PATH = "POS-tokenized/POS/*"
PANG_NEG_PATH = "NEG-tokenized/NEG/*"
SEED = 0
NUM_FOLDS = 10
POS = 1
NEG = -1

# Functions

### Fetch Data

In [72]:
def _get_docs_from_folder(folder_path, pretokenised):
    files = glob.glob(folder_path)
    folder_documents = []
    for file_name in files:
        with open(file_name) as fp:
            if pretokenised:
                doc_words = [word.strip("\n") for word in fp.readlines()]
                folder_documents.append(doc_words)
            else:
                # TODO: Use tokeniser: https://www.nltk.org/api/nltk.tokenize.html
                document = fp.read()
                doc_words = document.split(" ")
                folder_documents.append(doc_words)
    return folder_documents

In [73]:
def get_embeddings_training_set(folder_paths_list):
    all_docs = []
    for folder_path in folder_paths_list:
        folder_documents = _get_docs_from_folder(folder_path, 
                                                 pretokenised=False)
        all_docs.extend(folder_documents)
    return all_docs


In [119]:
def get_pang_dataset(pos_path, neg_path, stemming, presence, bigrams, cutoff):
    pos_data = _get_docs_from_folder(pos_path, pretokenised=True)
    neg_data = _get_docs_from_folder(neg_path, pretokenised=True)
    
    validation_set = []
    test_and_training_docs = []
    for i, pos_and_neg_doc in enumerate(zip(pos_data, neg_data)):
        pos_doc, neg_doc = pos_and_neg_doc
        # Apply any transformations.
        if stemming:
            porter_stemmer = PorterStemmer()
            pos_doc = _apply_stemming(porter_stemmer, pos_doc)
            neg_doc = _apply_stemming(porter_stemmer, neg_doc)        
        if bigrams:
            pos_doc = _unigrams_to_bigrams(pos_doc)
            neg_doc = _unigrams_to_bigrams(neg_doc)
        if cutoff > 0:
            pos_doc = _apply_feature_cutoff(pos_doc, cutoff)
            neg_doc = _apply_feature_cutoff(neg_doc, cutoff)
        if presence:
            pos_doc = set(pos_doc)
            neg_doc = set(neg_doc)
        
        # Obtain validation set.
        if i%10 == 0:
            validation_set.append((pos_doc, POS))
            validation_set.append((neg_doc, NEG))
        else:
            test_and_training_docs.append((pos_doc, neg_doc))
        
    # Split remaining data into folds.
    data_set = {}
    for fold_index in range(NUM_FOLDS):
        data_set[fold_index] = []
    for i, (pos_doc, neg_doc) in enumerate(test_and_training_docs):
        data_set[i%NUM_FOLDS].append((pos_doc, POS))
        data_set[i%NUM_FOLDS].append((neg_doc, NEG))
    return validation_set, data_set

def _apply_stemming(porter_stemmer, doc):
    return [porter_stemmer.stem(word) for word in doc]

def _unigrams_to_bigrams(doc):
    return [word1 + word2 
            for word1, word2 
            in zip(doc[:-1], doc[1:])]

def _apply_feature_cutoff(doc, cutoff):
    token_count = dict()
    for word in doc:
        if word not in token_count:
            token_count[word] = 0
        token_count[word] = token_count[word] + 1
    for word in doc:
        if token_count[word] < cutoff:
            doc.remove(word)
    return doc

### Doc2Vec Classification

In [145]:
def train_doc2vec_model(training_set, epochs=10):
    documents = [TaggedDocument(doc, [i]) 
                 for i, doc in enumerate(training_set)]
    model = Doc2Vec(documents, seed=SEED, dbow_words=1, 
                    epochs=epochs, workers=4)
    return model

In [138]:
def store_docs_as_embeddings(dataset, doc2vec_model, name_of_file):
    with open(name_of_file, "w+") as fp:
        for document in dataset:
            doc_words, sentiment = document
            feature_vector = doc2vec_model.infer_vector(doc_words)
            feature_values = ["{}:{}".format(index + 1, value)
                              for index, value 
                              in enumerate(feature_vector)]
            line = "{} {}\n".format(sentiment, 
                                  " ".join(feature_values))
            fp.write(line)

In [168]:
def run_svm(training_set, test_set, doc2vec_model):
    training_file_name = "training_embeddings.txt"
    test_file_name = "test_embeddings.txt"
    model_file_name = "model_file.txt"
    predictions_file_name = "predictions.txt"
    
    if (doc2vec_model == None):
        print("ERROR: SVM with counts not yet supported!")
        print("Please spcify model.")
        return
    else:
        store_docs_as_embeddings(training_set, doc2vec_model,
                                 training_file_name)
        store_docs_as_embeddings(test_set, doc2vec_model, 
                                 test_file_name)
    # TRAIN
    subprocess.call(["svm_light/svm_learn", 
                     training_file_name, 
                     model_file_name])
    
    # CLASSIFY
    subprocess.call(["svm_light/svm_classify",
                     test_file_name,
                     model_file_name,
                     predictions_file_name])
    
    # TEST
    accuracy_count = 0
    with open(predictions_file_name) as pred_fp:
        for test_doc in test_set:
            doc_words, doc_sentiment = test_doc
            result = float(pred_fp.readline())
            if np.sign(result) == doc_sentiment:
                accuracy_count += 1
    return float(accuracy_count) / len(test_set) * 100

### Permutation Test

In [121]:
def permutation_test():
    pass

# Executing Code

In [100]:
# GET DATA SETS
print("Loading ACL training set...")
embeddings_training_set = get_embeddings_training_set(
    ACL_FOLDER_PATHS)
print("Fetched {} docs.".format(len(embeddings_training_set)))

print("Loading Pang validation set...")
validation_set, pang_folds = get_pang_dataset(
    PANG_POS_PATH, PANG_NEG_PATH, 
    stemming=False, presence=False, bigrams=False, cutoff=0)
print("Fetched {} validation docs and ".format(len(validation_set)))
print("{} folds with {} docs each.".format(len(pang_folds), 
                                            len(pang_folds[0])))

Loading ACL training set...
Fetched 100000 docs.
Loading Pang validation set...
Fetched 200 validation docs and 
10 folds with 180 docs each.


In [171]:
# TRAIN DOC2VEC MODEL
print("Training Doc2Vec model...")
doc2vec_model = train_doc2vec_model(embeddings_training_set,
                                    epochs=20)
print(model)

Training Doc2Vec model...
Doc2Vec(dm/m,d100,n5,w5,mc5,s0.001,t3)


In [170]:
# TEST ON VALIDATION SET
print("Running on validation set...")
temp_training_set = [doc for fold in pang_folds.values()
                    for doc in fold]
accuracy = run_svm(temp_training_set, validation_set, 
                   doc2vec_model)
print("The accuracy is {}%.".format(accuracy))

Running on validation set...
The accuracy is 82.5%.


### //TODO:
* Implement permutation test.
* Find visualisations for vectors.
* Come up with interesting research question.