In [6]:
import json
import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.metrics import f1_score
from nltk.stem.porter import *

# NOTE: the project modules imported below must be on your PYTHONPATH.
# On Linux, run `export PYTHONPATH=${PYTHONPATH}:/path/to/doc2vec/`.
# On Windows, see https://stackoverflow.com/questions/3701646/how-to-add-to-the-pythonpath-in-windows
from preprocessing import Preprocessor
from data_loader import DataLoader
from doc2vec  import Doc2VecWrapper
from trainer import Trainer, Metrics

#nltk.download('wordnet')
#nltk.download('stopwords')

In [2]:
if __name__ == "__main__":  
    # Load the data.
    loader = DataLoader()
    with open('settings.json', 'r') as settingsJ:
        settings = json.load(settingsJ)
        train_set, test_set = loader.load_dataset(settings) 
        # Cut it down in size to speed up the test.
        train_set = train_set.iloc[:50]
        test_set = test_set.iloc[:50]

In [3]:
# Build a single preprocessor from the settings and run both splits through
# it: the training split first, then the test split.
preprocesor = Preprocessor(settings)
train_df, test_df = (
    preprocesor.preprocess(split) for split in (train_set, test_set)
)

INFO:root:Using WordTokenizer as tokenizer
INFO:root:Using default lematization
INFO:root:Using default stemming
INFO:root:Using default stopwords_remove


In [4]:
# Fit the doc2vec model on the preprocessed training split, then compute
# doc2vec features for the training and test splits (in that order).
d2v = Doc2VecWrapper()
d2v.train(train_df, settings)

train_vector, test_vector = map(d2v.doc2vec_features, (train_df, test_df))

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:collected 2594 word types and 50 unique tags from a corpus of 50 examples and 9751 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=2 retains 1151 unique words (44% of original 2594, drops 1443)
INFO:gensim.models.word2vec:effective_min_count=2 leaves 8308 word corpus (85% of original 9751, drops 1443)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 2594 items
INFO:gensim.models.word2vec:sample=0 downsamples 0 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 8308 word corpus (100.0% of prior 8308)
INFO:gensim.models.base_any2vec:estimated required memory for 1151 words and 5000 dimensions: 47615500 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.doc2vec:collecting all

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 13 : training on 9751 raw words (8358 effective words) took 0.2s, 54405 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 14 : training on 9751 raw words (8358 effective words) took 0.1s, 57399 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.b

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 9 : training on 9751 raw words (8358 effective words) took 0.3s, 30250 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 10 : training on 9751 raw words (8358 effective words) took 0.3s, 29665 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.ba

In [5]:
# Train and evaluate classification model.
# The trainer and the metric evaluator are both configured from `settings`.
t = Trainer(settings)
m = Metrics.MetricEvaluator(settings)

# Fit on the doc2vec vectors of the training split. The un-preprocessed
# `train_set` is passed alongside them (presumably as the source of the
# target labels -- confirm against Trainer.fit).
t.fit(train_vector, train_set)
pred = t.predict(test_vector)
# Score the predictions against the test split.
results = m.evaluate(test_set, pred)

# In the recorded run this prints a list of (model name, score) pairs.
print(results)

INFO:root:Training m1
INFO:root:Training m2
INFO:root:Using f1_score metric
INFO:root:Evaluating m1
INFO:root:Evaluating m2


[('m1', 0.4000000000000001), ('m2', 0.36)]
