In [15]:
import json
import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.metrics import f1_score
from nltk.stem.porter import *

from preprocessing import Preprocessor
from data_loader import DataLoader
from doc2vec  import Doc2VecWrapper
from trainer import Trainer, Metrics

#nltk.download('wordnet')
#nltk.download('stopwords')

In [16]:
# Load the run configuration and the dataset it describes.
#
# This cell runs at top level on purpose (no `if __name__ == "__main__":`
# guard): the cells below read `settings`, `train_set` and `test_set`
# unconditionally, so guarding only their definitions could achieve nothing
# except a NameError further down whenever the guard is false.
SAMPLE_SIZE = 500  # rows kept per split to speed up the test; None keeps every row

loader = DataLoader()

# Keep the file handle open only for as long as it takes to parse the JSON.
with open('settings.json', 'r') as settingsJ:
    settings = json.load(settingsJ)

train_set, test_set = loader.load_dataset(settings)

# Cut it down in size to speed up the test.
train_set = train_set.iloc[:SAMPLE_SIZE]
test_set = test_set.iloc[:SAMPLE_SIZE]

In [17]:
# Run both splits through one Preprocessor built from `settings`, so the
# training and test frames receive exactly the same configured steps.
# The training split is processed first, then the test split.
preprocesor = Preprocessor(settings)
train_df, test_df = (
    preprocesor.preprocess(split) for split in (train_set, test_set)
)

INFO:root:Using WordTokenizer as tokenizer
INFO:root:Using default lematization
INFO:root:Using default stemming
INFO:root:Using default stopwords_remove


In [18]:
# Fit the doc2vec wrapper on the preprocessed training frame, then use that
# same fitted wrapper to produce features for each split.
# NOTE(review): no random seed is set in this cell — confirm whether
# `settings` fixes one, otherwise the vectors may differ between runs.
d2v = Doc2VecWrapper()
d2v.train(train_df, settings)

# The training split is converted first, then the test split.
train_vector, test_vector = (
    d2v.doc2vec_features(frame) for frame in (train_df, test_df)
)

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:collected 9260 word types and 500 unique tags from a corpus of 500 examples and 89358 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=2 retains 5110 unique words (55% of original 9260, drops 4150)
INFO:gensim.models.word2vec:effective_min_count=2 leaves 85208 word corpus (95% of original 89358, drops 4150)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 9260 items
INFO:gensim.models.word2vec:sample=0 downsamples 0 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 85208 word corpus (100.0% of prior 85208)
INFO:gensim.models.base_any2vec:estimated required memory for 5110 words and 5000 dimensions: 216955000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.doc2vec:collec

INFO:gensim.models.base_any2vec:EPOCH 11 - PROGRESS: at 100.00% examples, 78827 words/s, in_qsize 0, out_qsize 1
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 11 : training on 89358 raw words (85708 effective words) took 1.1s, 78732 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:EPOCH 12 - PROGRESS: at 100.00% examples, 82214 words/s, in_qsize 0, out_qsize 1
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 12 : training on 89358 raw words (85708 effective words) took 1.0s, 82138 effective words/s
INFO:gensim.models.base_any2vec:worker thread finished; awai

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 5 : training on 89358 raw words (85708 effective words) took 1.7s, 50929 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 53.60% examples, 35679 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 6 : training on 89358 raw words (85708 effective words) took 1.7s, 50064 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 53.60% examples, 36220 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting f

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 19 : training on 89358 raw words (85708 effective words) took 1.8s, 46665 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 20 - PROGRESS: at 53.60% examples, 32357 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 20 : training on 89358 raw words (85708 effective words) took 1.9s, 45498 effective wo

In [19]:
# Build the trainer and the metric evaluator from the shared `settings`,
# fit on the training vectors, and score predictions for the test vectors.
t, m = Trainer(settings), Metrics.MetricEvaluator(settings)

# The raw (un-preprocessed) frames are passed alongside the vectors;
# presumably they carry the target labels — TODO confirm against Trainer.
t.fit(train_vector, train_set)
pred = t.predict(test_vector)

# In the recorded run this prints a list of (model name, score) pairs.
results = m.evaluate(test_set, pred)
print(results)

INFO:root:Training m1
INFO:root:Training m2
INFO:root:Using f1_score metric
INFO:root:Evaluating m1
INFO:root:Evaluating m2


[('m1', 0.33), ('m2', 0.382)]
