In [2]:
import numpy as np
import os
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics as mtr
import pickle

# PreClassifier Evaluation

This is the notebook we used to compare the candidate parameter settings and choose between them. See the PreClassifierClass for the final model.

In [3]:
class QPC:
    '''
    Query pre-classifier.

    Questions are vectorized with a CountVectorizer, optionally reduced with a
    truncated SVD, and classified by an MLPClassifier. Predicted labels are
    integer indices into ``self.classes``.
    '''

    def __init__(self, conf='f', ngram_range = (1,2), min_df = 1, tsvd_cmp=0):
        '''
        Arguments
            conf:         Configuration ID for saving or loading model
            ngram_range:  Ngrams to use when tokenizing
            min_df:       Minimum document frequency
            tsvd_cmp:     How many components to include in truncated SVD decomposition.
                          A falsy value (the default 0) disables the decomposition.
        '''
        self.conf = conf
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.tsvd_cmp = tsvd_cmp

        self.classes = np.array(['resource', 'date', 'number', 'string', 'boolean'])

        # Fitted components; all stay None until train() fits or loads them.
        self.cv = None
        self.mlpc = None
        self.tsvd = None

    def _get_queries(self, filepath):
        '''
        Method for retrieving queries from file

        Returns the parsed JSON content of ``filepath``. The other methods
        treat it as a sequence of dicts with the keys 'question', 'category'
        and 'type'.
        '''
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(filepath, encoding='utf-8') as f:
            return json.load(f)

    def _get_labels(self, data):
        '''
        Method for turning query records into integer class labels.

        Records whose 'question' is None are skipped (mirroring how the
        question texts are selected). For 'literal' queries the label is the
        first entry of 'type'; for every other query it is 'category'.

        Returns an int array of indices into ``self.classes``.
        Raises IndexError if a label name is not in ``self.classes``.
        '''
        # First occurrence wins, so duplicates resolve to the lowest index.
        class_index = {}
        for idx, name in enumerate(np.asarray(self.classes).tolist()):
            class_index.setdefault(name, idx)

        y_label = []
        for query in data:
            if query['question'] is None:
                continue
            if query['category'] == 'literal':
                name = query['type'][0]
            else:
                name = query['category']
            if name not in class_index:
                raise IndexError('Unknown answer class: {!r}'.format(name))
            y_label.append(class_index[name])
        return np.array(y_label).astype(int)

    def train(self, filepath):
        '''
        Method for training word vectorizer and neural network.

        A model previously saved under this configuration ID is loaded if
        possible; otherwise the vectorizer, the optional SVD and the
        classifier are fitted on the queries found in ``filepath``.
        '''
        query_file = self._get_queries(filepath)
        queries = [q['question'] for q in query_file if q['question'] is not None]

        try:
            self.cv, self.mlpc, self.tsvd = self._load_model(self.conf)
            return
        except FileNotFoundError:
            print('No saved model found. Training...')
        except Exception as err:
            # Still fall back to training, but say why the load failed. Unlike
            # a bare except, this lets KeyboardInterrupt/SystemExit through.
            print('Saved model could not be loaded ({}). Training...'.format(err))

        self.cv = CountVectorizer(ngram_range = self.ngram_range, min_df = self.min_df)
        train_vec = self.cv.fit_transform(queries)

        #preparing labels
        train_label = self._get_labels(query_file)

        if self.tsvd_cmp:
            # Seeded like the classifier below so repeated runs are comparable.
            self.tsvd = TruncatedSVD(n_components=self.tsvd_cmp, random_state=1)
            train_vec = self.tsvd.fit_transform(train_vec)

        self.mlpc = MLPClassifier(random_state=1, learning_rate_init=0.03, learning_rate='adaptive')
        self.mlpc.fit(train_vec, train_label)

    def _metrics(self, label, pred):
        '''
        Metrics for evaluating query preclassifier

        Prints accuracy, balanced accuracy and macro/micro F1 of ``pred``
        against ``label``.
        '''
        acc = mtr.accuracy_score(label, pred)
        bacc = mtr.balanced_accuracy_score(label, pred)
        f1_mac = mtr.f1_score(label, pred,average='macro')
        f1_mic = mtr.f1_score(label, pred,average='micro')
        print('Accuracy: {:.3}\nBalanced Accuracy: {:.3}\nF1 Macro: {:.3}\nF1 Micro: {:.3}'.format(acc,bacc,f1_mac,f1_mic))

    def predict(self, filepath, metrics=False):
        '''
        Method for predicting category labels.

        Arguments
            filepath:  JSON file with the queries to classify
            metrics:   If True, also print evaluation metrics; the file must
                       then contain the gold 'category'/'type' fields.

        Returns the predicted class indices, or None if no model is available.
        '''
        if self.mlpc is None:
            print('Training not completed')
            return None
        query_file = self._get_queries(filepath)
        queries = [q['question'] for q in query_file if q['question'] is not None]
        vec = self.cv.transform(queries)
        if self.tsvd_cmp:
            vec = self.tsvd.transform(vec)
        pred = self.mlpc.predict(vec)
        if metrics:
            # Labels are only built on request so unlabeled files can be predicted.
            self._metrics(self._get_labels(query_file), pred)
        return pred

    def _model_path(self, part, conf):
        '''
        File name used to persist one model component ('cv', 'mlpc' or 'tsvd').
        '''
        return 'qpc' + part + '-' + conf + '.sav'

    def save_model(self):
        '''
        Method for pickling the fitted components into the working directory.

        Prints a notice and writes nothing if no model has been trained.
        '''
        if self.mlpc is None:
            print('Model not trained.')
            return
        with open(self._model_path('cv', self.conf),'wb') as f:
            pickle.dump(self.cv,f)
        with open(self._model_path('mlpc', self.conf),'wb') as f:
            pickle.dump(self.mlpc,f)
        if self.tsvd_cmp:
            with open(self._model_path('tsvd', self.conf),'wb') as f:
                pickle.dump(self.tsvd,f)

    def _load_model(self, conf):
        '''
        Method for loading the components saved under configuration ID ``conf``.

        Returns [cv, mlpc, tsvd]; tsvd is None when ``self.tsvd_cmp`` is falsy.
        Raises FileNotFoundError if a required file does not exist.
        '''
        # SECURITY: pickle.load can execute arbitrary code. Only load model
        # files that were written by save_model() from a trusted source.
        tsvd = None
        with open(self._model_path('cv', conf),'rb') as f:
            cv = pickle.load(f)
        with open(self._model_path('mlpc', conf),'rb') as f:
            mlpc = pickle.load(f)
        if self.tsvd_cmp:
            with open(self._model_path('tsvd', conf),'rb') as f:
                tsvd = pickle.load(f)
        return [cv, mlpc, tsvd]

In [None]:
# Candidate configurations to compare; each `conf` is the ID its model is saved under.
test_models = [
    QPC(conf='tc1', min_df=1),
    QPC(conf='tc2', min_df=0.01),
    QPC(conf='tc3', min_df=0.005),
    QPC(conf='tc4', min_df=0.0005),
    QPC(conf='tc5', min_df=1, tsvd_cmp=1000),
]

# Human-readable description of each configuration, in the same order as above.
text = [
    'Minimum document frequency: 1, no decomposition.',
    'Minimum document frequency: 1%, no decomposition.',
    'Minimum document frequency: .5%, no decomposition.',
    'Minimum document frequency: .05%, no decomposition.',
    'Minimum document frequency: 1, decomposition:1000 components.',
]

train_path = os.path.join('Data', 'train_set')
validation_path = os.path.join('Data', 'validation_set')

# Train (or load) each configuration, persist it, and print its validation metrics.
for description, model in zip(text, test_models):
    print(description)
    model.train(train_path)
    model.save_model()
    print('\n')
    model.predict(validation_path, metrics=True)
    print('\n')