In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.calibration import CalibratedClassifierCV


In [2]:
def convert_features(sentence):
    """Build the word-presence feature dict for one document.

    Parameters
    ----------
    sentence : iterable of str, or str
        The tokens of one document (in this notebook, the output of
        ``nltk.word_tokenize``). A plain string is also accepted and is
        matched by substring, exactly as the ``in`` operator does.

    Returns
    -------
    dict
        ``{word: bool}`` with one entry per item of the module-level
        ``word_features``; the value is ``True`` when the word occurs in
        ``sentence``.

    Notes
    -----
    ``word_features`` is looked up in module scope at call time, so it must
    be loaded before this function is first called.
    """
    # Materialise the tokens once: each membership test becomes O(1) instead
    # of a linear scan per feature word, and one-shot iterables (generators,
    # iterators) are no longer exhausted by the first lookup.
    # Strings are left untouched so substring matching keeps working.
    tokens = sentence if isinstance(sentence, str) else frozenset(sentence)
    return {word: (word in tokens) for word in word_features}

# Number of target classes. The value 2 selects the binary dataset; any
# other value selects the multiclass dataset.
no_of_classes = 5

# Directory prefix (trailing slash included) that later cells concatenate
# with file names such as 'train.csv'.
dataset_path = (
    '../Data/Datasets/Binary Classification/'
    if no_of_classes == 2
    else '../Data/Datasets/Multiclass Classification/'
)

In [3]:
# create the sub models
# Only the pickled preprocessing artefacts are loaded here. The loads of the
# pre-trained classifiers (nb_classifier.pkl / svm_classifier.pkl) are
# intentionally disabled.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# files from a trusted source.
estimators = []

# Naive Bayes side: the collection of words that convert_features() turns
# into presence features.
word_features = joblib.load('../Data/nb_word_features.pkl')

# SVM side: vocabulary and the fitted encoder whose .transform() is applied
# to tokenised sentences.
# NOTE(review): `vocabulary` is not referenced in the visible cells --
# presumably it documents what `onehot_enc` was fitted on; confirm.
vocabulary = joblib.load('../Data/svm_vocabulary.pkl')
onehot_enc = joblib.load(dataset_path+'svm_encode.pkl')

In [4]:
#hold out testing
# Fit both base learners on train.csv, combine their class probabilities with
# a fixed-weight soft vote, and report accuracy on test.csv.
# The CSV files have no header row: column 0 is the raw text, column 1 the
# label.
train = pd.read_csv(dataset_path+'train.csv', sep=",", header=None, index_col=False)
test = pd.read_csv(dataset_path+'test.csv', sep=",", header=None, index_col=False)

# Tokenise column-wise instead of looping with iterrows().
train['tokenized_sents'] = train[0].apply(nltk.word_tokenize)
test['tokenized_sents'] = test[0].apply(nltk.word_tokenize)

# (feature dict, label) pairs in the format expected by NLTK classifiers.
featured_train = [(convert_features(tokens), label)
                  for tokens, label in zip(train['tokenized_sents'], train[1].tolist())]
featured_test = [(convert_features(tokens), label)
                 for tokens, label in zip(test['tokenized_sents'], test[1].tolist())]

# --- SVM branch -------------------------------------------------------------
# LinearSVC has no predict_proba, so it is wrapped in CalibratedClassifierCV.
lsvm = CalibratedClassifierCV(LinearSVC())
lsvm.fit(onehot_enc.transform(train['tokenized_sents']), train[1])
# Encode the test split once and reuse it for both predictions.
test_encoded = onehot_enc.transform(test['tokenized_sents'])
svm_pred = lsvm.predict(test_encoded)
svm_pred_prob = lsvm.predict_proba(test_encoded)

# --- Naive Bayes branch -----------------------------------------------------
nb = nltk.NaiveBayesClassifier.train(featured_train)
n_test = test.shape[0]
# One predicted label per test row (previously allocated as a 2-D array, so
# each label was broadcast across a whole row).
nb_pred = np.zeros(n_test, dtype=int)
nb_pred_prob = np.zeros((n_test, no_of_classes))
# featured_test already holds the feature dicts, so they are not rebuilt here.
for pos, (features, _) in enumerate(featured_test):
    nb_pred[pos] = int(nb.classify(features))
    p = nb.prob_classify(features)
    for label in p.samples():
        # The label doubles as the column index.
        # NOTE(review): assumes labels are the integers 0..no_of_classes-1,
        # which is also what keeps these columns aligned with the
        # predict_proba columns of the SVM -- confirm against the data.
        nb_pred_prob[pos][label] = p.prob(label)

# --- Weighted soft vote -----------------------------------------------------
# Weights are given in the order [SVM, Naive Bayes].
ensemble_weights = [1./4, 3./4]
# Averaging the two full probability matrices at once takes the width from
# the inputs, instead of a buffer hard-coded to 5 columns.
m = np.average([svm_pred_prob, nb_pred_prob], axis=0, weights=ensemble_weights)
ensemble_pred = m.argmax(axis=1)

# Number of correct predictions and the resulting accuracy.
eq = int(np.sum(ensemble_pred == test[1].values))
score = eq/test.shape[0]
print("Holdout testing accuracy: {}".format(score))

Holdout testing accuracy: 0.80042689434365


In [8]:
#cross validation testing
# For every pre-split fold k, train on cv<k>_train.csv, evaluate the weighted
# soft-vote ensemble on cv<k>_test.csv, and finally report the mean accuracy.
# The CSV files have no header row: column 0 is the raw text, column 1 the
# label.
folds = 10
accuracy_esn = []
# Weights are given in the order [SVM, Naive Bayes].
ensemble_weights = [1./4, 3./4]
for idx in range(1,folds+1):
    train = pd.read_csv(dataset_path+'cv'+str(idx)+'_train.csv', sep=",", header=None, index_col=False)
    test = pd.read_csv(dataset_path+'cv'+str(idx)+'_test.csv', sep=",", header=None, index_col=False)

    # Tokenise column-wise instead of looping with iterrows().
    train['tokenized_sents'] = train[0].apply(nltk.word_tokenize)
    test['tokenized_sents'] = test[0].apply(nltk.word_tokenize)

    # (feature dict, label) pairs in the format expected by NLTK classifiers.
    featured_train = [(convert_features(tokens), label)
                      for tokens, label in zip(train['tokenized_sents'], train[1].tolist())]
    featured_test = [(convert_features(tokens), label)
                     for tokens, label in zip(test['tokenized_sents'], test[1].tolist())]

    # SVM branch: LinearSVC has no predict_proba, so it is wrapped in
    # CalibratedClassifierCV.
    lsvm = CalibratedClassifierCV(LinearSVC())
    lsvm.fit(onehot_enc.transform(train['tokenized_sents']), train[1])
    # Encode the test fold once and reuse it for both predictions.
    test_encoded = onehot_enc.transform(test['tokenized_sents'])
    svm_pred = lsvm.predict(test_encoded)
    svm_pred_prob = lsvm.predict_proba(test_encoded)

    # Naive Bayes branch.
    nb = nltk.NaiveBayesClassifier.train(featured_train)
    n_test = test.shape[0]
    # One predicted label per test row (previously allocated as a 2-D array,
    # so each label was broadcast across a whole row).
    nb_pred = np.zeros(n_test, dtype=int)
    nb_pred_prob = np.zeros((n_test, no_of_classes))
    # featured_test already holds the feature dicts, so they are not rebuilt.
    for pos, (features, _) in enumerate(featured_test):
        nb_pred[pos] = int(nb.classify(features))
        p = nb.prob_classify(features)
        for label in p.samples():
            # The label doubles as the column index.
            # NOTE(review): assumes labels are the integers
            # 0..no_of_classes-1, which is also what keeps these columns
            # aligned with the predict_proba columns of the SVM -- confirm.
            nb_pred_prob[pos][label] = p.prob(label)

    # Weighted soft vote over the two full probability matrices; the width is
    # taken from the inputs, instead of a buffer hard-coded to 5 columns.
    m = np.average([svm_pred_prob, nb_pred_prob], axis=0, weights=ensemble_weights)
    ensemble_pred = m.argmax(axis=1)

    # Number of correct predictions and the resulting fold accuracy.
    eq = int(np.sum(ensemble_pred == test[1].values))
    score = eq/test.shape[0]

    accuracy_esn.append(score)
    print("Fold: {}: accuracy = {}".format(idx,score))

# Report the fold count from `folds` so the message cannot drift from the loop.
print("{} fold Cross Validation accuracy = {}".format(folds, np.mean(accuracy_esn)))

Fold: 1: accuracy = 0.8492063492063492
Fold: 2: accuracy = 0.8063660477453581
Fold: 3: accuracy = 0.7898936170212766
Fold: 4: accuracy = 0.8111702127659575
Fold: 5: accuracy = 0.786096256684492
Fold: 6: accuracy = 0.774798927613941
Fold: 7: accuracy = 0.7801608579088471
Fold: 8: accuracy = 0.7774798927613941
Fold: 9: accuracy = 0.7453083109919572
Fold: 10: accuracy = 0.8123324396782842
10 fold Cross Validation accuracy = 0.7932812912377857
