# Imports

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD
import GPyOpt
from timeit import default_timer as timer

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

# Data Preprocessing

In [2]:
# define number of classes
train_pr_label = 2
pool_size_pr_label = 100
pool_size_pr_label += train_pr_label

# load train and test sets
train = pd.read_csv('Data/fashion-mnist_train.csv').to_numpy()
test = pd.read_csv('Data/fashion-mnist_test.csv').to_numpy()

X_train = train[:,1:]
y_train = train[:,0]
X_test = test[:,1:]
y_test = test[:,0]

where_train = []
where_test = []
where_pool = []

for label in range(10):
    where_train.append(np.where(y_train == label)[0][:train_pr_label])
    where_test.append(np.where(y_test == label)[0])
    where_pool.append(np.where(y_train == label)[0][train_pr_label:pool_size_pr_label])

def flatten(array):
    new_array = []
    for sublist in array:
        for item in sublist:
            new_array.append(item)
    return new_array
    
where_train = flatten(where_train)
where_test = flatten(where_test)
where_pool = flatten(where_pool)

X_pool = X_train[where_pool]
y_pool = y_train[where_pool]

X_train = X_train[where_train]
y_train = y_train[where_train]

X_test = X_test[where_test]
y_test = y_test[where_test]
    
print("Train data shape:", X_train.shape, "Test data shape:", X_test.shape)
print("Train labels shape:", y_train.shape,"  Test labels shape:", y_test.shape)
print("Pool data shape:", X_pool.shape,"  Pool labels shape:", y_pool.shape)

Train data shape: (20, 784) Test data shape: (10000, 784)
Train labels shape: (20,)   Test labels shape: (10000,)
Pool data shape: (1000, 784)   Pool labels shape: (1000,)


# Naive Bayes Baseline

In [3]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))
print("{0}% mislabeled data.".format(((y_test != y_pred).sum()/X_test.shape[0])*100))

mislabeled = [0,0,0,0,0,0,0,0,0,0]
count = [0,0,0,0,0,0,0,0,0,0]
for i in range(len(y_pred)):
    count[int(y_test[i])] += 1
    if y_pred[i] != y_test[i]:
        mislabeled[int(y_test[i])] += 1

for i in range(10):
    print("Mislabeled of class {0}: {1}".format(i,(mislabeled[i]/count[i])*100))

Number of mislabeled points out of a total 10000 points : 3880
38.800000000000004% mislabeled data.
Mislabeled of class 0: 22.2
Mislabeled of class 1: 8.3
Mislabeled of class 2: 43.7
Mislabeled of class 3: 32.7
Mislabeled of class 4: 65.7
Mislabeled of class 5: 76.0
Mislabeled of class 6: 60.9
Mislabeled of class 7: 13.700000000000001
Mislabeled of class 8: 56.00000000000001
Mislabeled of class 9: 8.799999999999999


# Loading Bar

In [4]:
class _loading:
    
    def __init__(self, count, headline):
        self.count = count
        self.current = 0
        self.skip = 0
        print("{0} in progress...".format(headline))
        
    def _update(self):
        self.current += 1
        if self.current == self.count:
            print("\r{0}".format("Finished Successfully!                               \n"))
        elif self.skip > self.count/133:
            print("\r{0}".format("|{0}{1}|   {2}% finished.".format("█"*int(round(self.current/self.count,1)*20),"-"*(20-int(round(self.current/self.count,1)*20)),round((self.current/self.count)*100,2))), end = "", flush=True)
            self.skip = 0
            
        else:
            self.skip += 1

# Query By Comittee

In [5]:
class QBC:
    
    model_number = 0
    models = []
    X_pool = []
    y_pool = []
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    
    compare_count = 0
    compare_labels = {}
    
    loading_bar = _loading(0,"")
    
    def __init__(self,model_number,X_train,y_train,X_pool,y_pool,X_test,y_test):
        
        self.model_number = max(model_number,20)
        self.initialize_models()
        
        self.X_pool = X_pool
        self.y_pool = y_pool
        
        self.X_train = X_train
        self.y_train = y_train
        
        self.X_test = X_test
        self.y_test = y_test
        
        
        
    def initialize_models(self):
        params = [["newton-cg","l2"],["newton-cg","none"],["lbfgs","l2"],["lbfgs","none"],["liblinear","l1"],["liblinear","l2"],["sag","l2"],["sag","none"],["saga","elasticnet"],["saga","l1"]]
        trees = np.arange(10,101,10)
        
        for i in range(int(np.floor(self.model_number/2))):
            if params[i][1] == "elasticnet":
                model = LogisticRegression(random_state=i, solver = params[i][0], penalty = params[i][1], l1_ratio=0.2)
            else:
                model = LogisticRegression(random_state=i, solver = params[i][0], penalty = params[i][1])
            self.models.append(model)
            
        for i in range(int(np.floor(self.model_number/2))):
            model = RandomForestClassifier(n_estimators = trees[i])
            
    def train_models(self):
        for model in self.models:
            model.fit(self.X_train,self.y_train)
    
    def acc_model(self):
        scores = []
        for model in self.models:
            preds = model.predict(X_test)
            scores.append(accuracy_score(preds,self.y_test))
        
        return np.mean(scores)
            
    
    def find_pool_idx(self,heuristic):
        y_preds = np.array([])
        for model in self.models:
            y_pred = model.predict(self.X_pool)
            if len(y_preds) == 0:
                y_preds = np.append(y_preds,y_pred)
            else:
                y_preds = np.vstack((y_preds,y_pred))
                
        if heuristic == "majority":
            minimum = 11
            pool_index = 0
            for pred in range(len(y_preds[0])):
                maks = 0
                for label in range(10):
                    count = np.count_nonzero(y_preds[:,pred] == label)
                    if count > maks:
                        maks = count
                if maks < minimum:
                    minimum = maks
                    pool_index = pred
                    
        elif heuristic == "vote_entropy":
            
            values = []
            for pred in range(len(y_preds[0])):
                value = 0
                for label in range(10):
                    count = np.count_nonzero(y_preds[:,pred] == label)
                    value += count/self.model_number*np.log(count/self.model_number) if count != 0 else 0
                values.append(-value)
            
            pool_index = np.argmax(np.array(values))
        
        elif heuristic == "random":
            pool_index = np.random.randint(len(self.X_pool))
        
        self.X_train = np.vstack((self.X_train,self.X_pool[pool_index]))
        self.y_train = np.append(self.y_train,self.y_pool[pool_index])
        
        self.train_models()
        
        self.X_pool = np.delete(self.X_pool, pool_index, axis=0)
        self.y_pool = np.delete(self.y_pool, pool_index, axis=0)
        
        return y_pool[pool_index]
        
    def query(self,itts,heuristics):
        accs = []
        labels = {0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0}
        for i in range(itts):
            #print(f"heuristic: {heuristics}, itt: {i} out of {itts}")
            label = self.find_pool_idx(heuristics)
            accs.append(self.acc_model())
            labels[label] += 1
            self.loading_bar._update()
        return accs, labels
    
    def compare_models(self,itts):
        self.loading_bar = _loading(3*itts,"Comparing QBC Models")
        
        X_train_old = self.X_train
        y_train_old = self.y_train
        
        self.train_models()
        majority_accs, majority_labels = self.query(itts,"majority")
        
        self.X_train = X_train_old
        self.y_train = y_train_old
        self.train_models()
        vote_entropy_accs, vote_entropy_labels = self.query(itts,"vote_entropy")
        
        
        self.X_train = X_train_old
        self.y_train = y_train_old
        self.train_models()
        random_accs, random_labels = self.query(itts,"random")
        
        self.plot_models([majority_accs,vote_entropy_accs,random_accs])
        print(f"Majority: {majority_labels}")
        print(f"vote_entropy: {vote_entropy_labels}")
        print(f"random: {random_labels}")
    
        
    def plot_models(self,accs):
        
        plt.plot(accs[0], label='majority')
        plt.plot(accs[1], label='vote_entropy')
        plt.plot(accs[2], label='random')
        plt.ylabel('accuracy')
        plt.legend(loc="upper left")
        plt.show()
    

 in progress...


In [None]:
# Number of itterations (datapoints from pool) to sample.
number_of_samples = 500
# Number of models /// Number between 1-20.
number_of_models = 20

comittee = QBC(number_of_models,X_train,y_train,X_pool,y_pool,X_test,y_test)
comittee.compare_models(number_of_samples)

Comparing QBC Models in progress...
|██████--------------|   25.13% finished.