In [1]:
import numpy as np
import sklearn
import math
import pandas as pd

In [227]:
class TriTraining:
    def __init__(self, classifiers):
        self.classifier = classifiers
    
    def train(self, L_X, L_y, U_X, X_test, y_test):
        try:
            if len(self.classifier) == 3:#if input 3 models, use them for tritraining
                self.classifiers = [sklearn.base.clone(self.classifier[i]) for i in range(3)]
                for i in range(3):
                    self.classifiers[i].fit(L_X, L_y)
        except TypeError as e:#if input 1 model, clone it three times
            self.classifiers = [sklearn.base.clone(self.classifier) for i in range(3)]
            sample = sklearn.utils.resample(L_X, L_y)
            for i in range(3):
                self.classifiers[i].fit(*sample)
            
        self.init_score = self.score(X_test, y_test)
        self.fit(L_X, L_y, U_X)
        return self.score(X_test, y_test)
        
    def fit(self, L_X, L_y, U_X):
        e_prime = [0.5]*3
        l_prime = [0]*3
        e = [0]*3
        update = [False]*3
        Li_X, Li_y = [[]]*3, [[]]*3#to save proxy labeled data
        improve = True
        self.iter = 0
        
        while improve:
            self.iter += 1#count iterations 
            
            for i in range(3):    
                j, k = np.delete(np.array([0,1,2]),i)
                update[i] = False
                e[i] = self.measure_error(L_X, L_y, j, k)
                if e[i] < e_prime[i]:
                    U_y_j = self.classifiers[j].predict(U_data)
                    U_y_k = self.classifiers[k].predict(U_data)
                    Li_X[i] = U_X[U_y_j == U_y_k]#when two models agree on the label, save it
                    Li_y[i] = U_y_j[U_y_j == U_y_k]
                    if l_prime[i] == 0:#no updated before
                        l_prime[i]  = int(e[i]/(e_prime[i] - e[i]) + 1)
                    if l_prime[i] <len(Li_y[i]):
                        if e[i]*len(Li_y[i])<e_prime[i] * l_prime[i]:
                            update[i] = True
                        elif l_prime[i] > e[i]/(e_prime[i] - e[i]):
                            L_index = np.random.choice(len(Li_y[i]), int(e_prime[i] * l_prime[i]/e[i] -1))#subsample from proxy labeled data
                            Li_X[i], Li_y[i] = Li_X[i][L_index], Li_y[i][L_index]
                            update[i] = True
             
            for i in range(3):
                if update[i]:
                    self.classifiers[i].fit(np.append(L_X,Li_X[i],axis=0), np.append(L_y, Li_y[i], axis=0))#train the classifier on integrated dataset
                    e_prime[i] = e[i]
                    l_prime[i] = len(Li_y[i])
    
            if update == [False]*3:
                improve = False#if no classifier was updated, no improvement


    def predict(self, X):
        pred = np.asarray([self.classifiers[i].predict(X) for i in range(3)])
        pred[0][pred[1]==pred[2]] = pred[1][pred[1]==pred[2]]
        return pred[0]
        
    def score(self, X, y):
        return sklearn.metrics.accuracy_score(y, self.predict(X))
        
    def measure_error(self, X, y, j, k):
        j_pred = self.classifiers[j].predict(X)
        k_pred = self.classifiers[k].predict(X)
        wrong_index =np.logical_and(j_pred != y, k_pred==j_pred)#model_j and model_k make the same wrong prediction
        #wrong_index =np.logical_and(j_pred != y_test, k_pred!=y_test)
        return sum(wrong_index)/sum(j_pred == k_pred)

In [196]:
np.zeros(U_data.shape[0]).shape

(203,)

In [206]:
L_data, L_label, U_data, X_test, y_test = data_process(dataset['australian']['X'], dataset['australian']['y'], 0.6)
# U_label = np.zeros([U_data.shape[0]])
# X_train = np.concatenate((L_data, U_data))
# y_train = np.concatenate((
#             L_label.astype(str),
#             np.full_like(U_label.astype(str), "unlabeled")
#         ))
# m = StandardSelfTraining('s', classifier['DecisionTree'])
# m.fit(X_train, y_train)
# print(m.score(X_test, y_test))
m1 = SelfTraining1(classifier['DecisionTree'])
print(m1.train(L_data, L_label, U_data, X_test, y_test, 0.6), m1.init_score)

0.832369942197 0.797687861272


In [226]:
class SelfTraining1:
    def __init__(self, classifiers):
        self.classifier = classifiers
        
    def train(self, L_X, L_y, U_X, X_test, y_test, tau):
        try:
            if len(self.classifier) == 3:#if input 3 models, use them for tritraining
                self.classifiers = [sklearn.base.clone(self.classifier[i]) for i in range(3)]
                for i in range(3):
                    self.classifiers[i].fit(L_X, L_y)
        except TypeError as e:#if input 1 model, clone it three times
            self.classifiers = [sklearn.base.clone(self.classifier) for i in range(3)]
            sample = sklearn.utils.resample(L_X, L_y)
            for i in range(3):
                self.classifiers[i].fit(*sample)
            
        self.init_score = self.score(X_test, y_test)
        self.fit(L_X, L_y, U_X, tau)
        return self.score(X_test, y_test)
    
    def fit(self, L_X, L_y, U_X, tau):
        for c in range(3):
            improve =  True
            self.iter = 0
            while improve and len(U_X)!=0:
                self.classifiers[c].fit(L_X, L_y)
                U_prob = self.classifiers[c].predict_proba(U_X)
                U_label = self.classifiers[c].predict(U_X)
                label_index = np.argmax(U_prob, axis = 1)>tau

                if sum(label_index) ==0:
                    improve = False
                self.iter += 1
                L_X = np.append(L_X, U_X[label_index], axis=0)
                L_y = np.append(L_y, U_label[label_index])
                U_X = np.delete(U_X, np.where(label_index), axis=0)

    def predict(self, X):
        pred = np.asarray([self.classifiers[i].predict(X) for i in range(3)])
        pred[0][pred[1]==pred[2]] = pred[1][pred[1]==pred[2]]
        return pred[0]
        
    def score(self, X, y):
        return sklearn.metrics.accuracy_score(y, self.predict(X))


In [225]:
class SelfTraining2:
    def __init__(self, classifiers):
        self.classifier = classifiers
    
    def train(self, L_X, L_y, U_X, X_test, y_test):
        try:
            if len(self.classifier) == 3:#if input 3 models, use them for tritraining
                self.classifiers = [sklearn.base.clone(self.classifier[i]) for i in range(3)]
                for i in range(3):
                    self.classifiers[i].fit(L_X, L_y)
        except TypeError as e:#if input 1 model, clone it three times
            self.classifiers = [sklearn.base.clone(self.classifier) for i in range(3)]
            sample = sklearn.utils.resample(L_X, L_y)
            for i in range(3):
                self.classifiers[i].fit(*sample)
            
        self.init_score = self.score(X_test, y_test)
        self.fit(L_X, L_y, U_X)
        return self.score(X_test, y_test)
        
    def fit(self, L_X, L_y, U_X):
        e_prime = [0.5]*3
        l_prime = [0]*3
        e = [0]*3
        update = [False]*3
        Li_X, Li_y = [[]]*3, [[]]*3#to save proxy labeled data
        improve = True
        self.iter = 0
        
        while improve:
            self.iter += 1#count iterations 
            
            for i in range(3):    
                j, k = np.delete(np.array([0,1,2]),i)
                update[i] = False
                e[i] = self.measure_error(L_X, L_y, j, k)
                if e[i] < e_prime[i]:
                    U_y_j = self.classifiers[j].predict(U_data)
                    U_y_k = self.classifiers[k].predict(U_data)
                    Li_X[i] = U_X[U_y_j == U_y_k]#when two models agree on the label, save it
                    Li_y[i] = U_y_j[U_y_j == U_y_k]
                    if l_prime[i] == 0:#no updated before
                        l_prime[i]  = int(e[i]/(e_prime[i] - e[i]) + 1)
                    if l_prime[i] <len(Li_y[i]):
                        if e[i]*len(Li_y[i])<e_prime[i] * l_prime[i]:
                            update[i] = True
                        elif l_prime[i] > e[i]/(e_prime[i] - e[i]):
                            L_index = np.random.choice(len(Li_y[i]), int(e_prime[i] * l_prime[i]/e[i] -1))#subsample from proxy labeled data
                            Li_X[i], Li_y[i] = Li_X[i][L_index], Li_y[i][L_index]
                            update[i] = True
             
            for i in range(3):
                if update[i]:
                    self.classifiers[i].fit(np.append(L_X,Li_X[i],axis=0), np.append(L_y, Li_y[i], axis=0))#train the classifier on integrated dataset
                    e_prime[i] = e[i]
                    l_prime[i] = len(Li_y[i])
    
            if update == [False]*3:
                improve = False#if no classifier was updated, no improvement


    def predict(self, X):
        return self.classifiers[0].predict(X)
        
    def score(self, X, y):
        return sklearn.metrics.accuracy_score(y, self.predict(X))
        
    def measure_error(self, X, y, j, k):
        j_pred = self.classifiers[j].predict(X)
        k_pred = self.classifiers[k].predict(X)
        wrong_index =np.logical_and(j_pred != y, k_pred==j_pred)#model_j and model_k make the same wrong prediction
        #wrong_index =np.logical_and(j_pred != y_test, k_pred!=y_test)
        return sum(wrong_index)/sum(j_pred == k_pred)

In [4]:
class TriTrainingwDisagreement():

    def __init__(self, classifier):
        """
        args:
            classifier - classifier, with .train, .predict API (refer to classifiers of sklearn)
        """
        # Initialize
        self.clf = [sklearn.base.clone(classifier) for i in range(3)]

    def measure_error(self, j, k):
        """
        args:
                j - int, classifier index
                k - int, classifier index
        return:
                float, classification_error
        """
        y_predict_j = self.clf[j].predict(self.X_label)
        y_predict_k = self.clf[k].predict(self.X_label)
        return (1 - np.sum((y_predict_j == y_predict_k) & (y_predict_j == self.y_label)) / np.sum(y_predict_j == y_predict_k))

    def fit(self, X_label, y_label, X_unlabel):
        """
        args:
                X_label - labeled train feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
                y_label - labeled train label vector (ndarray of size, # of samples), labels are numeric numbers
                X_unlabel - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
        """        

        self.X_label = X_label
        self.y_label = y_label

        classification_error_current = [0.5, 0.5, 0.5]
        classification_error = [0.5, 0.5, 0.5]
        pseudo_label_size_current = [0, 0, 0]
        pseudo_label_size = [0, 0, 0]
        # pseudo_label_index used to compare and check if tri-training can be stopped, when two iterations have the same label_index, means tri-training can be stopped
        X_pseudo_label_index = [[], [], []]
        X_pseudo_label_index_current = [[], [], []]

        feature_size = self.X_label.shape[1]

        # Train each classifier with bootstrampped subset
        for i in range(3):
            X_resample, y_resample = sklearn.utils.resample(self.X_label, self.y_label)  # BootstrapSample(L)
            self.clf[i].fit(X_resample, y_resample)  # Learn(Si)

        iteration = 0
        while (True):

            update = [False, False, False]

            iteration = iteration + 1
            for i in range(3):
                X_pseudo_label_index_current[i] = X_pseudo_label_index[i]

            # Step3.1 Set Li = empty set, Li denotes the new pseudo label set determined by tri-training iteration for classifier i
            # X_pseudo_label_index, contains the data record index (in the full unlabelled set) of the new pseudo label set determined by tri-training iteration for classifier i
            # X_pseudo_label, contains the features for new pseudo label set determined by tri-training iteration for classifier i
            # y_pseudo_label, contains the labels (not ground truth label, but pseudo label calculated by tri-training iteration) for new pseudo label set determined by tri-training iteration for classifier i
            X_pseudo_label_index = [[], [], []]
            X_pseudo_label = [[], [], []]
            y_pseudo_label = [[], [], []]

            # Step 3.2 Loop through all the data record in unlabelled set
            for i in range(3):
                j, k = np.delete(np.array([0, 1, 2]), i)
                classification_error[i] = self.measure_error(j, k)
                if classification_error[i] < classification_error_current[i]:
                    # Step 3.2 If classifier j,k aggrees with the label for one data record, and not agree with classifier i, in unlabelled set,
                    # then add the data record into Li                    
                    y_predict_j = self.clf[j].predict(X_unlabel)
                    y_predict_k = self.clf[k].predict(X_unlabel)
                    y_predict_i = self.clf[i].predict(X_unlabel)
                    y_pseudo_label[i] = y_predict_j[np.logical_and(y_predict_j==y_predict_k,y_predict_j!=y_predict_i)]
                    X_pseudo_label_index[i] = np.where(np.logical_and(y_predict_j==y_predict_k,y_predict_j!=y_predict_i))
                    
                    pseudo_label_size[i] = len(X_pseudo_label_index[i])
                    #print("classification_error: {}, classification_error_current: {}, pseudo_label_size: {}, pseudo_label_size_current: {}".format(classification_error[i], classification_error_current[i], pseudo_label_size[i],pseudo_label_size_current[i]))

                    if pseudo_label_size_current[i] == 0:
                        pseudo_label_size_current[i] = math.floor(classification_error[i] / (classification_error_current[i] - classification_error[i]) + 1)
                    if pseudo_label_size_current[i] < pseudo_label_size[i]:
                        if ((classification_error[i] * pseudo_label_size[i]) < (classification_error_current[i] * pseudo_label_size_current[i])):
                            update[i] = True
                        elif pseudo_label_size_current[i] > (classification_error[i] / (classification_error_current[i] - classification_error[i])):
                            resample_size = math.ceil(classification_error_current[i] * pseudo_label_size_current[i] / classification_error[i] - 1)
                            X_pseudo_label_index[i], y_pseudo_label[i] = sklearn.utils.resample(X_pseudo_label_index[i],y_pseudo_label[i],replace=False,n_samples=resample_size)
                            pseudo_label_size[i] = len(X_pseudo_label_index[i])
                            update[i] = True

            # Step 3.3 Train all the three classifiers with Li + original labelled data set
            for i in range(3):
                if update[i] == True:
                    #print("number of pseudo labels added for classifier {} is: {}".format(i,len(X_pseudo_label_index[i])))
                    X_pseudo_label[i] = np.array(X_unlabel[X_pseudo_label_index[i]])
                    self.clf[i].fit(np.concatenate((X_pseudo_label[i], self.X_label), axis=0),np.concatenate((np.array(y_pseudo_label[i]), self.y_label), axis=0))
                    classification_error_current[i] = classification_error[i]
                    pseudo_label_size_current[i] = pseudo_label_size[i]

            # Stop tri-training process, if the pseudo label data set added in current tri-training iteration
            # is the same for last tri-training iteration for all classifiers
            if (np.array_equal(X_pseudo_label_index[0], X_pseudo_label_index_current[0]) & np.array_equal(X_pseudo_label_index[1], X_pseudo_label_index_current[1]) 
                    & np.array_equal(X_pseudo_label_index[2], X_pseudo_label_index_current[2])):
                break

    def predict(self, X_test):
        """
        args:
                X_test - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
        return:
                array of size (# of test samples), with values as predicted label 1 or 0
        """
        I = self.clf[0].predict(X_test)
        J = self.clf[1].predict(X_test)
        K = self.clf[2].predict(X_test)
        I[J == K] = J[J == K]
        return I

    def score(self, X_test, y_test):
        """
        args:
                X_test - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
                y_test - test label vector (ndarray of size, # of samples), labels are numeric numbers
        return:
                float, accuracy_score of predicted value by the tri-training (with disagreement) classifier against groud truth
        """
        
        return sklearn.metrics.accuracy_score(y_test, self.predict(X_test))

In [5]:
def data_process(data, label, rate, test_rate=0.25):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = test_rate, random_state=0)

    rng = np.random.RandomState(0)#to make same index every time
    labeled_index = rng.rand(len(y_train)) < rate#in training set, choose 20% as labeled data
    unlabeled_index = np.logical_not(labeled_index)
    L_data = X_train[labeled_index]#data of L
    L_label = y_train[labeled_index]#lable of L
    U_data = X_train[unlabeled_index]#data of U
    return L_data, L_label, U_data, X_test, y_test

In [200]:
classifier = {}

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

classifier['DecisionTree'] = tree.DecisionTreeClassifier()
classifier['BP_Network'] = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(20, 10), random_state=1)
classifier['NaiveBayes'] = GaussianNB()
# classifier['KNN'] = KNeighborsClassifier(
#         n_neighbors=3,
#         metric="euclidean",
#         #n_jobs=2  # Parallelize work on CPUs
#     )
# classifier['SGD'] = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
# classifier['SVM'] = svm.SVC(gamma='scale')

In [7]:
import numpy as np
dataset = {}

data = np.loadtxt('australian.dat')[:, 0:14]
label = np.loadtxt('australian.dat')[:, 14]
dataset['australian'] = {'X': data, 'y':label}

data = np.loadtxt('ionosphere.data', delimiter=',')[:, 0:34]
label = np.loadtxt('ionosphere.data', delimiter=',')[:, 34]
dataset['ionosphere'] = {'X': data, 'y':label}

data = np.loadtxt('wdbc.data', delimiter=',')[:, 2:34]
label = np.loadtxt('wdbc.data', delimiter=',')[:, 1]
dataset['wdbc'] = {'X': data, 'y':label}

data = np.loadtxt('bupa.data', delimiter=',')[:, :6]
label = np.loadtxt('bupa.data', delimiter=',')[:, 6]
dataset['bupa'] = {'X': data, 'y': label}

data = np.loadtxt('german.data-numeric')[:, 0:20]
label = np.loadtxt('german.data-numeric')[:, 20]
dataset['german'] = {'X': data, 'y': label}

In [223]:
sklearn.base.is_classifier(classifiers[0])

True

In [218]:
L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], r)

classifiers = [sklearn.base.clone(classifier[c])]
for clf in classifier.keys():
    if clf != c:
        print(clf)
        classifiers.append(sklearn.base.clone(classifier[clf]))#use the first clf in classifiers to output score, but all three to label data
print(classifiers)
m = TriTraining(classifiers)
1 - m.train(L_data, L_label, U_data, X_test, y_test)


DecisionTree
BP_Network
[GaussianNB(priors=None, var_smoothing=1e-09), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)]


0.060000000000000053

In [250]:
results = None

for d in dataset:
    for c in classifier:
        for r in [0.2, 0.4, 0.6, 0.8]:        
            print('dataset:', d, dataset[d]['X'].shape)
            print('classifier:', c)
            print('label_rate:', r)
            error = np.zeros([4,50])
            init_error = np.zeros([3,50])
            for i in range(50):#average on 20 data splits
                L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], r)
                
                classifiers = [sklearn.base.clone(classifier[c])]
                
                for clf in classifier.keys():
                    if clf != c:
                        classifiers.append(sklearn.base.clone(classifier[clf]))#use the first clf in classifiers to output score, but all three to label data
                m = TriTraining(classifier[c])
                error[0,i] = 1 - m.train(L_data, L_label, U_data, X_test, y_test)
                init_error[0,i] = 1- m.init_score
                
#                 m2 = TriTrainingwDisagreement(classifier[c])
#                 m2.fit(L_data, L_label, U_data)
#                 error[1, i] = 1-m2.score(X_test, y_test)
                m2 = SelfTraining1(classifier[c])
                error[1,i] = 1 - m2.train(L_data, L_label, U_data, X_test, y_test, 0.7)
                init_error[1,i] = 1- m2.init_score
            
                m3 = SelfTraining2(classifier[c])
                error[2,i] = 1 - m3.train(L_data, L_label, U_data, X_test, y_test)
                init_error[2,i] = 1- m3.init_score
                
                m4 = sklearn.base.clone(classifier[c])
                m4.fit(L_data, L_label)
                error[3,i] = 1 - m4.score(X_test, y_test)
                
            e = np.mean(error, axis = 1)
            init_e = np.mean(init_error, axis = 1)
#             print('TriTraining test error', e[0])
#             print('TriTraining Disagree test error', e[1])
#             print('SelfTraining test error', e[2])
#             print('Supervised test error', e[3],'\n')
            
            test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
            errors = {'TriTraining': e[0], 'SelfTraining1': e[1],'SelfTraining2': e[2], 'Supervised': e[3], 'TriTraining_init': init_e[0], 'SelfTraining1_init': init_e[1], 'SelfTraining2_init': init_e[2]}#, 'Best': methods[np.argmin(e)]}#'
            if results is None:
                results = pd.DataFrame([{**test_info, **errors}])
            else:
                results.loc[len(results.index)] = {**test_info, **errors}

dataset: australian (690, 14)
classifier: DecisionTree
label_rate: 0.2
dataset: australian (690, 14)
classifier: DecisionTree
label_rate: 0.4
dataset: australian (690, 14)
classifier: DecisionTree
label_rate: 0.6
dataset: australian (690, 14)
classifier: DecisionTree
label_rate: 0.8
dataset: australian (690, 14)
classifier: BP_Network
label_rate: 0.2
dataset: australian (690, 14)
classifier: BP_Network
label_rate: 0.4
dataset: australian (690, 14)
classifier: BP_Network
label_rate: 0.6
dataset: australian (690, 14)
classifier: BP_Network
label_rate: 0.8
dataset: australian (690, 14)
classifier: NaiveBayes
label_rate: 0.2
dataset: australian (690, 14)
classifier: NaiveBayes
label_rate: 0.4
dataset: australian (690, 14)
classifier: NaiveBayes
label_rate: 0.6
dataset: australian (690, 14)
classifier: NaiveBayes
label_rate: 0.8
dataset: ionosphere (351, 34)
classifier: DecisionTree
label_rate: 0.2
dataset: ionosphere (351, 34)
classifier: DecisionTree
label_rate: 0.4
dataset: ionosphere (3

In [251]:
pd.pivot_table(results, values=None, index=['label_rate', 'dataset', 'classifier' ])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SelfTraining1,SelfTraining1_init,SelfTraining2,SelfTraining2_init,Supervised,TriTraining,TriTraining_init
label_rate,dataset,classifier,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.2,"australian(690, 14)",BP_Network,0.359538,0.359538,0.344624,0.356994,0.323699,0.347283,0.382081
0.2,"australian(690, 14)",DecisionTree,0.160231,0.17815,0.175029,0.177457,0.177457,0.169942,0.177341
0.2,"australian(690, 14)",NaiveBayes,0.179191,0.18474,0.18763,0.17341,0.184971,0.187514,0.183353
0.2,"bupa(345, 6)",BP_Network,0.464368,0.464368,0.446437,0.456552,0.45977,0.445977,0.452644
0.2,"bupa(345, 6)",DecisionTree,0.354943,0.417471,0.387126,0.390805,0.378161,0.388736,0.432644
0.2,"bupa(345, 6)",NaiveBayes,0.471264,0.445747,0.462759,0.443218,0.45977,0.457701,0.452414
0.2,"german(1000, 20)",BP_Network,0.18,0.20072,0.14368,0.20936,0.172,0.14064,0.20568
0.2,"german(1000, 20)",DecisionTree,0.032,0.04144,0.0364,0.04248,0.02992,0.03856,0.0396
0.2,"german(1000, 20)",NaiveBayes,0.112,0.13512,0.12992,0.16504,0.096,0.09816,0.1248
0.2,"ionosphere(351, 34)",BP_Network,0.25,0.245455,0.266364,0.252727,0.25,0.255909,0.248636


In [257]:
results['Best'] = results.loc[:, ['TriTraining','SelfTraining1','TriTraining_init','SelfTraining1_init','SelfTraining2_init','SelfTraining2', 'Supervised']].idxmin(axis = 1)
results['Tri'] = results.loc[:, ['TriTraining','TriTraining_init']].idxmin(axis = 1)
results['Self1'] = results.loc[:, ['SelfTraining1','SelfTraining1_init']].idxmin(axis = 1)
results['Self2'] = results.loc[:, ['SelfTraining2','SelfTraining2_init']].idxmin(axis = 1)
results['Best'].value_counts()

Supervised            16
TriTraining           16
SelfTraining1         15
SelfTraining1_init     6
SelfTraining2_init     3
TriTraining_init       3
SelfTraining2          1
Name: Best, dtype: int64

In [262]:
writer = pd.ExcelWriter('output.xlsx')
results.to_excel(writer,'Sheet1')
writer.save()

In [261]:
!pip install openpyxl

Collecting openpyxl
[?25l  Downloading https://files.pythonhosted.org/packages/08/8a/509eb6f58672288da9a5884e1cc7e90819bc8dbef501161c4b40a6a4e46b/openpyxl-2.5.12.tar.gz (173kB)
[K    100% |████████████████████████████████| 174kB 7.9MB/s ta 0:00:01
[?25hCollecting jdcal (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/a0/38/dcf83532480f25284f3ef13f8ed63e03c58a65c9d3ba2a6a894ed9497207/jdcal-1.4-py2.py3-none-any.whl
Collecting et_xmlfile (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/22/28/a99c42aea746e18382ad9fb36f64c1c1f04216f41797f2f0fa567da11388/et_xmlfile-1.0.1.tar.gz
Building wheels for collected packages: openpyxl, et-xmlfile
  Running setup.py bdist_wheel for openpyxl ... [?25ldone
[?25h  Stored in directory: /jet/.cache/pip/wheels/95/b0/38/e5d13093b588f87177df648c06d07d4b7221f2c17d544cde4c
  Running setup.py bdist_wheel for et-xmlfile ... [?25ldone
[?25h  Stored in directory: /jet/.cache/pip/wheels/2a/77/35/0da0965a057698121fc7d8