In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import sklearn
from sklearn.metrics import roc_auc_score
import random
random.seed(0)

In [2]:
t0 = time.time()
train_samples = 60000
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_samples, test_size=10000)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("==================")
X_train=list(X_train)
X_test=list(X_test)
print(len(X_train))
print(len(X_test))
print("==================")

60000
10000


In [3]:
templist = []
for tup in zip(X_train, y_train):
    if(tup[1]=='8' or tup[1]=='3'):
        templist.append(list(tup))
    
testList = []
for tup in zip(X_test, y_test):
    if(tup[1]=='8' or tup[1]=='3'):
        testList.append(list(tup))
    
print("Initial Training set size = "+str(len(templist)))
print("Initial Test set size = "+str(len(testList)))

Initial Training set size = 11945
Initial Test set size = 2021


In [4]:
X_test=[]
y_test=[]

for tup in testList:
	X_test.append(tup[0])
	y_test.append(tup[1])


random.shuffle(templist)

#seed_size=int(0.1*len(templist))
seed_size=5

seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:20]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))

Initial Seed set size = 5
Unlabelled dataset size = 15


In [5]:
import numpy as np
import lime
import lime.lime_tabular


def explanation_generation_based_and_train_test(seed_set, unlabelled_list, batch_size):
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    
    feature_names=[]

    for i in range(784):
        feature='feature_'+str(i)
        feature_names.append(feature)
    

    class_names=[]
    classes=['3','8']

    for i in classes:
        class_1='class_'+str(i)
        class_names.append(class_1)
        
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    
    seed_X=[]
    seed_y=[]

    for tup in seed_list:
        seed_X.append(list(tup[0]))
        seed_y.append(list(tup[1]))
        
    test_X=[]
    test_y=[]

    for tup in testList:
        test_X.append(list(tup[0]))
        test_y.append(list(tup[1]))
    
    clf.fit(seed_X, seed_y)
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(test_y, clf.predict(test_X))))
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(seed_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    total_np_array=[]
    num_features=50
    for item in seed_list:
        #print(item)
        exp = explainer.explain_instance(item[0], clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        total_np_array.append(vectors[0])
        
    
    seed_mean = np.mean(np.array(total_np_array), axis=0)
    
    ulabelled_X=[]
    ulabelled_y=[]

    for tup in unlabelled_list:
        ulabelled_X.append(list(tup[0]))
        ulabelled_y.append(list(tup[1]))
        
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(ulabelled_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    sums=[]
    for item in unlabelled_list:
        exp = explainer.explain_instance(item[0], clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        np_vector=vectors[0]
        sums.append(np.sum(np.subtract(seed_mean, np_vector)))
        
    
    
    print("Max= "+str(max(sums)))
    print("Index= "+str(sums.index(max(sums))))
    
    print("Unlabelled List Before: "+str(len(unlabelled_list)))
    index = sums.index(max(sums))
    
    
    print("Seed List Before: "+str(len(seed_list)))
    seed_list.append(unlabelled_list[index])
    print("Seed List After: "+str(len(seed_list)))
    
    del unlabelled_list[index]
    print("Unlabelled List After: "+str(len(unlabelled_list)))

In [6]:
for i in range(len(unlabelled_list)):
    explanation_generation_based_and_train_test(seed_list, unlabelled_list, 1)

Before Sampling Unlabelled Data Size: 15
Before Sampling Seed Data Size: 5
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= 2.4526636718292826e-32
Index= 0
Unlabelled List Before: 15
Seed List Before: 5
Seed List After: 6
Unlabelled List After: 14
Before Sampling Unlabelled Data Size: 14
Before Sampling Seed Data Size: 6
Accuracy on the Test Set is : 0.5056902523503216


  y = column_or_1d(y, warn=True)


Max= 3.4636542057891814e-32
Index= 1
Unlabelled List Before: 14
Seed List Before: 6
Seed List After: 7
Unlabelled List After: 13
Before Sampling Unlabelled Data Size: 13
Before Sampling Seed Data Size: 7
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= 4.5554517670822373e-32
Index= 2
Unlabelled List Before: 13
Seed List Before: 7
Seed List After: 8
Unlabelled List After: 12
Before Sampling Unlabelled Data Size: 12
Before Sampling Seed Data Size: 8
Accuracy on the Test Set is : 0.5056902523503216


  y = column_or_1d(y, warn=True)


Max= 2.2753957957327166e-32
Index= 8
Unlabelled List Before: 12
Seed List Before: 8
Seed List After: 9
Unlabelled List After: 11
Before Sampling Unlabelled Data Size: 11
Before Sampling Seed Data Size: 9


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.5056902523503216
Max= 5.1484755006373725e-33
Index= 1
Unlabelled List Before: 11
Seed List Before: 9
Seed List After: 10
Unlabelled List After: 10
Before Sampling Unlabelled Data Size: 10
Before Sampling Seed Data Size: 10
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= 3.510383856602637e-32
Index= 9
Unlabelled List Before: 10
Seed List Before: 10
Seed List After: 11
Unlabelled List After: 9
Before Sampling Unlabelled Data Size: 9
Before Sampling Seed Data Size: 11


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= 1.6075413807796212e-32
Index= 5
Unlabelled List Before: 9
Seed List Before: 11
Seed List After: 12
Unlabelled List After: 8
Before Sampling Unlabelled Data Size: 8
Before Sampling Seed Data Size: 12


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= 3.039689181901504e-32
Index= 6
Unlabelled List Before: 8
Seed List Before: 12
Seed List After: 13
Unlabelled List After: 7
Before Sampling Unlabelled Data Size: 7
Before Sampling Seed Data Size: 13


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= 6.7841152582557064e-34
Index= 0
Unlabelled List Before: 7
Seed List Before: 13
Seed List After: 14
Unlabelled List After: 6
Before Sampling Unlabelled Data Size: 6
Before Sampling Seed Data Size: 14
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= 2.9018347643937e-32
Index= 1
Unlabelled List Before: 6
Seed List Before: 14
Seed List After: 15
Unlabelled List After: 5
Before Sampling Unlabelled Data Size: 5
Before Sampling Seed Data Size: 15


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= 8.644841074838079e-34
Index= 0
Unlabelled List Before: 5
Seed List Before: 15
Seed List After: 16
Unlabelled List After: 4
Before Sampling Unlabelled Data Size: 4
Before Sampling Seed Data Size: 16
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= -2.808818722197449e-34
Index= 0
Unlabelled List Before: 4
Seed List Before: 16
Seed List After: 17
Unlabelled List After: 3
Before Sampling Unlabelled Data Size: 3
Before Sampling Seed Data Size: 17


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= 1.779083379814898e-33
Index= 2
Unlabelled List Before: 3
Seed List Before: 17
Seed List After: 18
Unlabelled List After: 2
Before Sampling Unlabelled Data Size: 2
Before Sampling Seed Data Size: 18
Accuracy on the Test Set is : 0.4943097476496784


  y = column_or_1d(y, warn=True)


Max= 2.8820809821811342e-33
Index= 0
Unlabelled List Before: 2
Seed List Before: 18
Seed List After: 19
Unlabelled List After: 1
Before Sampling Unlabelled Data Size: 1
Before Sampling Seed Data Size: 19


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.4943097476496784
Max= -1.678443836466671e-33
Index= 0
Unlabelled List Before: 1
Seed List Before: 19
Seed List After: 20
Unlabelled List After: 0


In [7]:
seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:20]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))

Initial Seed set size = 5
Unlabelled dataset size = 15


In [8]:
def uncertainity_based_and_train_test(seed_set, unlabelled_list, batch_size):
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    
    seed_X=[]
    seed_y=[]

    for tup in seed_list:
        seed_X.append(list(tup[0]))
        seed_y.append(list(tup[1]))
    
    ulabelled_X=[]
    ulabelled_y=[]
    
    for tup in unlabelled_list:
        ulabelled_X.append(list(tup[0]))
        ulabelled_y.append(list(tup[1]))
        
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    clf.fit(seed_X, seed_y)
    
    test_X=[]
    test_y=[]

    for tup in testList:
        test_X.append(list(tup[0]))
        test_y.append(list(tup[1]))
    
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(test_y, clf.predict(test_X))))
    predictions_label_wise=np.array(clf.predict_proba(ulabelled_X))
    uncertainity_list = list(1-predictions_label_wise.max(axis=1))
    max_index = uncertainity_list.index(max(uncertainity_list))
    print("Initial seed size"+str(len(seed_list)))
    print("Initial unlabelled_list size"+str(len(unlabelled_list)))
    seed_list.append(unlabelled_list[max_index])
    del unlabelled_list[max_index]
    print("Final unlabelled_list size"+str(len(unlabelled_list)))
    print("Final seed size"+str(len(seed_list)))
    #print(max_index)

In [9]:
for i in range(len(unlabelled_list)):
    uncertainity_based_and_train_test(seed_list, unlabelled_list, 1)

Before Sampling Unlabelled Data Size: 15
Before Sampling Seed Data Size: 5
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size5
Initial unlabelled_list size15
Final unlabelled_list size14
Final seed size6
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 14
Before Sampling Seed Data Size: 6
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size6
Initial unlabelled_list size14
Final unlabelled_list size13
Final seed size7
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 13
Before Sampling Seed Data Size: 7
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size7
Initial unlabelled_list size13
Final unlabelled_list size12
Final seed size8
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 12
Before Sampling Seed Data Size: 8
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size8
Initial unlabelled_list size12
Final unlabelled_list size11
Final seed size9
0
Before Sampling Unlabelled Data Size: 11
Before Sampling Seed Data Size: 9
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size9
Initial unlabelled_list size11
Final unlabelled_list size10
Final seed size10
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 10
Before Sampling Seed Data Size: 10
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size10
Initial unlabelled_list size10
Final unlabelled_list size9
Final seed size11
0
Before Sampling Unlabelled Data Size: 9
Before Sampling Seed Data Size: 11
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size11
Initial unlabelled_list size9
Final unlabelled_list size8
Final seed size12
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 8
Before Sampling Seed Data Size: 12
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size12
Initial unlabelled_list size8
Final unlabelled_list size7
Final seed size13
0
Before Sampling Unlabelled Data Size: 7

  y = column_or_1d(y, warn=True)



Before Sampling Seed Data Size: 13
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size13
Initial unlabelled_list size7
Final unlabelled_list size6
Final seed size14
0
Before Sampling Unlabelled Data Size: 6
Before Sampling Seed Data Size: 14
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size14
Initial unlabelled_list size6
Final unlabelled_list size5
Final seed size15
0
Before Sampling Unlabelled Data Size: 5

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



Before Sampling Seed Data Size: 15
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size15
Initial unlabelled_list size5
Final unlabelled_list size4
Final seed size16
0
Before Sampling Unlabelled Data Size: 4
Before Sampling Seed Data Size: 16
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size16
Initial unlabelled_list size4
Final unlabelled_list size3
Final seed size17
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 3
Before Sampling Seed Data Size: 17
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size17
Initial unlabelled_list size3
Final unlabelled_list size2
Final seed size18
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 2
Before Sampling Seed Data Size: 18
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size18
Initial unlabelled_list size2
Final unlabelled_list size1
Final seed size19
0
Before Sampling Unlabelled Data Size: 1
Before Sampling Seed Data Size: 19
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size19
Initial unlabelled_list size1
Final unlabelled_list size0
Final seed size20
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
