In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import sklearn
from sklearn.metrics import roc_auc_score
import random
random.seed(0)

In [2]:
t0 = time.time()
train_samples = 60000
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_samples, test_size=10000)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("==================")
X_train=list(X_train)
X_test=list(X_test)
print(len(X_train))
print(len(X_test))
print("==================")

60000
10000


In [3]:
templist = []
for tup in zip(X_train, y_train):
    if(tup[1]=='8' or tup[1]=='3'):
        templist.append(list(tup))
    
testList = []
for tup in zip(X_test, y_test):
    if(tup[1]=='8' or tup[1]=='3'):
        testList.append(list(tup))
    
print("Initial Training set size = "+str(len(templist)))
print("Initial Test set size = "+str(len(testList)))

Initial Training set size = 11989
Initial Test set size = 1977


In [4]:
X_test=[]
y_test=[]

for tup in testList:
	X_test.append(tup[0])
	y_test.append(tup[1])


random.shuffle(templist)

#seed_size=int(0.1*len(templist))
seed_size=5

seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:20]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))

Initial Seed set size = 5
Unlabelled dataset size = 15


In [5]:
import numpy as np
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt

def explanation_generation_based_and_train_test(seed_set, unlabelled_list, batch_size):
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    
    feature_names=[]

    for i in range(784):
        feature='feature_'+str(i)
        feature_names.append(feature)
    

    class_names=[]
    classes=['3','8']

    for i in classes:
        class_1='class_'+str(i)
        class_names.append(class_1)
        
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    
    seed_X=[]
    seed_y=[]

    for tup in seed_list:
        seed_X.append(list(tup[0]))
        seed_y.append(list(tup[1]))
        
    test_X=[]
    test_y=[]

    for tup in testList:
        test_X.append(list(tup[0]))
        test_y.append(list(tup[1]))
    
    clf.fit(seed_X, seed_y)
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(test_y, clf.predict(test_X))))
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(seed_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    total_np_array=[]
    num_features=784
    for item in seed_list:
        #print(item)
        exp = explainer.explain_instance(item[0], clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        total_np_array.append(vectors[0])
        
    
    seed_mean = np.mean(np.array(total_np_array), axis=0)
    #reshaped = seed_mean.reshape((28, 28))
    print(seed_mean)
    #plt.imshow(reshaped, cmap='hot')
    
    ulabelled_X=[]
    ulabelled_y=[]

    for tup in unlabelled_list:
        ulabelled_X.append(list(tup[0]))
        ulabelled_y.append(list(tup[1]))
        
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(ulabelled_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    sums=[]
    for item in unlabelled_list:
        exp = explainer.explain_instance(item[0], clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        np_vector=vectors[0]
        sums.append(np.sum(np.subtract(seed_mean, np_vector)))
        
    
    
    print("Max= "+str(max(sums)))
    print("Index= "+str(sums.index(max(sums))))
    
    print("Unlabelled List Before: "+str(len(unlabelled_list)))
    index = sums.index(max(sums))
    
    
    print("Seed List Before: "+str(len(seed_list)))
    seed_list.append(unlabelled_list[index])
    print("Seed List After: "+str(len(seed_list)))
    
    del unlabelled_list[index]
    print("Unlabelled List After: "+str(len(unlabelled_list)))

In [6]:
for i in range(len(unlabelled_list)):
    explanation_generation_based_and_train_test(seed_list, unlabelled_list, 1)

Before Sampling Unlabelled Data Size: 15
Before Sampling Seed Data Size: 5
Accuracy on the Test Set is : 0.5103692463328275


  y = column_or_1d(y, warn=True)


[6.89860742e-310 6.45356050e-310 3.73869829e-310 3.73869829e-310
 4.13913571e-310 4.13913570e-310 4.13913571e-310 4.13913571e-310
 4.13913570e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913570e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913570e-310
 4.13913570e-310 4.13913570e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.13913571e-310 4.13913571e-310 4.13913571e-310
 4.13913571e-310 4.139135

Max= 2.4393732161356975e-32
Index= 10
Unlabelled List Before: 15
Seed List Before: 5
Seed List After: 6
Unlabelled List After: 14
Before Sampling Unlabelled Data Size: 14
Before Sampling Seed Data Size: 6


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.48963075366717246
[5.78599014e-310 5.41511765e-310 2.33668643e-310 2.33668643e-310
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666667e-001 1.66666667e-001
 1.66666667e-001 1.66666667e-001 1.66666

Max= 130.0
Index= 0
Unlabelled List Before: 14
Seed List Before: 6
Seed List After: 7
Unlabelled List After: 13
Before Sampling Unlabelled Data Size: 13
Before Sampling Seed Data Size: 7
Accuracy on the Test Set is : 0.48963075366717246


  y = column_or_1d(y, warn=True)


[6.26282612e-310 5.30915392e-310 2.67049869e-310 2.67049869e-310
 6.89856146e-310 6.89856146e-310 6.89856146e-310 6.89856106e-310
 6.89856146e-310 6.89856146e-310 6.89856194e-310 6.89856146e-310
 6.89856147e-310 6.89856146e-310 6.89856146e-310 6.89856146e-310
 6.89856146e-310 6.89856106e-310 6.89856146e-310 6.89856082e-310
 6.89856146e-310 6.89856106e-310 6.89856146e-310 6.89856146e-310
 6.89856146e-310 6.89856146e-310 6.89856146e-310 6.89856146e-310
 6.89856146e-310 6.89856146e-310 6.89856194e-310 6.89856083e-310
 6.89856146e-310 6.89856146e-310 6.89856146e-310 6.89856146e-310
 6.89856146e-310 6.89856146e-310 6.89856147e-310 6.89856082e-310
 6.89856147e-310 6.89856083e-310 6.89856107e-310 6.89856147e-310
 6.89856147e-310 6.89856106e-310 6.89856146e-310 6.89856146e-310
 6.89856146e-310 6.89856106e-310 6.89856146e-310 6.89856194e-310
 6.89856146e-310 6.89856106e-310 6.89856182e-310 6.89856147e-310
 6.89856146e-310 6.89856147e-310 6.89856146e-310 6.89856081e-310
 6.89856083e-310 6.898561

Max= 5.386219331848072e-307
Index= 4
Unlabelled List Before: 13
Seed List Before: 7
Seed List After: 8
Unlabelled List After: 12
Before Sampling Unlabelled Data Size: 12
Before Sampling Seed Data Size: 8


  y = column_or_1d(y, warn=True)


Accuracy on the Test Set is : 0.48963075366717246
[6.06414447e-310 5.50783607e-310 3.50502955e-310 3.50502950e-310
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 1.25000000e-001 1.25000000e-001 1.25000000e-001 1.25000000e-001
 4.31160113e-310 4.31160113e-310 5.17392156e-310 5.17392100e-310
 5.17392155e-310 5.17392155e-310 5.17392155e-310 5.17392197e-310
 5.17392121e-310 5.17392156e-310 5.17392156e-310 5.17392639e-310
 5.17392156e-310 5.17392155e-310 5.17392638e-310 5.17392197e-310
 5.17392155e-310 5.17392155e-310 5.17392

Max= 4.5
Index= 0
Unlabelled List Before: 12
Seed List Before: 8
Seed List After: 9
Unlabelled List After: 11
Before Sampling Unlabelled Data Size: 11
Before Sampling Seed Data Size: 9
Accuracy on the Test Set is : 0.48963075366717246


  y = column_or_1d(y, warn=True)


[ 6.65135913e-310  5.90961427e-310  3.11558182e-310  3.11558182e-310
  1.81019476e-003 -1.89027499e-003  2.20286185e-004  9.31035148e-004
 -3.29088454e-003 -1.21224593e-003  1.33809409e-003  2.07104468e-003
  9.86619735e-004  5.78810613e-004 -1.73950253e-004 -1.19011028e-004
 -2.92642615e-004  7.77747361e-004 -9.61010680e-004 -2.76567514e-004
 -1.53826842e-003 -2.26556832e-003 -9.14363806e-004 -1.08711990e-003
 -1.79532454e-003  1.78579242e-004  2.02344084e-003  6.10196319e-005
  2.34186803e-003 -2.46146235e-003  9.37404499e-004 -2.58000396e-003
  4.36021544e-003  5.41956071e-005 -3.57380577e-004 -1.58455971e-003
 -1.00662787e-003  2.50148888e-004  5.25080717e-004 -4.44990434e-004
 -1.20811230e-004 -1.71413696e-007 -1.91334329e-003 -3.25979934e-005
  1.02552781e-003  1.78372988e-003 -2.29666846e-003 -3.06466411e-004
 -3.77411887e-004 -3.90059694e-006 -2.34428576e-003  4.34177141e-004
  2.06095865e-005  2.40545836e-004 -8.30200320e-004 -8.37183081e-004
 -8.06923106e-004  8.37328218e-004

Max= -0.05949204442197228
Index= 0
Unlabelled List Before: 11
Seed List Before: 9
Seed List After: 10
Unlabelled List After: 10
Before Sampling Unlabelled Data Size: 10
Before Sampling Seed Data Size: 10
Accuracy on the Test Set is : 0.5103692463328275


  y = column_or_1d(y, warn=True)


[ 6.67608392e-310  6.67608392e-310  4.20603544e-310  4.20603548e-310
 -3.37918452e-003  4.47443797e-004 -2.18345516e-003 -1.56377003e-003
 -7.02118549e-003 -3.94770111e-003  1.36871939e-004 -2.73493863e-003
  2.89307878e-003 -2.67206726e-003 -4.31320977e-004  2.99977209e-003
  3.59364494e-003 -4.61608213e-003 -4.62972126e-004 -1.74821103e-004
 -4.15551316e-003  4.43428807e-003  3.42413730e-005  4.40714403e-004
  1.72552221e-003 -2.19531823e-003  2.72829470e-003 -2.23508485e-003
 -3.44591778e-004 -1.36244611e-003  1.07051673e-003  1.57297602e-003
  3.79708519e-004  8.34471339e-004  7.67927401e-004 -1.35150036e-003
 -4.84103325e-003 -8.50631696e-004  3.14506282e-006  2.72251271e-003
  1.71557284e-003  7.84935998e-003  1.35172767e-003  6.71629483e-004
 -3.28815247e-003 -5.00923893e-003  1.31621026e-003  4.81047958e-004
  8.84524731e-004 -1.78600462e-003  1.44125783e-003  4.01647236e-003
  1.48769381e-003 -7.38241011e-004  4.79382209e-003 -3.14595710e-003
  3.78678955e-003  6.14423549e-004

Max= -0.08134224626612435
Index= 0
Unlabelled List Before: 10
Seed List Before: 10
Seed List After: 11
Unlabelled List After: 9
Before Sampling Unlabelled Data Size: 9
Before Sampling Seed Data Size: 11
Accuracy on the Test Set is : 0.48963075366717246


  y = column_or_1d(y, warn=True)


[6.49401924e-310 6.08943117e-310 3.39881646e-310 3.39881654e-310
 5.64427718e-310 5.64427718e-310 5.64427693e-310 5.64427719e-310
 5.64427693e-310 5.64427718e-310 5.64427718e-310 5.64427718e-310
 5.64427718e-310 5.64427718e-310 5.64427718e-310 5.64427718e-310
 5.64427718e-310 5.64427718e-310 5.64427678e-310 5.64427718e-310
 5.64427718e-310 5.64427718e-310 5.64427718e-310 5.64427718e-310
 5.64427719e-310 5.64427693e-310 5.64427718e-310 5.64427693e-310
 5.64427718e-310 5.64427693e-310 5.64427718e-310 5.64427751e-310
 5.64427693e-310 5.64427773e-310 5.64427718e-310 5.64427718e-310
 5.64427719e-310 5.64427718e-310 5.64427828e-310 5.64427773e-310
 5.64427773e-310 5.64427803e-310 5.64427773e-310 5.64427827e-310
 5.64427828e-310 5.64427827e-310 5.64427802e-310 5.64427802e-310
 5.64427827e-310 5.64427827e-310 5.64427827e-310 5.64427827e-310
 5.64427827e-310 5.64427875e-310 5.64427875e-310 5.64427939e-310
 5.64427939e-310 5.64427939e-310 5.64427939e-310 5.64427939e-310
 5.64427939e-310 5.644279

KeyboardInterrupt: 

In [None]:
seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:20]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))

In [8]:
def uncertainity_based_and_train_test(seed_set, unlabelled_list, batch_size):
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    
    seed_X=[]
    seed_y=[]

    for tup in seed_list:
        seed_X.append(list(tup[0]))
        seed_y.append(list(tup[1]))
    
    ulabelled_X=[]
    ulabelled_y=[]
    
    for tup in unlabelled_list:
        ulabelled_X.append(list(tup[0]))
        ulabelled_y.append(list(tup[1]))
        
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    clf.fit(seed_X, seed_y)
    
    test_X=[]
    test_y=[]

    for tup in testList:
        test_X.append(list(tup[0]))
        test_y.append(list(tup[1]))
    
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(test_y, clf.predict(test_X))))
    predictions_label_wise=np.array(clf.predict_proba(ulabelled_X))
    uncertainity_list = list(1-predictions_label_wise.max(axis=1))
    max_index = uncertainity_list.index(max(uncertainity_list))
    print("Initial seed size"+str(len(seed_list)))
    print("Initial unlabelled_list size"+str(len(unlabelled_list)))
    seed_list.append(unlabelled_list[max_index])
    del unlabelled_list[max_index]
    print("Final unlabelled_list size"+str(len(unlabelled_list)))
    print("Final seed size"+str(len(seed_list)))
    #print(max_index)

In [9]:
for i in range(len(unlabelled_list)):
    uncertainity_based_and_train_test(seed_list, unlabelled_list, 1)

Before Sampling Unlabelled Data Size: 15
Before Sampling Seed Data Size: 5
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size5
Initial unlabelled_list size15
Final unlabelled_list size14
Final seed size6
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 14
Before Sampling Seed Data Size: 6
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size6
Initial unlabelled_list size14
Final unlabelled_list size13
Final seed size7
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 13
Before Sampling Seed Data Size: 7
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size7
Initial unlabelled_list size13
Final unlabelled_list size12
Final seed size8
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 12
Before Sampling Seed Data Size: 8
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size8
Initial unlabelled_list size12
Final unlabelled_list size11
Final seed size9
0
Before Sampling Unlabelled Data Size: 11
Before Sampling Seed Data Size: 9
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size9
Initial unlabelled_list size11
Final unlabelled_list size10
Final seed size10
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 10
Before Sampling Seed Data Size: 10
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size10
Initial unlabelled_list size10
Final unlabelled_list size9
Final seed size11
0
Before Sampling Unlabelled Data Size: 9
Before Sampling Seed Data Size: 11
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size11
Initial unlabelled_list size9
Final unlabelled_list size8
Final seed size12
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 8
Before Sampling Seed Data Size: 12
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size12
Initial unlabelled_list size8
Final unlabelled_list size7
Final seed size13
0
Before Sampling Unlabelled Data Size: 7

  y = column_or_1d(y, warn=True)



Before Sampling Seed Data Size: 13
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size13
Initial unlabelled_list size7
Final unlabelled_list size6
Final seed size14
0
Before Sampling Unlabelled Data Size: 6
Before Sampling Seed Data Size: 14
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size14
Initial unlabelled_list size6
Final unlabelled_list size5
Final seed size15
0
Before Sampling Unlabelled Data Size: 5

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



Before Sampling Seed Data Size: 15
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size15
Initial unlabelled_list size5
Final unlabelled_list size4
Final seed size16
0
Before Sampling Unlabelled Data Size: 4
Before Sampling Seed Data Size: 16
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size16
Initial unlabelled_list size4
Final unlabelled_list size3
Final seed size17
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 3
Before Sampling Seed Data Size: 17
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size17
Initial unlabelled_list size3
Final unlabelled_list size2
Final seed size18
0


  y = column_or_1d(y, warn=True)


Before Sampling Unlabelled Data Size: 2
Before Sampling Seed Data Size: 18
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size18
Initial unlabelled_list size2
Final unlabelled_list size1
Final seed size19
0
Before Sampling Unlabelled Data Size: 1
Before Sampling Seed Data Size: 19
Accuracy on the Test Set is : 0.4943097476496784
Initial seed size19
Initial unlabelled_list size1
Final unlabelled_list size0
Final seed size20
0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
