In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import sklearn
from sklearn.metrics import roc_auc_score
import random
random.seed(0)

In [2]:
t0 = time.time()
train_samples = 60000
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_samples, test_size=10000)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("==================")
X_train=list(X_train)
X_test=list(X_test)
print(len(X_train))
print(len(X_test))
print("==================")

60000
10000


In [25]:
templist = []
for tup in zip(X_train, y_train):
    if(tup[1]=='8' or tup[1]=='3'):
        templist.append(list(tup))
    
testList = []
for tup in zip(X_test, y_test):
    if(tup[1]=='8' or tup[1]=='3'):
        testList.append(list(tup))
    
print("Initial Training set size = "+str(len(templist)))
print("Initial Test set size = "+str(len(testList)))

Initial Training set size = 11885
Initial Test set size = 2081


In [41]:
X_test=[]
y_test=[]

for tup in testList:
	X_test.append(tup[0])
	y_test.append(tup[1])


random.shuffle(templist)

#seed_size=int(0.1*len(templist))
seed_size=5

seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:20]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))

Initial Seed set size = 5
Unlabelled dataset size = 15


In [42]:
import numpy as np
import lime
import lime.lime_tabular


def explanation_generation_based_and_train_test(seed_set, unlabelled_list, batch_size):
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    
    feature_names=[]

    for i in range(784):
        feature='feature_'+str(i)
        feature_names.append(feature)
    

    class_names=[]
    classes=['3','8']

    for i in classes:
        class_1='class_'+str(i)
        class_names.append(class_1)
        
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    
    seed_X=[]
    seed_y=[]

    for tup in seed_list:
        seed_X.append(list(tup[0]))
        seed_y.append(list(tup[1]))
        
    test_X=[]
    test_y=[]

    for tup in testList:
        test_X.append(list(tup[0]))
        test_y.append(list(tup[1]))
    
    clf.fit(seed_X, seed_y)
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(test_y, clf.predict(test_X))))
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(seed_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    total_np_array=[]
    num_features=784
    for item in seed_list:
        #print(item)
        exp = explainer.explain_instance(np.array(item[0]), clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        total_np_array.append(vectors[0])
        
    
    seed_mean = np.mean(np.array(total_np_array), axis=0)
    
    ulabelled_X=[]
    ulabelled_y=[]

    for tup in unlabelled_list:
        ulabelled_X.append(list(tup[0]))
        ulabelled_y.append(list(tup[1]))
        
    import random
    random_batch=[]
    for i in range(batch_size):
        elem=random.choice(unlabelled_list)
        random_batch.append(elem)
        
    #print(random_batch[1][1])
    
    new_x0=[]
    new_y0=[]
    
    for tup in random_batch:
        new_x0.append(list(tup[0]))
        new_y0.append(list('3'))
    
    #print(new_y0)
    
    new_x1=[]
    new_y1=[]
    
    for tup in random_batch:
        new_x1.append(list(tup[0]))
        new_y1.append(list('8'))
    
    #print(new_y0[0], new_y1[0])
    
    ## Preparing Seed X0
    
    seed_x0=[]
    seed_y0=[]
    
    for tup in seed_list:
        seed_x0.append(list(tup[0]))
        seed_y0.append(list(tup[1]))
    
    #print(seed_y0[-1])
    
    
    for x in new_x0:
        seed_x0.append(x)
        
    for y in new_y0:
        seed_y0.append(y)
        
    print(seed_y0)
    
    
    ## Preparing Seed X1
    
    seed_x1=[]
    seed_y1=[]
    
    for tup in seed_list:
        seed_x1.append(list(tup[0]))
        seed_y1.append(list(tup[1]))
    
    for x in new_x1:
        seed_x1.append(x)
        
    for y in new_y1:
        seed_y1.append(y)
        
        
        
    print(seed_y1)
    
    clf0 = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    clf0.fit(seed_x0, seed_y0)
    
    clf1 = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    clf1.fit(seed_x1, seed_y1)
    
    explainer0 = lime.lime_tabular.LimeTabularExplainer(np.array(seed_x0), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    explainer1 = lime.lime_tabular.LimeTabularExplainer(np.array(seed_x1), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    # Mean for LIME(M(S0),x)
    
    total_np_array_0=[]
    num_features=784
    for item in seed_x0:
        exp = explainer0.explain_instance(np.array(item), clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        total_np_array_0.append(vectors[0])
        
    
    seed_mean0 = np.mean(np.array(total_np_array_0), axis=0)
    
    
    # Mean for LIME(M(S1),x)
    
    total_np_array_1=[]
    num_features=784
    for item in seed_x1:
        exp = explainer0.explain_instance(np.array(item), clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        total_np_array_1.append(vectors[0])
        
    
    seed_mean1 = np.mean(np.array(total_np_array_1), axis=0)
    
    # Computing the maximum of del[LIME(M(S0),x)] and del[LIME(M(S1),x)]
    
    delMO = np.subtract(np.array(seed_mean0), np.array(seed_mean))
    delM1 = np.subtract(np.array(seed_mean1), np.array(seed_mean))
    
    
    if(np.mean(delMO) > np.mean(delM1)):
        final_seed_x=seed_x0
        final_seed_y=seed_y0
    else:
        final_seed_x=seed_x1
        final_seed_y=seed_y1
        
    #print(unlabelled_list.index(final_seed_x))
    final_seed=[]
    
    
    print(len(ulabelled_X))
    for elem in zip(final_seed_x, final_seed_y):
        final_seed.append(elem)
        
        i=0
        index=[]
        
        for index, item in enumerate(ulabelled_X):
            if(item==elem[0]):
                print("Found at index"+str(index))
                del ulabelled_X[index]
                del ulabelled_y[index]
                
    
    print(len(ulabelled_X))
    
    new_unlabelled_list = []
    for elem in zip(ulabelled_X, ulabelled_y):
        new_unlabelled_list.append(elem)
        
    return final_seed, new_unlabelled_list
    
    '''
    
    explainer = lime.lime_tabular.LimeTabularExplainer(np.array(ulabelled_X), feature_names=feature_names, class_names=class_names, discretize_continuous=False)
    
    sums=[]
    for item in unlabelled_list:
        exp = explainer.explain_instance(item[0], clf.predict_proba, num_features=num_features)
        vectors = np.empty([1, num_features])
        for elem in list(exp.as_list()):
            np.append(vectors, elem[1], axis=None)
        np_vector=vectors[0]
        sums.append(np.sum(np.subtract(seed_mean, np_vector)))
        
    
    
    print("Max= "+str(max(sums)))
    print("Index= "+str(sums.index(max(sums))))
    
    print("Unlabelled List Before: "+str(len(unlabelled_list)))
    index = sums.index(max(sums))
    
    
    print("Seed List Before: "+str(len(seed_list)))
    seed_list.append(unlabelled_list[index])
    print("Seed List After: "+str(len(seed_list)))
    
    del unlabelled_list[index]
    print("Unlabelled List After: "+str(len(unlabelled_list)))'''

In [43]:
batch_size=1
for i in range(5):
    seed_list, unlabelled_list = explanation_generation_based_and_train_test(seed_list, unlabelled_list, batch_size)

Before Sampling Unlabelled Data Size: 15
Before Sampling Seed Data Size: 5
Accuracy on the Test Set is : 0.4848630466122057


  y = column_or_1d(y, warn=True)


[['3'], ['8'], ['8'], ['3'], ['3'], ['3']]
[['3'], ['8'], ['8'], ['3'], ['3'], ['8']]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


15
Found at index5
14
Before Sampling Unlabelled Data Size: 14
Before Sampling Seed Data Size: 6
Accuracy on the Test Set is : 0.4848630466122057


  y = column_or_1d(y, warn=True)


[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3']]
[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['8']]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


14
Found at index6
13
Before Sampling Unlabelled Data Size: 13
Before Sampling Seed Data Size: 7
Accuracy on the Test Set is : 0.4848630466122057


  y = column_or_1d(y, warn=True)


[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['3']]
[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['8']]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


13
Found at index3
12
Before Sampling Unlabelled Data Size: 12
Before Sampling Seed Data Size: 8
Accuracy on the Test Set is : 0.5151369533877943


  y = column_or_1d(y, warn=True)


[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['8'], ['3']]
[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['8'], ['8']]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


12
Found at index7
11
Before Sampling Unlabelled Data Size: 11
Before Sampling Seed Data Size: 9
Accuracy on the Test Set is : 0.5151369533877943


  y = column_or_1d(y, warn=True)


[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['8'], ['3'], ['3']]
[['3'], ['8'], ['8'], ['3'], ['3'], ['8'], ['3'], ['8'], ['3'], ['8']]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


11
Found at index5
10
