In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import sklearn
from sklearn.metrics import roc_auc_score
import random
random.seed(0)

In [3]:
t0 = time.time()
train_samples = 60000
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_samples, test_size=10000)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("==================")
X_train=list(X_train)
X_test=list(X_test)
print(len(X_train))
print(len(X_test))
print("==================")

60000
10000


In [4]:
templist = []
for tup in zip(X_train, y_train):
    if(tup[1]=='8' or tup[1]=='3'):
        templist.append(list(tup))
    
testList = []
for tup in zip(X_test, y_test):
    if(tup[1]=='8' or tup[1]=='3'):
        testList.append(list(tup))
    
print("Initial Training set size = "+str(len(templist)))
print("Initial Test set size = "+str(len(testList)))

Initial Training set size = 11992
Initial Test set size = 1974


In [5]:
X_test=[]
y_test=[]

for tup in testList:
	X_test.append(tup[0])
	y_test.append(tup[1])


random.shuffle(templist)

seed_size=int(0.1*len(templist))
#seed_size=5

seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))


Initial Seed set size = 1199
Unlabelled dataset size = 10793


In [10]:
unlabelled_list[5][1]

'3'

In [11]:
def random_instance_generate(seed_set, unlabelled_list, batch_size):
    random_elements=[]
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    random.shuffle(unlabelled_list)
    for i in range(batch_size):
        rand_elem = random.choice(unlabelled_list)
        random_elements.append(rand_elem)
    for elem in random_elements:
        seed_set.append(elem)
        ul=[]
    for i in random_elements:
        ul.append(list(i[0]))
        
    new_unlabelled=[]
    for elem in unlabelled_list:
        if(list(elem[0]) in ul):
            continue
        new_unlabelled.append(elem)
    print("After Sampling Unlabelled Data Size : "+str(len(new_unlabelled)))
    print("After Sampling Seed Data Size : "+str(len(seed_set)))
    return new_unlabelled, seed_set

In [21]:
import numpy as np
import numpy.random as npr

def select_random_from_unlabeled(unlabelled_list, batch_size):
    selected = set(npr.choice(len(unlabelled_list), batch_size, replace=False))
    new_unlabeled = []
    
    i=0
    for x in unlabelled_list:
        if not i in selected:
            new_unlabeled.append(x)
        i=i+1
    return new_unlabeled

In [22]:
def train_model(data, X_test, y_test):
    print("--------------------Start------------------")
    clf = LogisticRegression(C=50. / train_samples, penalty='l2', solver='saga', tol=0.1)
    print("Training with seed size: "+str(len(data)))
    random.shuffle(data)
    X_train=[]
    y_train=[]
    for i in range(len(data)):
        try:
            #print(data[i][1])
            X_train.append(list(data[i][0]))
            y_train.append(data[i][1])
        except:
            pass
        
    print(type(X_train[-1]))
    clf.fit(X_train, y_train)
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(y_test, clf.predict(X_test))))
    #print("--------------------End------------------")
    return(clf)


In [23]:

batch_size = 1000
#stopping_criteria = int(len(unlabelled_list)/100)
print(len(X_test))
train_model(seed_list, X_test, y_test)

niters = 2

for i in range(niters):
    print("Seed Set Size in Iteration " + str(i) +" is :" + str(len(seed_list)))
    #unlabelled_list, seed_list = random_instance_generate(seed_list, unlabelled_list, batch_size)    
    unlabelled_list = select_random_from_unlabeled(unlabelled_list, batch_size)
    seed_list = seed_list + unlabelled_list
    
    train_model(seed_list, X_test, y_test)
   

1974
--------------------Start------------------
Training with seed size: 8599
<class 'list'>
Accuracy on the Test Set is : 0.9432624113475178
Seed Set Size in Iteration 0 is :8599
--------------------Start------------------
Training with seed size: 11416
<class 'list'>
Accuracy on the Test Set is : 0.9468085106382979
Seed Set Size in Iteration 1 is :11416
--------------------Start------------------
Training with seed size: 13233
<class 'list'>
Accuracy on the Test Set is : 0.9478216818642351
