In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import sklearn
from sklearn.metrics import roc_auc_score
import random
random.seed(0)

In [2]:
t0 = time.time()
train_samples = 60000
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_samples, test_size=10000)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("==================")
X_train=list(X_train)
X_test=list(X_test)
print(len(X_train))
print(len(X_test))
print("==================")

60000
10000


In [3]:
templist = []
for tup in zip(X_train, y_train):
    if(tup[1]=='8' or tup[1]=='3'):
        templist.append(list(tup))
    
testList = []
for tup in zip(X_test, y_test):
    if(tup[1]=='8' or tup[1]=='3'):
        testList.append(list(tup))
    
print("Initial Training set size = "+str(len(templist)))
print("Initial Test set size = "+str(len(testList)))

Initial Training set size = 11958
Initial Test set size = 2008


In [9]:
X_test=[]
y_test=[]

for tup in testList:
	X_test.append(tup[0])
	y_test.append(tup[1])


random.shuffle(templist)

seed_size=int(0.1*len(templist))
#seed_size=5

seed_list=templist[0:seed_size]
print("Initial Seed set size = "+str(len(seed_list)))
 
unlabelled_list=templist[seed_size:]
print("Unlabelled dataset size = "+str(len(unlabelled_list)))


Initial Seed set size = 1195
Unlabelled dataset size = 10763


In [5]:
unlabelled_list[2][1]

'3'

In [6]:
def random_instance_generate(seed_set, unlabelled_list, batch_size):
    random_elements=[]
    print("Before Sampling Unlabelled Data Size: "+str(len(unlabelled_list)))
    print("Before Sampling Seed Data Size: "+str(len(seed_set)))
    random.shuffle(unlabelled_list)
    for i in range(batch_size):
        rand_elem = random.choice(unlabelled_list)
        random_elements.append(rand_elem)
    for elem in random_elements:
        seed_set.append(elem)
        ul=[]
    for i in random_elements:
        ul.append(list(i[0]))
        
    new_unlabelled=[]
    for elem in unlabelled_list:
        if(list(elem[0]) in ul):
            continue
        new_unlabelled.append(elem)
    print("After Sampling Unlabelled Data Size : "+str(len(new_unlabelled)))
    print("After Sampling Seed Data Size : "+str(len(seed_set)))
    return new_unlabelled, seed_set

In [7]:
def train_model(data, X_test, y_test):
    print("--------------------Start------------------")
    clf = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
    print("Training with seed size: "+str(len(data)))
    random.shuffle(data)
    X_train=[]
    y_train=[]
    for i in range(len(data)):
        try:
            #print(data[i][1])
            X_train.append(list(data[i][0]))
            y_train.append(data[i][1])
        except:
            pass
        
    print(type(X_train[-1]))
    clf.fit(X_train, y_train)
    print("Accuracy on the Test Set is : "+str(sklearn.metrics.accuracy_score(y_test, clf.predict(X_test))))
    #print("--------------------End------------------")
    return(clf)


In [8]:

batch_size = 100
stopping_criteria = int(len(unlabelled_list)/100)
print(len(X_test))
train_model(seed_list, X_test, y_test)

for i in range(stopping_criteria):
    print("Seed Set Size in Iteration " + str(i) +" is :" + str(len(seed_list)))
    unlabelled_list, seed_list = random_instance_generate(seed_list, unlabelled_list, batch_size)
    train_model(seed_list, X_test, y_test)
   

2008
--------------------Start------------------
Training with seed size: 5
<class 'list'>
Accuracy on the Test Set is : 0.4960159362549801
Seed Set Size in Iteration 0 is :5
Before Sampling Unlabelled Data Size: 11953
Before Sampling Seed Data Size: 5
After Sampling Unlabelled Data Size : 11855
After Sampling Seed Data Size : 105
--------------------Start------------------
Training with seed size: 105
<class 'list'>
Accuracy on the Test Set is : 0.4960159362549801
Seed Set Size in Iteration 1 is :105
Before Sampling Unlabelled Data Size: 11855
Before Sampling Seed Data Size: 105
After Sampling Unlabelled Data Size : 11755
After Sampling Seed Data Size : 205
--------------------Start------------------
Training with seed size: 205
<class 'list'>
Accuracy on the Test Set is : 0.4960159362549801
Seed Set Size in Iteration 2 is :205
Before Sampling Unlabelled Data Size: 11755
Before Sampling Seed Data Size: 205


KeyboardInterrupt: 