# TensorFlow Semi-supervised Self-training Classification with mnist Dataset

In [1]:
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.13.1


## Get data

In [2]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

In [3]:
print("x_train.shape = {}".format(x_train.shape))
print("y_train.shape = {}".format(y_train.shape))
print("x_test.shape = {}".format(x_test.shape))
print("y_test.shape = {}".format(y_test.shape))

x_train.shape = (60000, 28, 28)
y_train.shape = (60000,)
x_test.shape = (10000, 28, 28)
y_test.shape = (10000,)


In [4]:
HEIGHT = 28
WIDTH = 28
NCLASSES = 10

In [5]:
y_train = np.eye(N = NCLASSES)[y_train]
y_test = np.eye(N = NCLASSES)[y_test]

In [6]:
print("x_train.shape = {}".format(x_train.shape))
print("y_train.shape = {}".format(y_train.shape))
print("x_test.shape = {}".format(x_test.shape))
print("y_test.shape = {}".format(y_test.shape))

x_train.shape = (60000, 28, 28)
y_train.shape = (60000, 10)
x_test.shape = (10000, 28, 28)
y_test.shape = (10000, 10)


## Create fully supervised model for comparison

In [7]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
  x = {"image": x_train},
  y = y_train,
  batch_size = 100,
  num_epochs = None,
  shuffle = True,
  queue_capacity = 5000)

eval_input_fn = tf.estimator.inputs.numpy_input_fn(
  x = {"image": x_test},
  y = y_test,
  batch_size = 100,
  num_epochs = 1,
  shuffle = False,
  queue_capacity = 5000)

In [8]:
def linear_model(img, mode, hparams):
  X = tf.reshape(tensor = img, shape = [-1,HEIGHT * WIDTH])  # flatten
  ylogits = tf.layers.dense(inputs = X, units = NCLASSES, activation = None)
  return ylogits, NCLASSES

def dnn_model(img, mode, hparams):
  X = tf.reshape(tensor = img, shape = [-1, HEIGHT * WIDTH])  # flatten
  h1 = tf.layers.dense(inputs = X, units = 300, activation = tf.nn.relu)
  h2 = tf.layers.dense(inputs = h1, units = 100, activation = tf.nn.relu)
  h3 = tf.layers.dense(inputs = h2, units = 30, activation = tf.nn.relu)
  ylogits = tf.layers.dense(inputs = h3, units = NCLASSES, activation = None)
  return ylogits, NCLASSES

def dnn_dropout_model(img, mode, hparams):
  dprob = hparams.get("dprob", 0.1)

  X = tf.reshape(tensor = img, shape = [-1, HEIGHT * WIDTH])  #flatten
  h1 = tf.layers.dense(inputs = X, units = 300, activation = tf.nn.relu)
  h2 = tf.layers.dense(inputs = h1, units = 100, activation = tf.nn.relu)
  h3 = tf.layers.dense(inputs = h2, units = 30, activation = tf.nn.relu)
  h3d = tf.layers.dropout(
    inputs = h3, 
    rate = dprob, 
    training = (mode == tf.estimator.ModeKeys.TRAIN))  # only dropout when training
  ylogits = tf.layers.dense(inputs = h3d, units = NCLASSES, activation = None)
  return ylogits, NCLASSES

def cnn_model(img, mode, hparams):
  ksize1 = hparams.get("ksize1", 5)
  ksize2 = hparams.get("ksize2", 5)
  nfil1 = hparams.get("nfil1", 10)
  nfil2 = hparams.get("nfil2", 20)
  dprob = hparams.get("dprob", 0.25)

  c1 = tf.layers.conv2d(inputs = img, filters = nfil1,
              kernel_size = ksize1, strides = 1, # ?x28x28x10
              padding = "same", activation = tf.nn.relu)
  p1 = tf.layers.max_pooling2d(inputs = c1, pool_size = 2, strides = 2)  # ?x14x14x10
  c2 = tf.layers.conv2d(inputs = p1, filters = nfil2,
              kernel_size = ksize2, strides = 1, 
              padding = "same", activation = tf.nn.relu)
  p2 = tf.layers.max_pooling2d(inputs = c2, pool_size = 2, strides = 2)  # ?x7x7x20
  
  outlen = p2.shape[1] * p2.shape[2] * p2.shape[3] #980
  p2flat = tf.reshape(tensor = p2, shape = [-1, outlen]) # flattened

  # Apply batch normalization
  if hparams["batch_norm"]:
    h3 = tf.layers.dense(inputs = p2flat, units = 300, activation = None)
    h3 = tf.layers.batch_normalization(
      x = h3, 
      training = (mode == tf.estimator.ModeKeys.TRAIN))  # only batchnorm when training
    h3 = tf.nn.relu(x = h3)
  else:  
    h3 = tf.layers.dense(inputs = p2flat, units = 300, activation = tf.nn.relu)
  
  # Apply dropout
  h3d = tf.layers.dropout(
    inputs = h3, rate = dprob, training = (mode == tf.estimator.ModeKeys.TRAIN))

  ylogits = tf.layers.dense(inputs = h3d, units = NCLASSES, activation = None)
    
  # Apply batch normalization once more
  if hparams["batch_norm"]:
     ylogits = tf.layers.batch_normalization(
       x = ylogits, 
       training = (mode == tf.estimator.ModeKeys.TRAIN))

  return ylogits, NCLASSES

In [9]:
def image_classifier(features, labels, mode, params):
  print("\nfeatures = \n{}".format(features))
  print("labels = \n{}".format(labels))
  print("mode = \n{}".format(mode))
  print("params = \n{}".format(params))
  
  model_functions = {
    "linear":linear_model,
    "dnn":dnn_model,
    "dnn_dropout":dnn_dropout_model,
    "cnn":cnn_model}
  
  model_function = model_functions[params["model"]]  
  
  ylogits, nclasses = model_function(features["image"], mode, params)
  print("ylogits = \n{}".format(ylogits))
  probabilities = tf.nn.softmax(logits = ylogits)  # shape = (current_batch_size, NCLASSES)
  print("probabilities = \n{}".format(probabilities))
  class_ids = tf.cast(
    x = tf.argmax(
      input = probabilities, axis = 1), dtype = tf.uint8)  # shape = (current_batch_size,)
  print("class_ids = \n{}".format(class_ids))
  
  if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
    loss = tf.reduce_mean(
      input_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits = ylogits, labels = labels))
    eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(
        labels = tf.argmax(input = labels, axis = 1), predictions = class_ids)}
    if mode == tf.estimator.ModeKeys.TRAIN:
      # This is needed for batch normalization, but has no effect otherwise
      update_ops = tf.get_collection(key = tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
        train_op = tf.contrib.layers.optimize_loss(
          loss = loss, 
          global_step = tf.train.get_global_step(),
          learning_rate = params["learning_rate"], 
          optimizer = "Adam")
    else:
      train_op = None
  else:
    loss = None
    train_op = None
    eval_metric_ops = None
 
  return tf.estimator.EstimatorSpec(
    mode = mode,
    predictions = {"probabilities": probabilities, "class_ids": class_ids},
    loss = loss,
    train_op = train_op,
    eval_metric_ops = eval_metric_ops,
    export_outputs = {
      "classes": tf.estimator.export.PredictOutput(
        {"probabilities": probabilities, 
         "class_ids": class_ids})})

In [10]:
def serving_input_fn():
  # Input will be rank 3
  feature_placeholders = {
    "image": tf.placeholder(dtype = tf.float64, shape = [None, HEIGHT, WIDTH])}
  # But model function requires rank 4
  features = {
    "image": tf.expand_dims(input = feature_placeholders["image"], axis = -1)} 
  return tf.estimator.export.ServingInputReceiver(
    features = features, 
    receiver_tensors = feature_placeholders)

In [11]:
def train_and_evaluate(output_dir, hparams):
  # Ensure filewriter cache is clear for TensorBoard events file
  tf.summary.FileWriterCache.clear()
  EVAL_INTERVAL = 60

  supervised_estimator = tf.estimator.Estimator(
    model_fn = image_classifier,
    params = hparams,
    config = tf.estimator.RunConfig(
      save_checkpoints_secs = EVAL_INTERVAL),
    model_dir = output_dir)
  
  train_spec = tf.estimator.TrainSpec(
    input_fn = train_input_fn,
    max_steps = hparams["train_steps"])
  
  exporter = tf.estimator.LatestExporter(
    name = "exporter", 
    serving_input_receiver_fn = serving_input_fn)
  
  eval_spec = tf.estimator.EvalSpec(
    input_fn = eval_input_fn,
    steps = None,
    exporters = exporter,
    throttle_secs = EVAL_INTERVAL)
  
  tf.estimator.train_and_evaluate(
    estimator = supervised_estimator, 
    train_spec = train_spec, 
    eval_spec = eval_spec)
  
  return supervised_estimator

In [12]:
hparams = {}
hparams["train_batch_size"] = 100
hparams["learning_rate"] = 0.01
hparams["train_steps"] = 1000
hparams["ksize1"] = 5
hparams["ksize2"] = 5
hparams["nfil1"] = 10
hparams["nfil2"] = 20
hparams["dprob"] = 0.1
hparams["batch_norm"] = False
hparams["model"] = "linear"

In [13]:
SUPERVISED_MODEL_DIR = "supervised_trained"
shutil.rmtree(path = SUPERVISED_MODEL_DIR, ignore_errors = True)  # start fresh each time
supervised_estimator = train_and_evaluate(SUPERVISED_MODEL_DIR, hparams)

INFO:tensorflow:Using config: {'_service': None, '_device_fn': None, '_task_type': 'worker', '_evaluation_master': '', '_eval_distribute': None, '_is_chief': True, '_protocol': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_summary_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 60, '_tf_random_seed': None, '_master': '', '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_global_id_in_cluster': 0, '_model_dir': 'supervised_trained', '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5dd805bcf8>, '_task_id': 0, '_train_distribute': None, '_experimental_distribute': None}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The ev

In [14]:
eval_metrics = supervised_estimator.evaluate(input_fn = eval_input_fn, steps = None)

INFO:tensorflow:Calling model_fn.

features = 
{'image': <tf.Tensor 'fifo_queue_DequeueUpTo:1' shape=(?, 28, 28) dtype=float64>}
labels = 
Tensor("fifo_queue_DequeueUpTo:2", shape=(?, 10), dtype=float64, device=/device:CPU:0)
mode = 
eval
params = 
{'ksize1': 5, 'batch_norm': False, 'ksize2': 5, 'learning_rate': 0.01, 'train_batch_size': 100, 'train_steps': 1000, 'nfil2': 20, 'nfil1': 10, 'dprob': 0.1, 'model': 'linear'}
ylogits = 
Tensor("dense/BiasAdd:0", shape=(?, 10), dtype=float64)
probabilities = 
Tensor("Softmax:0", shape=(?, 10), dtype=float64)
class_ids = 
Tensor("Cast:0", shape=(?,), dtype=uint8)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-15T23:55:42Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from supervised_trained/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-15-23:55:43
INFO:tensorflow:Saving dict for

## Now create semi-supervised model

In [15]:
percent_labeled = 0.01

In [16]:
number_of_train_examples = x_train.shape[0]
print("number_of_train_examples = {}".format(number_of_train_examples))

number_of_train_examples = 60000


In [17]:
number_of_labeled_train_examples = int(number_of_train_examples * percent_labeled)
number_of_unlabeled_train_examples = number_of_train_examples - number_of_labeled_train_examples
print("number_of_labeled_train_examples = {} & number_of_unlabeled_train_examples = {}".format(number_of_labeled_train_examples, number_of_unlabeled_train_examples))

number_of_labeled_train_examples = 600 & number_of_unlabeled_train_examples = 59400


In [18]:
semi_supervised_labeled_x_train_original_arr = x_train[0:number_of_labeled_train_examples]
semi_supervised_labeled_y_train_original_arr = y_train[0:number_of_labeled_train_examples]
semi_supervised_unlabeled_x_train_original_arr = x_train[number_of_labeled_train_examples:]

In [19]:
print("semi_supervised_labeled_x_train_original_arr.shape = {}".format(semi_supervised_labeled_x_train_original_arr.shape))
print("semi_supervised_labeled_y_train_original_arr.shape = {}".format(semi_supervised_labeled_y_train_original_arr.shape))
print("semi_supervised_unlabeled_x_train_original_arr.shape = {}".format(semi_supervised_unlabeled_x_train_original_arr.shape))

semi_supervised_labeled_x_train_original_arr.shape = (600, 28, 28)
semi_supervised_labeled_y_train_original_arr.shape = (600, 10)
semi_supervised_unlabeled_x_train_original_arr.shape = (59400, 28, 28)


## Create semi-supervised model using sparse labels

In [20]:
SEMI_SUPERVISED_MODEL_DIR = "semi_supervised_trained"

In [21]:
EVAL_INTERVAL = 30
semi_supervised_estimator = tf.estimator.Estimator(
  model_fn = image_classifier,
  params = hparams,
  config = tf.estimator.RunConfig(
    save_checkpoints_secs = EVAL_INTERVAL),
  model_dir = SEMI_SUPERVISED_MODEL_DIR)

INFO:tensorflow:Using config: {'_service': None, '_device_fn': None, '_task_type': 'worker', '_evaluation_master': '', '_eval_distribute': None, '_is_chief': True, '_protocol': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_summary_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 30, '_tf_random_seed': None, '_master': '', '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_global_id_in_cluster': 0, '_model_dir': 'semi_supervised_trained', '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5da6e157f0>, '_task_id': 0, '_train_distribute': None, '_experimental_distribute': None}


In [22]:
confidence_threshold = 0.99

In [23]:
shutil.rmtree(path = SEMI_SUPERVISED_MODEL_DIR, ignore_errors = True) # start fresh each time

semi_supervised_labeled_x_train_arr = semi_supervised_labeled_x_train_original_arr
semi_supervised_labeled_y_train_arr = semi_supervised_labeled_y_train_original_arr
semi_supervised_unlabeled_x_train_arr = semi_supervised_unlabeled_x_train_original_arr

new_labeled_x_train_arr = np.zeros([1])

accuracy = 0.000001
old_accuracy = 0.0

loop_counter = 0
while semi_supervised_unlabeled_x_train_arr.shape[0] > 0 and new_labeled_x_train_arr.shape[0] > 0 and accuracy > old_accuracy:
  print("\nloop_counter = {}, number_of_labeled_examples = {}, number_of_unlabeled_examples = {}\n".format(loop_counter, semi_supervised_labeled_x_train_arr.shape[0], semi_supervised_unlabeled_x_train_arr.shape[0]))
  # Train on currently labeled data
  train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {"image": semi_supervised_labeled_x_train_arr}, 
    y = semi_supervised_labeled_y_train_arr, 
    batch_size = 32, 
    num_epochs = None, 
    shuffle = True)

  semi_supervised_estimator.train(
    input_fn = train_input_fn, 
    steps = 2000)


  # Check evaluation metrics on held out evaluation set now that training is over
  eval_metrics = semi_supervised_estimator.evaluate(
    input_fn = eval_input_fn, 
    steps = None)
  
  old_accuracy = accuracy
  accuracy = eval_metrics["accuracy"]

  # Now predict from the unlabeled set
  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {"image": semi_supervised_unlabeled_x_train_arr}, 
    y = None, 
    batch_size = 512, 
    num_epochs = 1, 
    shuffle = False)

  predictions = [prediction 
                 for prediction in semi_supervised_estimator.predict(
                   input_fn = predict_input_fn)]

  # Get the probabilities from the prediction list generated from the estimator
  probabilities = np.array(object = [prediction["probabilities"] for prediction in predictions])

  # Check if our predictions are above the confidence threshold
  confidence_condition = np.amax(a = probabilities, axis = 1) > confidence_threshold

  # Create array of the confidently prediction examples combining their features with the predicted probabilities
  new_labeled_x_train_arr = semi_supervised_unlabeled_x_train_arr[confidence_condition]
  new_labeled_y_train_arr = probabilities[confidence_condition]

  semi_supervised_labeled_x_train_arr = np.concatenate(
    seq = [semi_supervised_labeled_x_train_arr, new_labeled_x_train_arr], axis = 0)
  semi_supervised_labeled_y_train_arr = np.concatenate(
    seq = [semi_supervised_labeled_y_train_arr, new_labeled_y_train_arr], axis = 0)

  # Remove the confident predictions leaving only the unconfident predictions to go another round through the loop
  semi_supervised_unlabeled_x_train_arr = semi_supervised_unlabeled_x_train_arr[~confidence_condition]
  
  loop_counter += 1


loop_counter = 0, number_of_labeled_examples = 600, number_of_unlabeled_examples = 59400

INFO:tensorflow:Calling model_fn.

features = 
{'image': <tf.Tensor 'random_shuffle_queue_DequeueMany:1' shape=(32, 28, 28) dtype=float64>}
labels = 
Tensor("random_shuffle_queue_DequeueMany:2", shape=(32, 10), dtype=float64, device=/device:CPU:0)
mode = 
train
params = 
{'ksize1': 5, 'batch_norm': False, 'ksize2': 5, 'learning_rate': 0.01, 'train_batch_size': 100, 'train_steps': 1000, 'nfil2': 20, 'nfil1': 10, 'dprob': 0.1, 'model': 'linear'}
ylogits = 
Tensor("dense/BiasAdd:0", shape=(32, 10), dtype=float64)
probabilities = 
Tensor("Softmax:0", shape=(32, 10), dtype=float64)
class_ids = 
Tensor("Cast:0", shape=(32,), dtype=uint8)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into semi_supervised_trai

## Use kmeans to improve results

### First use PCA to reduce the dimensionality going into kmeans

In [24]:
number_of_dimensions = 10

s, u, v = tf.svd(
  tensor = tf.convert_to_tensor(
    value = x_train.reshape([-1, HEIGHT * WIDTH]), 
    dtype = tf.float32), 
  full_matrices = False, 
  compute_uv = True)
print("s = \n{}".format(s))
print("u = \n{}".format(u))
print("v = \n{}".format(v))

sigma = tf.diag(diagonal = s)
print("sigma = \n{}".format(sigma))

x_train_pca = tf.matmul(a = u, b = sigma[:, 0:number_of_dimensions])
print("x_train_pca = \n{}".format(x_train_pca))

with tf.Session() as sess:
  x_train_pca_arr = sess.run(fetches = x_train_pca)
print("x_train_pca_arr.shape = \n{}".format(x_train_pca_arr.shape))

s = 
Tensor("Svd:0", shape=(784,), dtype=float32)
u = 
Tensor("Svd:1", shape=(60000, 784), dtype=float32)
v = 
Tensor("Svd:2", shape=(784, 784), dtype=float32)
sigma = 
Tensor("Diag:0", shape=(784, 784), dtype=float32)
x_train_pca = 
Tensor("MatMul:0", shape=(60000, 10), dtype=float32)
x_train_pca_arr.shape = 
(60000, 10)


In [25]:
KMEANS_MODEL_DIR = "kmeans_estimator"

In [28]:
shutil.rmtree(path = KMEANS_MODEL_DIR, ignore_errors = True) # start fresh each time

def input_fn():
  return tf.train.limit_epochs(
    tensor = tf.convert_to_tensor(
      value = x_train_pca_arr, 
      dtype = tf.float32), 
    num_epochs = 1)

num_clusters = 10
kmeans = tf.contrib.factorization.KMeansClustering(
  num_clusters = num_clusters, 
  model_dir = KMEANS_MODEL_DIR,
  initial_clusters = tf.contrib.factorization.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
  use_mini_batch = True)

# Train
num_iterations = 30
previous_centers = None
for _ in range(num_iterations):
  kmeans.train(input_fn = input_fn)
  cluster_centers = kmeans.cluster_centers()
  previous_centers = cluster_centers
print("cluster centers = \n{}".format(cluster_centers))

# Map the input points to their clusters
cluster_indices = list(kmeans.predict_cluster_index(input_fn = input_fn))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_device_fn': None, '_task_type': 'worker', '_evaluation_master': '', '_eval_distribute': None, '_is_chief': True, '_protocol': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_summary_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_master': '', '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_global_id_in_cluster': 0, '_model_dir': 'kmeans_estimator', '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5da64e4390>, '_task_id': 0, '_train_distribute': None, '_experimental_distribute': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

In [29]:
cluster_indices_arr = np.array(object = cluster_indices)
cluster_indices_arr.shape

(60000,)

In [30]:
point_clusters_arr = cluster_centers[cluster_indices_arr, :]
point_clusters_arr.shape

(60000, 10)

In [31]:
squared_error_arr = (x_train_pca_arr - point_clusters_arr)**2
squared_error_arr.shape

(60000, 10)

In [32]:
squared_euclidean_distance = np.sum(a = squared_error_arr, axis = 1)
squared_euclidean_distance.shape

(60000,)

In [33]:
kmeans_df = pd.DataFrame({"cluster_id": cluster_indices_arr, 
                          "squared_euclidean_distance": squared_euclidean_distance})
kmeans_df.head()

Unnamed: 0,cluster_id,squared_euclidean_distance
0,1,9.698139
1,7,12.847395
2,8,13.683814
3,3,7.027799
4,2,4.790666


In [34]:
kmeans_df.groupby("cluster_id").count()

Unnamed: 0_level_0,squared_euclidean_distance
cluster_id,Unnamed: 1_level_1
0,7443
1,5461
2,5454
3,6929
4,7485
5,3578
6,4271
7,4006
8,7587
9,7786


In [35]:
kmeans_df.groupby("cluster_id").mean()

Unnamed: 0_level_0,squared_euclidean_distance
cluster_id,Unnamed: 1_level_1
0,14.651371
1,13.524157
2,11.01146
3,9.157743
4,9.979157
5,15.743523
6,10.246287
7,16.385509
8,12.881548
9,18.233294


In [36]:
kmeans_df.groupby("cluster_id")["squared_euclidean_distance"].nsmallest(n = 5)

cluster_id       
0           16825    1.989303
            7508     2.487936
            16783    2.785687
            12874    3.033547
            7396     3.227566
1           23143    1.664769
            36927    1.868385
            41235    2.096154
            50925    2.106042
            56141    2.164360
2           53655    1.745137
            27041    1.840623
            51757    1.871719
            36867    1.930790
            2217     1.958854
3           43121    1.112363
            22283    1.657932
            43109    1.755380
            28361    1.857489
            7329     1.910113
4           42104    1.589759
            21868    1.593555
            35828    1.790105
            14162    1.906652
            2954     2.025714
5           31095    1.750447
            22706    2.479543
            50958    2.554381
            53881    2.598932
            9197     2.809373
6           40211    1.797966
            16369    1.983479
            43019    2

In [37]:
closest_indices = np.array(
  object = kmeans_df.groupby("cluster_id")["squared_euclidean_distance"].nsmallest(n = 100).index.get_level_values(1))
closest_indices.shape

(1000,)

## Try semi-supervised again

In [38]:
semi_supervised_labeled_x_train_original_arr = x_train[closest_indices]
semi_supervised_labeled_y_train_original_arr = y_train[closest_indices]
semi_supervised_unlabeled_x_train_original_arr = x_train[np.isin(
  element = np.arange(number_of_train_examples), 
  test_elements = closest_indices, 
  assume_unique = True, 
  invert = True)]

In [39]:
print("semi_supervised_labeled_x_train_original_arr.shape = {}".format(semi_supervised_labeled_x_train_original_arr.shape))
print("semi_supervised_labeled_y_train_original_arr.shape = {}".format(semi_supervised_labeled_y_train_original_arr.shape))
print("semi_supervised_unlabeled_x_train_original_arr.shape = {}".format(semi_supervised_unlabeled_x_train_original_arr.shape))

semi_supervised_labeled_x_train_original_arr.shape = (1000, 28, 28)
semi_supervised_labeled_y_train_original_arr.shape = (1000, 10)
semi_supervised_unlabeled_x_train_original_arr.shape = (59000, 28, 28)


In [40]:
shutil.rmtree(path = SEMI_SUPERVISED_MODEL_DIR, ignore_errors = True) # start fresh each time

semi_supervised_labeled_x_train_arr = semi_supervised_labeled_x_train_original_arr
semi_supervised_labeled_y_train_arr = semi_supervised_labeled_y_train_original_arr
semi_supervised_unlabeled_x_train_arr = semi_supervised_unlabeled_x_train_original_arr

new_labeled_x_train_arr = np.zeros([1])

accuracy = 0.000001
old_accuracy = 0.0

loop_counter = 0
while semi_supervised_unlabeled_x_train_arr.shape[0] > 0 and new_labeled_x_train_arr.shape[0] > 0 and accuracy > old_accuracy:
  print("\nloop_counter = {}, number_of_labeled_examples = {}, number_of_unlabeled_examples = {}\n".format(loop_counter, semi_supervised_labeled_x_train_arr.shape[0], semi_supervised_unlabeled_x_train_arr.shape[0]))
  # Train on currently labeled data
  train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {"image": semi_supervised_labeled_x_train_arr}, 
    y = semi_supervised_labeled_y_train_arr, 
    batch_size = 32, 
    num_epochs = None, 
    shuffle = True)

  semi_supervised_estimator.train(
    input_fn = train_input_fn, 
    steps = 2000)


  # Check evaluation metrics on held out evaluation set now that training is over
  eval_metrics = semi_supervised_estimator.evaluate(
    input_fn = eval_input_fn, 
    steps = None)
  
  old_accuracy = accuracy
  accuracy = eval_metrics["accuracy"]

  # Now predict from the unlabeled set
  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {"image": semi_supervised_unlabeled_x_train_arr}, 
    y = None, 
    batch_size = 512, 
    num_epochs = 1, 
    shuffle = False)

  predictions = [prediction 
                 for prediction in semi_supervised_estimator.predict(
                   input_fn = predict_input_fn)]

  # Get the probabilities from the prediction list generated from the estimator
  probabilities = np.array(object = [prediction["probabilities"] 
                                     for prediction in predictions])

  # Check if our predictions are above the confidence threshold
  confidence_condition = np.amax(a = probabilities, axis = 1) > confidence_threshold

  # Create array of the confidently prediction examples combining their features with the predicted probabilities
  new_labeled_x_train_arr = semi_supervised_unlabeled_x_train_arr[confidence_condition]
  new_labeled_y_train_arr = probabilities[confidence_condition]

  semi_supervised_labeled_x_train_arr = np.concatenate(
    seq = [semi_supervised_labeled_x_train_arr, new_labeled_x_train_arr], axis = 0)
  semi_supervised_labeled_y_train_arr = np.concatenate(
    seq = [semi_supervised_labeled_y_train_arr, new_labeled_y_train_arr], axis = 0)

  # Remove the confident predictions leaving only the unconfident predictions to go another round through the loop
  semi_supervised_unlabeled_x_train_arr = semi_supervised_unlabeled_x_train_arr[~confidence_condition]
  
  loop_counter += 1


loop_counter = 0, number_of_labeled_examples = 1000, number_of_unlabeled_examples = 59000

INFO:tensorflow:Calling model_fn.

features = 
{'image': <tf.Tensor 'random_shuffle_queue_DequeueMany:1' shape=(32, 28, 28) dtype=float64>}
labels = 
Tensor("random_shuffle_queue_DequeueMany:2", shape=(32, 10), dtype=float64, device=/device:CPU:0)
mode = 
train
params = 
{'ksize1': 5, 'batch_norm': False, 'ksize2': 5, 'learning_rate': 0.01, 'train_batch_size': 100, 'train_steps': 1000, 'nfil2': 20, 'nfil1': 10, 'dprob': 0.1, 'model': 'linear'}
ylogits = 
Tensor("dense/BiasAdd:0", shape=(32, 10), dtype=float64)
probabilities = 
Tensor("Softmax:0", shape=(32, 10), dtype=float64)
class_ids = 
Tensor("Cast:0", shape=(32,), dtype=uint8)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into semi_supervised_tra