# Semi-supervised Self-learning Classification with Iris Dataset

In [1]:
import os
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

1.12.0


## Get data

In [2]:
# Locations of the Iris training and test CSV files
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"

# Fetch each file through Keras' get_file helper, naming the local file after
# the last path component of its URL, and keep the returned local paths
train_path = tf.keras.utils.get_file(
  fname = TRAIN_URL.rsplit("/", 1)[-1],
  origin = TRAIN_URL)
test_path = tf.keras.utils.get_file(
  fname = TEST_URL.rsplit("/", 1)[-1],
  origin = TEST_URL)

In [3]:
# Publish the local CSV paths as environment variables so the shell cells can read them
os.environ.update(TRAIN_PATH = train_path, TEST_PATH = test_path)

In [4]:
# Show the first lines of the training CSV (path taken from the TRAIN_PATH environment variable)
!head ${TRAIN_PATH}

120,4,setosa,versicolor,virginica
6.4,2.8,5.6,2.2,2
5.0,2.3,3.3,1.0,1
4.9,2.5,4.5,1.7,2
4.9,3.1,1.5,0.1,0
5.7,3.8,1.7,0.3,0
4.4,3.2,1.3,0.2,0
5.4,3.4,1.5,0.4,0
6.9,3.1,5.1,2.3,2
6.7,3.1,4.4,1.4,1


In [5]:
# Show the first lines of the test CSV (path taken from the TEST_PATH environment variable)
!head ${TEST_PATH}

30,4,setosa,versicolor,virginica
5.9,3.0,4.2,1.5,1
6.9,3.1,5.4,2.1,2
5.1,3.3,1.7,0.5,0
6.0,3.4,4.5,1.6,1
5.5,2.5,4.0,1.3,1
6.2,2.9,4.3,1.3,1
5.5,4.2,1.4,0.2,0
6.3,2.8,5.1,1.5,2
5.6,3.0,4.1,1.3,1


## Create fully supervised model for comparison

In [6]:
# Column order of the Iris CSV rows: four measurements followed by the class id
CSV_COLUMN_NAMES = [
  "SepalLength",
  "SepalWidth",
  "PetalLength",
  "PetalWidth",
  "Species",
]
# The label is the last CSV column
LABEL_COLUMN_NAME = CSV_COLUMN_NAMES[-1]
SPECIES_NAMES = ["Setosa", "Versicolor", "Virginica"]
# Per-column defaults used when decoding CSV lines: a float for every feature, an int for the label
CSV_COLUMN_DEFAULTS = [[0.0] for _ in CSV_COLUMN_NAMES[:-1]] + [[0]]

In [7]:
# Input pipeline factory: reads CSV files with the Dataset API
# and hands the resulting dataset to the Estimator API
def read_dataset(filename, mode, batch_size = 512):
  """Build an Estimator input function over CSV files.

  Args:
    filename: File pattern handed to `tf.gfile.Glob` to find the CSV file(s).
    mode: A `tf.estimator.ModeKeys` value. TRAIN shuffles and repeats
      indefinitely; any other mode makes a single pass over the data.
    batch_size: Number of examples per batch.

  Returns:
    A zero-argument function that returns a dataset of
    (features dict, label) batches when called.
  """
  def _input_fn():
    def parse_line(csv_line):
      # Decode one text line into per-column tensors keyed by column name,
      # then split the label column off from the features
      parsed = tf.decode_csv(records = csv_line, record_defaults = CSV_COLUMN_DEFAULTS)
      features = {name: tensor for name, tensor in zip(CSV_COLUMN_NAMES, parsed)}
      label = features.pop(LABEL_COLUMN_NAME)
      return features, label

    # Resolve the pattern into concrete file names
    matching_files = tf.gfile.Glob(filename = filename)

    # Read the files as text lines, dropping the first line (the header row in the Iris CSVs)
    lines = tf.data.TextLineDataset(filenames = matching_files).skip(count = 1)

    # Parse every remaining line into a (features, label) pair
    examples = lines.map(map_func = parse_line)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if is_training:
      examples = examples.shuffle(buffer_size = 10 * batch_size)

    # None repeats indefinitely (training); 1 signals end-of-input after one pass
    repeat_count = None if is_training else 1

    return examples.repeat(count = repeat_count).batch(batch_size = batch_size)
  return _input_fn

In [8]:
# Define feature columns
def create_feature_columns():
  """Return one numeric feature column per CSV column except the last (the label)."""
  feature_columns = []
  for feature_name in CSV_COLUMN_NAMES[:-1]:
    # Every input feature is numeric
    feature_columns.append(tf.feature_column.numeric_column(key = feature_name))
  return feature_columns

In [9]:
# Directory passed as model_dir to the fully supervised estimator (its checkpoints are written here)
SUPERVISED_MODEL_DIR = "supervised_trained"

In [10]:
# Fully supervised baseline: a DNN classifier that picks one of 3 classes,
# using two hidden layers of 10 nodes each
supervised_estimator = tf.estimator.DNNClassifier(
  hidden_units = [10, 10],
  n_classes = 3,
  feature_columns = create_feature_columns(),
  model_dir = SUPERVISED_MODEL_DIR)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'supervised_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x105c11438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
# Remove checkpoints left by earlier runs so training starts fresh each time
shutil.rmtree(SUPERVISED_MODEL_DIR, ignore_errors = True)

# Train the baseline on the fully labeled training file for 2000 steps
supervised_estimator.train(
  steps = 2000,
  input_fn = read_dataset(
    batch_size = 32,
    mode = tf.estimator.ModeKeys.TRAIN,
    filename = train_path))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into supervised_trained/model.ckpt.
INFO:tensorflow:loss = 50.350456, step = 1
INFO:tensorflow:global_step/sec: 202.61
INFO:tensorflow:loss = 4.913475, step = 101 (0.494 sec)
INFO:tensorflow:global_step/sec: 297
INFO:tensorflow:loss = 4.3145337, step = 201 (0.337 sec)
INFO:tensorflow:global_step/sec: 273.068
INFO:tensorflow:loss = 4.5109253, step = 301 (0.366 sec)
INFO:tensorflow:global_step/sec: 303.565
INFO:tensorflow:loss = 1.286738, step = 401 (0.329 sec)
INFO:tensorflow:global_step/sec: 298.148
INFO:tensorflow:loss = 1.6059611, step = 501 (0.335 sec)
INFO:tensorflow:global_step/sec: 300.551
INFO:tensorflow:loss = 1.0319146, step = 601 (0.333 sec)
INFO:tensorflow:global_step/sec: 305.393
INFO:tensorflow:loss 

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1048de358>

In [12]:
# Score the supervised baseline on the test file; steps = None sets no step limit
# and the EVAL-mode input function yields a single pass over the data
eval_metrics = supervised_estimator.evaluate(
  steps = None,
  input_fn = read_dataset(
    batch_size = 512,
    mode = tf.estimator.ModeKeys.EVAL,
    filename = test_path))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-15-09:17:42
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from supervised_trained/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-15-09:17:43
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.96666664, average_loss = 0.05467449, global_step = 2000, loss = 1.6402347
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: supervised_trained/model.ckpt-2000


## Now create semi-supervised model

In [13]:
# Load the training CSV into pandas; header = 0 together with names swaps the
# file's own first line for our column names
supervised_train_df = pd.read_csv(
  train_path,
  names = CSV_COLUMN_NAMES,
  header = 0,
  sep = ",")
supervised_train_df.head(n = 5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,6.4,2.8,5.6,2.2,2
1,5.0,2.3,3.3,1.0,1
2,4.9,2.5,4.5,1.7,2
3,4.9,3.1,1.5,0.1,0
4,5.7,3.8,1.7,0.3,0


In [14]:
# Row count of the training frame
number_of_train_examples = supervised_train_df.shape[0]
print(
  "number_of_train_examples = {}".format(
    number_of_train_examples))

number_of_train_examples = 120


In [15]:
# Keep labels for 4% of the training rows (truncated to a whole count);
# every remaining row is treated as unlabeled
number_of_labeled_train_examples = int(0.04 * number_of_train_examples)
number_of_unlabeled_train_examples = (
  number_of_train_examples - number_of_labeled_train_examples)
print(
  "number_of_labeled_train_examples = {} & number_of_unlabeled_train_examples = {}".format(
    number_of_labeled_train_examples,
    number_of_unlabeled_train_examples))

number_of_labeled_train_examples = 4 & number_of_unlabeled_train_examples = 116


In [16]:
# The leading rows keep their labels; the index is renumbered from 0
semi_supervised_labeled_train_original_df = (
  supervised_train_df
  .iloc[:number_of_labeled_train_examples]
  .reset_index(drop = True))

# All remaining rows form the unlabeled pool: feature columns only, index renumbered from 0
semi_supervised_unlabeled_train_original_df = (
  supervised_train_df
  .iloc[number_of_labeled_train_examples:][CSV_COLUMN_NAMES[:-1]]
  .reset_index(drop = True))

## Create semi-supervised model using sparse labels

In [17]:
# Directory passed as model_dir to the semi-supervised estimator (its checkpoints are written here)
SEMI_SUPERVISED_MODEL_DIR = "semi_supervised_trained"

In [18]:
# Semi-supervised model: a 3-class DNN classifier with two hidden layers of 10 neurons each
semi_supervised_estimator = tf.estimator.DNNClassifier(
  hidden_units = [10, 10],
  n_classes = 3,
  feature_columns = create_feature_columns(),
  model_dir = SEMI_SUPERVISED_MODEL_DIR)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'semi_supervised_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb3438a828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
# Display the rows that keep their true labels
semi_supervised_labeled_train_original_df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,6.4,2.8,5.6,2.2,2
1,5.0,2.3,3.3,1.0,1
2,4.9,2.5,4.5,1.7,2
3,4.9,3.1,1.5,0.1,0


In [20]:
# Minimum top-class probability a prediction on an unlabeled row must exceed
# before its predicted class id is accepted as that row's label
confidence_threshold = 0.95

In [21]:
shutil.rmtree(path = SEMI_SUPERVISED_MODEL_DIR, ignore_errors = True) # start fresh each time

# Work on deep copies so the original labeled/unlabeled splits survive re-running this cell
semi_supervised_labeled_train_df = semi_supervised_labeled_train_original_df.copy(deep = True)
semi_supervised_unlabeled_train_df = semi_supervised_unlabeled_train_original_df.copy(deep = True)

# Sentinel start values, chosen only so that the loop condition holds on the first pass
unlabed_confident_indices = np.zeros([1])

accuracy = 0.000001
old_accuracy = 0.0

# Self-training loop: train on the labeled rows, predict the unlabeled rows, move the
# confidently predicted rows (with their predicted class id) into the labeled set, repeat.
# It stops when the unlabeled pool is empty, when the previous round produced no confident
# prediction, or when evaluation accuracy did not improve over the previous round.
# NOTE(review): the accuracy used for stopping is measured on the test file, so the test
# set influences training; a separate validation split would avoid that.
loop_counter = 0
while len(semi_supervised_unlabeled_train_df) > 0 and unlabed_confident_indices.shape[0] > 0 and accuracy > old_accuracy:
  print("\nloop_counter = {}, number_of_labeled_examples = {}, number_of_unlabeled_examples = {}\n".format(loop_counter, len(semi_supervised_labeled_train_df), len(semi_supervised_unlabeled_train_df)))
  # Train on currently labeled data.
  # Only the feature columns go into x; the label is supplied separately through y,
  # so the target is never part of the feature dictionary.
  train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x = semi_supervised_labeled_train_df[CSV_COLUMN_NAMES[0:-1]], 
    y = semi_supervised_labeled_train_df[LABEL_COLUMN_NAME], 
    batch_size = 32, 
    num_epochs = None, 
    shuffle = True)

  semi_supervised_estimator.train(
    input_fn = train_input_fn, 
    steps = 2000)


  # Check evaluation metrics on held out evaluation set now that training is over
  eval_metrics = semi_supervised_estimator.evaluate(
    input_fn = read_dataset(
      filename = test_path, 
      mode = tf.estimator.ModeKeys.EVAL, 
      batch_size = 512), 
    steps = None)
  
  # Remember the previous round's accuracy so the loop condition can compare the two
  old_accuracy = accuracy
  accuracy = eval_metrics["accuracy"]

  # Now predict from the unlabeled set (single pass, original row order)
  predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x = semi_supervised_unlabeled_train_df, 
    y = None, 
    batch_size = 512, 
    num_epochs = 1, 
    shuffle = False)

  predictions = list(semi_supervised_estimator.predict(input_fn = predict_input_fn))

  # Get the probabilities and class ids from the prediction list generated from the estimator
  probabilities = np.array(object = [prediction["probabilities"] 
                                     for prediction in predictions])
  class_ids = np.array(object = [prediction["class_ids"] 
                                 for prediction in predictions])

  # Check if our predictions are above the confidence threshold
  confidence_condition = np.amax(a = probabilities, axis = 1) > confidence_threshold

  # Get the indices of both the confident and unconfident unlabeled predictions so that we can slice our unlabeled dataframe
  unlabed_confident_indices = np.where(confidence_condition)[0]
  unlabed_unconfident_indices = np.where(~confidence_condition)[0]

  # Get the class ids of the confident unlabeled predictions
  unlabed_confident_class_ids = np.squeeze(a = class_ids[confidence_condition], axis = 1)

  # Create dataframe of the confidently predicted examples combining their features with the predicted class id.
  # The explicit copy makes the new frame independent of the unlabeled frame, so adding the
  # label column does not trigger pandas' SettingWithCopyWarning.
  new_labeled_df = semi_supervised_unlabeled_train_df.loc[unlabed_confident_indices].copy()
  new_labeled_df[LABEL_COLUMN_NAME] = unlabed_confident_class_ids

  semi_supervised_labeled_train_df = pd.concat(
    objs = [semi_supervised_labeled_train_df, new_labeled_df], 
    axis = 0).reset_index(drop = True)

  # Remove the confident predictions leaving only the unconfident predictions to go another round through the loop
  semi_supervised_unlabeled_train_df = semi_supervised_unlabeled_train_df.loc[unlabed_unconfident_indices].reset_index(drop = True)
  
  loop_counter += 1


loop_counter = 0, number_of_labeled_examples = 4, number_of_unlabeled_examples = 116

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into semi_supervised_trained/model.ckpt.
INFO:tensorflow:loss = 34.139137, step = 1
INFO:tensorflow:global_step/sec: 268.311
INFO:tensorflow:loss = 3.249683, step = 101 (0.376 sec)
INFO:tensorflow:global_step/sec: 491.652
INFO:tensorflow:loss = 0.5667273, step = 201 (0.202 sec)
INFO:tensorflow:global_step/sec: 487.047
INFO:tensorflow:loss = 0.32575104, step = 301 (0.207 sec)
INFO:ten