From cc8961157f87c08b2592169732c5e9ba2978d24d Mon Sep 17 00:00:00 2001
From: tkornut <tkornut@us.ibm.com>
Date: Wed, 7 Aug 2019 13:57:50 -0700
Subject: [PATCH 1/2] First version of the simple molecules problem + baseline
 model based on CNNs

---
 .../tasks/image_to_class/simple_molecules.yml |  60 +++++
 .../mnist_classification_convnet_softmax.yml  | 120 ++++++++++
 .../tasks/image_to_class/__init__.py          |   4 +-
 .../tasks/image_to_class/simple_molecules.py  | 215 ++++++++++++++++++
 4 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100644 configs/default/components/tasks/image_to_class/simple_molecules.yml
 create mode 100644 configs/molecule_classification/mnist_classification_convnet_softmax.yml
 create mode 100644 ptp/components/tasks/image_to_class/simple_molecules.py

diff --git a/configs/default/components/tasks/image_to_class/simple_molecules.yml b/configs/default/components/tasks/image_to_class/simple_molecules.yml
new file mode 100644
index 0000000..06ad3be
--- /dev/null
+++ b/configs/default/components/tasks/image_to_class/simple_molecules.yml
@@ -0,0 +1,60 @@
+# This file defines the default values for the SimpleMolecules task.
+
+####################################################################
+# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
+####################################################################
+
+# Folder where task will store data (LOADED)
+data_folder: '~/data/simple-molecules'
+
+# Defines the split that will be used (LOADED)
+# Options: training | validation | test
+split: training
+
+# Optional parameter (LOADED)
+# When present, resizes the images from the default [875, 875] to [height, width]
+#resize_image: [height, width]
+
+streams:
+  ####################################################################
+  # 2. Keymappings associated with INPUT and OUTPUT streams.
+  ####################################################################
+
+  # Stream containing batch of indices (OUTPUT)
+  # Every task MUST return that stream.
+  indices: indices
+
+  # Stream containing batch of images (OUTPUT)
+  images: images
+
+  # Stream containing targets (label ids) (OUTPUT)
+  targets: targets
+
+  # Stream containing labels (words) (OUTPUT)
+  labels: labels
+
+globals:
+  ####################################################################
+  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
+  ####################################################################
+
+  ####################################################################
+  # 4. Keymappings associated with GLOBAL variables that will be SET.
+  ####################################################################
+
+  # Width of the image (SET)
+  input_width: image_width
+  # Height of the image (SET)
+  input_height: image_height
+  # Depth of the image (SET)
+  input_depth: image_depth
+  
+  # Number of output classes: 10 (SET)
+  num_classes: num_classes
+  # Label (word-idx) mappings (SET)
+  label_word_mappings: label_word_mappings
+
+  ####################################################################
+  # 5. Keymappings associated with statistics that will be ADDED.
+  ####################################################################
+
diff --git a/configs/molecule_classification/mnist_classification_convnet_softmax.yml b/configs/molecule_classification/mnist_classification_convnet_softmax.yml
new file mode 100644
index 0000000..a041249
--- /dev/null
+++ b/configs/molecule_classification/mnist_classification_convnet_softmax.yml
@@ -0,0 +1,120 @@
+# Training parameters:
+training:
+  task: 
+    type: SimpleMolecules
+    batch_size: &b 64
+    split: training
+    # TODO: change!
+    resize_image: [87, 87]
+  # TODO: change!
+  # Use sampler that operates on a subset.
+  sampler:
+    type: SubsetRandomSampler
+    indices: [0, 50000]
+  # optimizer parameters:
+  optimizer:
+    type: Adam
+    lr: 0.0001
+  # settings parameters
+  terminal_conditions:
+    loss_stop_threshold: 0.15
+    early_stop_validations: -1
+    episode_limit: 10000
+    epoch_limit: 10
+
+# Validation parameters:
+validation:
+  task:
+    type: SimpleMolecules
+    batch_size: *b
+    # TODO: change!
+    split: training
+    # TODO: change!
+    resize_image: [87, 87]
+  # TODO: change!
+  # Use sampler that operates on a subset.
+  sampler:
+    type: SubsetRandomSampler
+    indices: [50000, 50400]
+
+# Testing parameters:
+test:
+  task:
+    type: SimpleMolecules
+    batch_size: *b
+    # TODO: change!
+    split: training
+    # TODO: change!
+    resize_image: [87, 87]
+
+pipeline:
+  # Model 1: 3 CNN layers.
+  image_encoder:
+    type: ConvNetEncoder
+    priority: 1
+    # Using default stream names, so the following could be removed (leaving it just for the clarity though).
+    streams:
+      inputs: images
+      feature_maps: feature_maps
+
+  # Reshape inputs
+  reshaper:
+    type: ReshapeTensor
+    # TODO: change!
+    #input_dims: [-1, 16, 107, 107]
+    #output_dims: [-1, 183184]
+    input_dims: [-1, 16, 9, 9]
+    output_dims: [-1, 1296]
+    priority: 2
+    streams:
+      inputs: feature_maps
+      outputs: reshaped_maps
+    globals:
+      output_size: reshaped_maps_size
+
+  # Model 2: 1 Fully connected layer with softmax activation.
+  classifier:
+    type: FeedForwardNetwork 
+    priority: 3
+    streams:
+      inputs: reshaped_maps
+      # Using default stream name, so the following could be removed (leaving it just for the clarity though).
+      predictions: predictions
+    globals:
+      input_size: reshaped_maps_size
+      prediction_size: num_classes
+
+  # Loss
+  nllloss:
+    type: NLLLoss
+    priority: 4
+    # Using default stream names, so the following could be removed (leaving it just for the clarity though).
+    streams:
+      targets: targets
+      predictions: predictions
+
+  accuracy:
+    priority: 5
+    type: AccuracyStatistics
+    # Using default stream names, so the following could be removed (leaving it just for the clarity though).
+    streams:
+      targets: targets
+      predictions: predictions
+
+  answer_decoder:
+    priority: 6
+    type: WordDecoder
+    import_word_mappings_from_globals: True
+    globals:
+      word_mappings: label_word_mappings
+    streams:
+      inputs: predictions
+      outputs: predicted_answers
+
+  stream_viewer:
+    priority: 7
+    type: StreamViewer
+    input_streams: labels, targets, predictions, predicted_answers
+
+
+#: pipeline
diff --git a/ptp/components/tasks/image_to_class/__init__.py b/ptp/components/tasks/image_to_class/__init__.py
index 72b54db..aaaf933 100644
--- a/ptp/components/tasks/image_to_class/__init__.py
+++ b/ptp/components/tasks/image_to_class/__init__.py
@@ -1,8 +1,10 @@
 #from .cifar10 import CIFAR10
 from .cifar_100 import CIFAR100
 from .mnist import MNIST
+from .simple_molecules import SimpleMolecules
 
 __all__ = [
     'CIFAR100',
-    'MNIST'
+    'MNIST',
+    'SimpleMolecules',
     ]
diff --git a/ptp/components/tasks/image_to_class/simple_molecules.py b/ptp/components/tasks/image_to_class/simple_molecules.py
new file mode 100644
index 0000000..6ef9281
--- /dev/null
+++ b/ptp/components/tasks/image_to_class/simple_molecules.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) IBM Corporation 2019
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__author__ = "Tomasz Kornuta"
+
+import os
+import csv
+from PIL import Image
+
+import torch
+from torchvision import transforms
+
+from ptp.components.tasks.task import Task
+from ptp.data_types.data_definition import DataDefinition
+
+from ptp.configuration.config_parsing import get_value_from_dictionary
+from ptp.configuration.configuration_error import ConfigurationError
+
+
+class SimpleMolecules(Task):
+    """
+    Simple molecule classification task.
+
+    """
+
+    def __init__(self, name, config):
+        """
+        Initializes the task.
+
+        .. warning::
+
+            Resizing images might cause a significant slow down in batch generation.
+
+        :param name: Task name.
+        :type name: str
+
+        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
+        :type config: :py:class:`ptp.configuration.ConfigInterface`
+        """
+
+        # Call base class constructors.
+        super(SimpleMolecules, self).__init__(name, SimpleMolecules, config)
+
+        # Get default key mappings.
+        self.key_images = self.stream_keys["images"]
+        self.key_targets = self.stream_keys["targets"]
+        # Stream returning targets as words.
+        self.key_labels = self.stream_keys["labels"]
+
+        # Add transformations depending on the resizing option.
+        if 'resize_image' in self.config:
+            # Check the desired size.
+            if len(self.config['resize_image']) != 2:
+                self.logger.error("'resize_image' field must contain 2 values: the desired height and width")
+                exit(-1)
+
+            # Output image dimensions.
+            self.height = self.config['resize_image'][0]
+            self.width = self.config['resize_image'][1]
+
+            # Up-scale and transform to tensors.
+            self.image_transforms = transforms.Compose([transforms.Resize((self.height, self.width)), transforms.ToTensor()])
+
+            self.logger.warning('Upscaling the images to [{}, {}]. Slows down batch generation.'.format(
+                self.width, self.height))
+
+        else:
+            # Default  settings.
+            self.width = 875
+            self.height = 875
+            # Simply turn to tensor.
+            self.image_transforms = transforms.Compose([transforms.ToTensor()])
+
+        # Set global variables - all dimensions ASIDE OF BATCH.
+        self.globals["num_classes"] = 10
+        self.globals["image_width"] = self.width
+        self.globals["image_height"] = self.height
+        self.globals["image_depth"] = 1
+
+        # Class names.
+        labels = 'Zero One Two Three Four Five Six Seven Eight Nine'.split(' ')
+        # Export to globals.
+        word_to_ix = {labels[i]: i for i in range(10)}
+        self.globals["label_word_mappings"] = word_to_ix
+        # Reverse mapping - for labels.
+        self.ix_to_word = {value: key for (key, value) in word_to_ix.items()}
+
+        # Get the absolute path.
+        self.data_folder = os.path.expanduser(self.config['data_folder'])
+
+        # Get the split.
+        split = get_value_from_dictionary('split', self.config, "training | validation | test".split(" | "))
+
+        # Set split-dependent data.
+        if split == 'training':
+            # Training split folder and file with data question.
+            data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels.tsv')
+            self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist")
+        else: 
+            raise ConfigurationError("Split {} not supported yet".format(split))
+
+        # Load dataset.
+        self.dataset = self.load_dataset(data_file)
+        
+        # Display exemplary sample.
+        i = 0
+        sample = self.dataset[i]
+
+        self.logger.info("Exemplary sample {}:\n  image_ids: {}\n  class {}".format(
+            i,
+            sample[1],
+            sample[0]
+            ))
+
+
+    def load_dataset(self, source_data_file):
+        """
+        Loads the dataset from source file
+
+        :param source_data_file: tab-separated file containing label-image filename pairs.
+
+        """
+        self.logger.info("Loading dataset from:\n {}".format(source_data_file))
+        dataset = []
+
+        with open(source_data_file, 'r') as f:
+            self.logger.info("Loading samples from '{}'...".format(source_data_file))
+            dataset = list(csv.reader(f, delimiter='\t'))
+
+        self.logger.info("Loaded split consisting of {} samples".format(len(dataset)))
+        return dataset
+
+
+    def __len__(self):
+        """
+        Returns the "size" of the "task" (total number of samples).
+
+        :return: The size of the task.
+        """
+        return len(self.dataset)
+
+
+    def output_data_definitions(self):
+        """ 
+        Function returns a dictionary with definitions of output data produced by the component.
+
+        :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
+        """
+        return {
+            self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"),
+            self.key_images: DataDefinition([-1, 1, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]"),
+            self.key_targets: DataDefinition([-1], [torch.Tensor], "Batch of targets, each being a single index [BATCH_SIZE]"),
+            self.key_labels: DataDefinition([-1, 1], [list, str], "Batch of targets, each being a single word [BATCH_SIZE] x [STRING]")
+            }
+
+    def get_image(self, img_id):
+        """
+        Function loads and returns the image.
+        Additionally, it performs all the required transformations.
+
+        :param img_id: Identifier of the images.
+            The image is read from ``self.image_folder`` as ``<img_id>.png``.
+
+        :return: image (Tensor)
+        """
+
+        # Load the image.
+        img = Image.open(os.path.join(self.image_folder, img_id + '.png')) #.convert('RGB')
+
+        # Apply transformations.
+        img = self.image_transforms(img)
+
+        # Return image.
+        return img
+
+    def __getitem__(self, index):
+        """
+        Getter method to access the dataset and return a sample.
+
+        :param index: index of the sample to return.
+        :type index: int
+
+        :return: ``DataStreams({'images','targets'})``, with:
+
+            - images: Image, resized if ``resize_image`` is present in the configuration,
+            - targets: Index of the target class
+        """
+        # Get image and target.
+        (label, img_id) = self.dataset[index]
+  
+        # Load the image.
+        img = self.get_image(img_id)
+
+        target = int(label)
+
+        # Return data_streams.
+        data_streams = self.create_data_streams(index)
+        data_streams[self.key_images] = img
+        data_streams[self.key_targets] = target
+        data_streams[self.key_labels] = label
+        return data_streams

From 661cea89e384bda6236941953bdcde081d2ce5d1 Mon Sep 17 00:00:00 2001
From: Katherine Ottenbreit <ottenbreit@us.ibm.com>
Date: Tue, 13 Aug 2019 13:45:09 -0700
Subject: [PATCH 2/2] reset author

---
 .../mnist_classification_convnet_softmax.yml  | 26 +++++++++----------
 .../tasks/image_to_class/simple_molecules.py  | 13 +++++++++-
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/configs/molecule_classification/mnist_classification_convnet_softmax.yml b/configs/molecule_classification/mnist_classification_convnet_softmax.yml
index a041249..d5f5ae1 100644
--- a/configs/molecule_classification/mnist_classification_convnet_softmax.yml
+++ b/configs/molecule_classification/mnist_classification_convnet_softmax.yml
@@ -5,12 +5,12 @@ training:
     batch_size: &b 64
     split: training
     # TODO: change!
-    resize_image: [87, 87]
+    resize_image: [128, 128]
   # TODO: change!
   # Use sampler that operates on a subset.
-  sampler:
-    type: SubsetRandomSampler
-    indices: [0, 50000]
+  #sampler:
+  #  type: SubsetRandomSampler
+  #  indices: [0, 50000]
   # optimizer parameters:
   optimizer:
     type: Adam
@@ -28,14 +28,14 @@ validation:
     type: SimpleMolecules
     batch_size: *b
     # TODO: change!
-    split: training
+    split: validation
     # TODO: change!
-    resize_image: [87, 87]
+    resize_image: [128, 128]
   # TODO: change!
   # Use sampler that operates on a subset.
-  sampler:
-    type: SubsetRandomSampler
-    indices: [50000, 50400]
+  #sampler:
+  #  type: SubsetRandomSampler
+  #  indices: [50000, 50400]
 
 # Testing parameters:
 test:
@@ -43,9 +43,9 @@ test:
     type: SimpleMolecules
     batch_size: *b
     # TODO: change!
-    split: training
+    split: test
     # TODO: change!
-    resize_image: [87, 87]
+    resize_image: [128, 128]
 
 pipeline:
   # Model 1: 3 CNN layers.
@@ -63,8 +63,8 @@ pipeline:
     # TODO: change!
     #input_dims: [-1, 16, 107, 107]
     #output_dims: [-1, 183184]
-    input_dims: [-1, 16, 9, 9]
-    output_dims: [-1, 1296]
+    input_dims: [-1, 16, 14, 14]
+    output_dims: [-1, 3136]
     priority: 2
     streams:
       inputs: feature_maps
diff --git a/ptp/components/tasks/image_to_class/simple_molecules.py b/ptp/components/tasks/image_to_class/simple_molecules.py
index 6ef9281..3a53a1c 100644
--- a/ptp/components/tasks/image_to_class/simple_molecules.py
+++ b/ptp/components/tasks/image_to_class/simple_molecules.py
@@ -108,8 +108,19 @@ def __init__(self, name, config):
         # Set split-dependent data.
         if split == 'training':
             # Training split folder and file with data question.
-            data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels.tsv')
+            data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels_Set0.tsv')
             self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist")
+
+        elif split == 'validation':
+            # Validation split: labels file and image folder.
+            data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels_Set1.tsv')
+            self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist")
+
+        elif split == 'test':
+            # Test split: labels file and image folder.
+            data_file = os.path.join(self.data_folder, 'ChemDATA_A_Test2_Labels.tsv')
+            self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Test2")
+
         else: 
             raise ConfigurationError("Split {} not supported yet".format(split))