From cc8961157f87c08b2592169732c5e9ba2978d24d Mon Sep 17 00:00:00 2001 From: tkornut Date: Wed, 7 Aug 2019 13:57:50 -0700 Subject: [PATCH 1/2] First version of the simple molecules problem + baseline model based on CNNs --- .../tasks/image_to_class/simple_molecules.yml | 60 +++++ .../mnist_classification_convnet_softmax.yml | 120 ++++++++++ .../tasks/image_to_class/__init__.py | 4 +- .../tasks/image_to_class/simple_molecules.py | 215 ++++++++++++++++++ 4 files changed, 398 insertions(+), 1 deletion(-) create mode 100644 configs/default/components/tasks/image_to_class/simple_molecules.yml create mode 100644 configs/molecule_classification/mnist_classification_convnet_softmax.yml create mode 100644 ptp/components/tasks/image_to_class/simple_molecules.py diff --git a/configs/default/components/tasks/image_to_class/simple_molecules.yml b/configs/default/components/tasks/image_to_class/simple_molecules.yml new file mode 100644 index 0000000..06ad3be --- /dev/null +++ b/configs/default/components/tasks/image_to_class/simple_molecules.yml @@ -0,0 +1,60 @@ +# This file defines the default values for the SimpleMolecules task. + +#################################################################### +# 1. CONFIGURATION PARAMETERS that will be LOADED by the component. +#################################################################### + +# Folder where task will store data (LOADED) +data_folder: '~/data/simple-molecules' + +# Defines the split that will be used (LOADED) +# Options: training | validation | test +split: training + +# Optional parameter (LOADED) +# When present, resizes the molecule images to [height, width] +#resize_image: [height, width] + +streams: + #################################################################### + # 2. Keymappings associated with INPUT and OUTPUT streams. + #################################################################### + + # Stream containing batch of indices (OUTPUT) + # Every task MUST return that stream.
+ indices: indices + + # Stream containing batch of images (OUTPUT) + images: images + + # Stream containing targets (label ids) (OUTPUT) + targets: targets + + # Stream containing labels (words) (OUTPUT) + labels: labels + +globals: + #################################################################### + # 3. Keymappings of variables that will be RETRIEVED from GLOBALS. + #################################################################### + + #################################################################### + # 4. Keymappings associated with GLOBAL variables that will be SET. + #################################################################### + + # Width of the image (SET) + input_width: image_width + # Height of the image (SET) + input_height: image_height + # Depth of the image (SET) + input_depth: image_depth + + # Number of output classes: 10 (SET) + num_classes: num_classes + # Label (word-idx) mappings (SET) + label_word_mappings: label_word_mappings + + #################################################################### + # 5. Keymappings associated with statistics that will be ADDED. + #################################################################### + diff --git a/configs/molecule_classification/mnist_classification_convnet_softmax.yml b/configs/molecule_classification/mnist_classification_convnet_softmax.yml new file mode 100644 index 0000000..a041249 --- /dev/null +++ b/configs/molecule_classification/mnist_classification_convnet_softmax.yml @@ -0,0 +1,120 @@ +# Training parameters: +training: + task: + type: SimpleMolecules + batch_size: &b 64 + split: training + # TODO: change! + resize_image: [87, 87] + # TODO: change! + # Use sampler that operates on a subset. 
+ sampler: + type: SubsetRandomSampler + indices: [0, 50000] + # optimizer parameters: + optimizer: + type: Adam + lr: 0.0001 + # settings parameters + terminal_conditions: + loss_stop_threshold: 0.15 + early_stop_validations: -1 + episode_limit: 10000 + epoch_limit: 10 + +# Validation parameters: +validation: + task: + type: SimpleMolecules + batch_size: *b + # TODO: change! + split: training + # TODO: change! + resize_image: [87, 87] + # TODO: change! + # Use sampler that operates on a subset. + sampler: + type: SubsetRandomSampler + indices: [50000, 50400] + +# Testing parameters: +test: + task: + type: SimpleMolecules + batch_size: *b + # TODO: change! + split: training + # TODO: change! + resize_image: [87, 87] + +pipeline: + # Model 1: 3 CNN layers. + image_encoder: + type: ConvNetEncoder + priority: 1 + # Using default stream names, so the following could be removed (leaving it just for the clarity though). + streams: + inputs: images + feature_maps: feature_maps + + # Reshape inputs + reshaper: + type: ReshapeTensor + # TODO: change! + #input_dims: [-1, 16, 107, 107] + #output_dims: [-1, 183184] + input_dims: [-1, 16, 9, 9] + output_dims: [-1, 1296] + priority: 2 + streams: + inputs: feature_maps + outputs: reshaped_maps + globals: + output_size: reshaped_maps_size + + # Model 2: 1 Fully connected layer with softmax activation. + classifier: + type: FeedForwardNetwork + priority: 3 + streams: + inputs: reshaped_maps + # Using default stream name, so the following could be removed (leaving it just for the clarity though). + predictions: predictions + globals: + input_size: reshaped_maps_size + prediction_size: num_classes + + # Loss + nllloss: + type: NLLLoss + priority: 4 + # Using default stream names, so the following could be removed (leaving it just for the clarity though).
+ streams: + targets: targets + predictions: predictions + + accuracy: + priority: 5 + type: AccuracyStatistics + # Using default stream names, so the following could be removed (leaving it just for the clarity though). + streams: + targets: targets + predictions: predictions + + answer_decoder: + priority: 6 + type: WordDecoder + import_word_mappings_from_globals: True + globals: + word_mappings: label_word_mappings + streams: + inputs: predictions + outputs: predicted_answers + + stream_viewer: + priority: 7 + type: StreamViewer + input_streams: labels, targets, predictions, predicted_answers + + +#: pipeline diff --git a/ptp/components/tasks/image_to_class/__init__.py b/ptp/components/tasks/image_to_class/__init__.py index 72b54db..aaaf933 100644 --- a/ptp/components/tasks/image_to_class/__init__.py +++ b/ptp/components/tasks/image_to_class/__init__.py @@ -1,8 +1,10 @@ #from .cifar10 import CIFAR10 from .cifar_100 import CIFAR100 from .mnist import MNIST +from .simple_molecules import SimpleMolecules __all__ = [ 'CIFAR100', - 'MNIST' + 'MNIST', + 'SimpleMolecules', ] diff --git a/ptp/components/tasks/image_to_class/simple_molecules.py b/ptp/components/tasks/image_to_class/simple_molecules.py new file mode 100644 index 0000000..6ef9281 --- /dev/null +++ b/ptp/components/tasks/image_to_class/simple_molecules.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright (C) IBM Corporation 2019 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__author__ = "Tomasz Kornuta" + +import os +import csv +from PIL import Image + +import torch +from torchvision import transforms + +from ptp.components.tasks.task import Task +from ptp.data_types.data_definition import DataDefinition + +from ptp.configuration.config_parsing import get_value_from_dictionary +from ptp.configuration.configuration_error import ConfigurationError + + +class SimpleMolecules(Task): + """ + Simple molecule classification task. + + """ + + def __init__(self, name, config): + """ + Initializes the task. + + .. warning:: + + Resizing images might cause a significant slow down in batch generation. + + :param name: Task name. + :type name: str + + :param config: Dictionary of parameters (read from the configuration ``.yaml`` file). + :type config: :py:class:`ptp.configuration.ConfigInterface` + """ + + # Call base class constructors. + super(SimpleMolecules, self).__init__(name, SimpleMolecules, config) + + # Get default key mappings. + self.key_images = self.stream_keys["images"] + self.key_targets = self.stream_keys["targets"] + # Stream returning targets as words. + self.key_labels = self.stream_keys["labels"] + + # Add transformations depending on the resizing option. + if 'resize_image' in self.config: + # Check the desired size. + if len(self.config['resize_image']) != 2: + self.logger.error("'resize_image' field must contain 2 values: the desired height and width") + exit(-1) + + # Output image dimensions. + self.height = self.config['resize_image'][0] + self.width = self.config['resize_image'][1] + + # Up-scale and transform to tensors. + self.image_transforms = transforms.Compose([transforms.Resize((self.height, self.width)), transforms.ToTensor()]) + + self.logger.warning('Upscaling the images to [{}, {}]. Slows down batch generation.'.format( + self.width, self.height)) + + else: + # Default settings. + self.width = 875 + self.height = 875 + # Simply turn to tensor. 
+ self.image_transforms = transforms.Compose([transforms.ToTensor()]) + + # Set global variables - all dimensions ASIDE OF BATCH. + self.globals["num_classes"] = 10 + self.globals["image_width"] = self.width + self.globals["image_height"] = self.height + self.globals["image_depth"] = 1 + + # Class names. + labels = 'Zero One Two Three Four Five Six Seven Eight Nine'.split(' ') + # Export to globals. + word_to_ix = {labels[i]: i for i in range(10)} + self.globals["label_word_mappings"] = word_to_ix + # Reverse mapping - for labels. + self.ix_to_word = {value: key for (key, value) in word_to_ix.items()} + + # Get the absolute path. + self.data_folder = os.path.expanduser(self.config['data_folder']) + + # Get the split. + split = get_value_from_dictionary('split', self.config, "training | validation | test".split(" | ")) + + # Set split-dependent data. + if split == 'training': + # Training split folder and file with data question. + data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels.tsv') + self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist") + else: + raise ConfigurationError("Split {} not supported yet".format(split)) + + # Load dataset. + self.dataset = self.load_dataset(data_file) + + # Display exemplary sample. + i = 0 + sample = self.dataset[i] + + self.logger.info("Exemplary sample {}:\n image_ids: {}\n class {}".format( + i, + sample[1], + sample[0] + )) + + + def load_dataset(self, source_data_file): + """ + Loads the dataset from source file + + :param source_data_file: csv file containing label-image filename pairs. 
+ + """ + self.logger.info("Loading dataset from:\n {}".format(source_data_file)) + dataset = [] + + with open(source_data_file, 'r') as f: + self.logger.info("Loading samples from '{}'...".format(source_data_file)) + dataset = list(csv.reader(f, delimiter='\t')) + + self.logger.info("Loaded split consisting of {} samples".format(len(dataset))) + return dataset + + + def __len__(self): + """ + Returns the "size" of the "task" (total number of samples). + + :return: The size of the task. + """ + return len(self.dataset) + + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced by the component. + + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + return { + self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"), + self.key_images: DataDefinition([-1, 1, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]"), + self.key_targets: DataDefinition([-1], [torch.Tensor], "Batch of targets, each being a single index [BATCH_SIZE]"), + self.key_labels: DataDefinition([-1, 1], [list, str], "Batch of targets, each being a single word [BATCH_SIZE] x [STRING]") + } + + def get_image(self, img_id): + """ + Function loads and returns the image. + Additionally, it performs all the required transformations. + + :param img_id: Identifier of the image (file name without the '.png' extension). + The image is read from the folder stored in ``self.image_folder``. + + :return: image (Tensor) + """ + + # Load the image. + img = Image.open(os.path.join(self.image_folder, img_id + '.png')) #.convert('RGB') + + # Apply transformations. + img = self.image_transforms(img) + + # Return image. + return img + + def __getitem__(self, index): + """ + Getter method to access the dataset and return a sample. + + :param index: index of the sample to return.
+ :type index: int + + :return: ``DataStreams({'images','targets'})``, with: + + - images: Image, resized if ``self.resize`` is set, + - targets: Index of the target class + """ + # Get image and target. + (label, img_id) = self.dataset[index] + + # Load the image. + img = self.get_image(img_id) + + target = int(label) + + # Return data_streams. + data_streams = self.create_data_streams(index) + data_streams[self.key_images] = img + data_streams[self.key_targets] = target + data_streams[self.key_labels] = label + return data_streams From 661cea89e384bda6236941953bdcde081d2ce5d1 Mon Sep 17 00:00:00 2001 From: Katherine Ottenbreit Date: Tue, 13 Aug 2019 13:45:09 -0700 Subject: [PATCH 2/2] reset author --- .../mnist_classification_convnet_softmax.yml | 26 +++++++++---------- .../tasks/image_to_class/simple_molecules.py | 13 +++++++++- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/configs/molecule_classification/mnist_classification_convnet_softmax.yml b/configs/molecule_classification/mnist_classification_convnet_softmax.yml index a041249..d5f5ae1 100644 --- a/configs/molecule_classification/mnist_classification_convnet_softmax.yml +++ b/configs/molecule_classification/mnist_classification_convnet_softmax.yml @@ -5,12 +5,12 @@ training: batch_size: &b 64 split: training # TODO: change! - resize_image: [87, 87] + resize_image: [128, 128] # TODO: change! # Use sampler that operates on a subset. - sampler: - type: SubsetRandomSampler - indices: [0, 50000] + #sampler: + # type: SubsetRandomSampler + # indices: [0, 50000] # optimizer parameters: optimizer: type: Adam @@ -28,14 +28,14 @@ validation: type: SimpleMolecules batch_size: *b # TODO: change! - split: training + split: validation # TODO: change! - resize_image: [87, 87] + resize_image: [128, 128] # TODO: change! # Use sampler that operates on a subset. 
- sampler: - type: SubsetRandomSampler - indices: [50000, 50400] + #sampler: + # type: SubsetRandomSampler + # indices: [50000, 50400] # Testing parameters: test: @@ -43,9 +43,9 @@ test: type: SimpleMolecules batch_size: *b # TODO: change! - split: training + split: test # TODO: change! - resize_image: [87, 87] + resize_image: [128, 128] pipeline: # Model 1: 3 CNN layers. @@ -63,8 +63,8 @@ pipeline: # TODO: change! #input_dims: [-1, 16, 107, 107] #output_dims: [-1, 183184] - input_dims: [-1, 16, 9, 9] - output_dims: [-1, 1296] + input_dims: [-1, 16, 14, 14] + output_dims: [-1, 3136] priority: 2 streams: inputs: feature_maps diff --git a/ptp/components/tasks/image_to_class/simple_molecules.py b/ptp/components/tasks/image_to_class/simple_molecules.py index 6ef9281..3a53a1c 100644 --- a/ptp/components/tasks/image_to_class/simple_molecules.py +++ b/ptp/components/tasks/image_to_class/simple_molecules.py @@ -108,8 +108,19 @@ def __init__(self, name, config): # Set split-dependent data. if split == 'training': # Training split folder and file with data question. - data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels.tsv') + data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels_Set0.tsv') self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist") + + elif split == 'validation': + # Validation split folder and file with labels. + data_file = os.path.join(self.data_folder, 'ChemDATA_A_Dist_Labels_Set1.tsv') + self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Dist") + + elif split == 'test': + # Test split folder and file with labels. + data_file = os.path.join(self.data_folder, 'ChemDATA_A_Test2_Labels.tsv') + self.image_folder = os.path.join(self.data_folder, "ChemDATA_A_Test2") + else: raise ConfigurationError("Split {} not supported yet".format(split))