In [2]:
import logging
import os
import shutil
import sys
import warnings
from pathlib import Path

import yaml

from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
path = EnvironmentSettings.root_path / "quickstart/"


In [6]:
path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart')

In [7]:
specs = {
    "definitions": {
        "datasets": {
            "d1": {
                "format": "AIRR",
                "params": {
                    "path": str(path / "result1/simulation_instruction/exported_dataset/airr/"),
                    "metadata_file": str(path / "result1/simulation_instruction/exported_dataset/airr/metadata.csv")
                }
            }
        },
        "encodings": {
            "e1": {
                "KmerFrequency": {
                    "k": 3
                }
            },
            "e2": {
                "KmerFrequency": {
                    "k": 2
                }
            }
        },
        "ml_methods": {
            "simpleLR": {
                "LogisticRegression": {
                    "C": 0.1,
                    "penalty": "l1",
                    "max_iter": 200
                }}
        },
        "reports": {
            "rep1": {
                "SequenceLengthDistribution": {
                    "batch_size": 3
                }
            },
            "hprep": "MLSettingsPerformance",
            "coef": "Coefficients"
        }
    },
    "instructions": {
        "machine_learning_instruction": {
            "type": "TrainMLModel",
            "settings": [
                {
                    "encoding": "e1",
                    "ml_method": "simpleLR"
                },
                {
                    "encoding": "e2",
                    "ml_method": "simpleLR"
                }
            ],
            "assessment": {
                "split_strategy": "random",
                "split_count": 1,
                "training_percentage": 0.7,
                "reports": {
                    "data_splits": ["rep1"],
                    'models': ["coef"]
                }
            },
            "selection": {
                "split_strategy": "random",
                "split_count": 1,
                "training_percentage": 0.7,
                "reports": {
                    "data_splits": ["rep1"],
                    "models": [],
                }
            },
            "labels": ["my_signal"],
            "dataset": "d1",
            "strategy": "GridSearch",
            "metrics": ["accuracy"],
            "reports": ["hprep"],
            "number_of_processes": 3,
            "optimization_metric": "balanced_accuracy",
            "refit_optimal_model": False
        }
    }
}


In [8]:
PathBuilder.build(path)
specs_file = path / "machine_learning_analysis/specs.yaml"
with specs_file.open("w") as file:
    yaml.dump(specs, file)


In [9]:
specs_file

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/specs.yaml')

In [10]:
resultpath = path / "machine_learning_analysis/result"


In [11]:
PathBuilder.build(resultpath)

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [12]:
resultpath

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [13]:
specs

{'definitions': {'datasets': {'d1': {'format': 'AIRR',
    'params': {'path': 'C:\\Users\\karth\\Desktop\\PhD projects\\immuneML\\immuneML\\quickstart\\result1\\simulation_instruction\\exported_dataset\\airr',
     'metadata_file': 'C:\\Users\\karth\\Desktop\\PhD projects\\immuneML\\immuneML\\quickstart\\result1\\simulation_instruction\\exported_dataset\\airr\\metadata.csv'}}},
  'encodings': {'e1': {'KmerFrequency': {'k': 3}},
   'e2': {'KmerFrequency': {'k': 2}}},
  'ml_methods': {'simpleLR': {'LogisticRegression': {'C': 0.1,
     'penalty': 'l1',
     'max_iter': 200}}},
  'reports': {'rep1': {'SequenceLengthDistribution': {'batch_size': 3}},
   'hprep': 'MLSettingsPerformance',
   'coef': 'Coefficients'}},
 'instructions': {'machine_learning_instruction': {'type': 'TrainMLModel',
   'settings': [{'encoding': 'e1', 'ml_method': 'simpleLR'},
    {'encoding': 'e2', 'ml_method': 'simpleLR'}],
   'assessment': {'split_strategy': 'random',
    'split_count': 1,
    'training_percentage':

In [14]:
app = ImmuneMLApp(specs_file, resultpath /
                  "machine_learning_analysis/result")
app.run()


2022-04-22 11:43:45.384578: Setting temporary cache path to ..\..\quickstart\machine_learning_analysis\result\machine_learning_analysis\result\cache
2022-04-22 11:43:45.385559: ImmuneML: parsing the specification...

2022-04-22 11:44:02.707797: Full specification is available at ..\..\quickstart\machine_learning_analysis\result\machine_learning_analysis\result\full_specs.yaml.

2022-04-22 11:44:02.707797: ImmuneML: starting the analysis...

2022-04-22 11:44:02.708812: Instruction 1/1 has started.
2022-04-22 11:44:02.764800: Training ML model: running outer CV loop: started split 1/1.

2022-04-22 11:44:02.819802: Hyperparameter optimization: running the inner loop of nested CV: selection for label my_signal (label 1 / 1).

2022-04-22 11:44:02.822806: Evaluating hyperparameter setting: e1_simpleLR...
2022-04-22 11:44:02.825804: Encoding started...
2022-04-22 11:44:06.829835: Encoding finished.
2022-04-22 11:44:06.830835: ML model training started...


FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\quickstart\\machine_learning_analysis\\result\\machine_learning_analysis\\result\\machine_learning_instruction\\split_1\\selection_random\\split_1\\my_signal_e1_simpleLR\\ml_method\\logistic_regression.pickle'

### ImmuneMLAPP

##### Set path and cache

In [None]:
from immuneML.caching.CacheType import CacheType
from immuneML.environment.Constants import Constants


In [56]:
for key in specs["instructions"]:
    print(key)


machine_learning_instruction


In [57]:
from immuneML.dsl.DefaultParamsLoader import DefaultParamsLoader


In [61]:
key

'machine_learning_instruction'

In [60]:
default_params = DefaultParamsLoader.load(
    "instructions/", specs["instructions"][key]["type"])

default_params

{'reports': [],
 'strategy': 'GridSearch',
 'number_of_processes': 4,
 'refit_optimal_model': False,
 'metrics': [],
 'assessment': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7},
 'selection': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7}}

In [64]:
parser = ReflectionHandler.get_class_by_name("{}Parser".format(specs["instructions"][key]["type"]), "instruction_parsers/")()


<immuneML.dsl.instruction_parsers.TrainMLModelParser.TrainMLModelParser at 0x23ff3e0fcd0>

### Instruction Parsers

In [66]:
key = "machine_learning_instruction"


In [70]:
instruction = {**default_params, **specs["instructions"][key]}


result_path


WindowsPath('../../quickstart/machine_learning_analysis/result')

In [1]:
instruction

NameError: name 'instruction' is not defined

In [69]:
symbol_table.__dict__

{'_items': {'e1': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff47faf40>,
  'e2': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff45c4040>,
  'simpleLR': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff45c4100>,
  'coef': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff55a9100>,
  'hprep': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff74c6e80>,
  'rep1': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff55a91f0>,
  'd1': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff74d1070>,
  'machine_learning_instruction': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff74e6eb0>,
  'output': <immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff74e65b0>}}

In [77]:
symbol_table.get('output')

{'format': 'HTML'}

### TrainML Model parser

In [71]:
instructions_object = parser.parse(key, instruction, symbol_table, result_path)


In [None]:
class TrainMLModelParser:

    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> TrainMLModelInstruction:

        valid_keys = ["assessment", "selection", "dataset", "strategy", "labels", "metrics", "settings", "number_of_processes", "type", "reports",
                      "optimization_metric", 'refit_optimal_model']
        ParameterValidator.assert_type_and_value(
            instruction['settings'], list, TrainMLModelParser.__name__, 'settings')
        ParameterValidator.assert_keys(list(
            instruction.keys()), valid_keys, TrainMLModelParser.__name__, "TrainMLModel")
        ParameterValidator.assert_type_and_value(
            instruction['refit_optimal_model'], bool, TrainMLModelParser.__name__, 'refit_optimal_model')
        ParameterValidator.assert_type_and_value(
            instruction['metrics'], list, TrainMLModelParser.__name__, 'metrics')
        ParameterValidator.assert_type_and_value(
            instruction['optimization_metric'], str, TrainMLModelParser.__name__, 'optimization_metric')
        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'], int, TrainMLModelParser.__name__, 'number_of_processes')
        ParameterValidator.assert_type_and_value(
            instruction['strategy'], str, TrainMLModelParser.__name__, 'strategy')
        if instruction["reports"] is not None:
            ParameterValidator.assert_type_and_value(
                instruction['reports'], list, TrainMLModelParser.__name__, 'reports')

        settings = self._parse_settings(instruction, symbol_table)
        dataset = symbol_table.get(instruction["dataset"])
        label_config = LabelHelper.create_label_config(
            instruction["labels"], dataset, TrainMLModelParser.__name__, key)
        assessment = self._parse_split_config(
            key, instruction, "assessment", symbol_table, len(settings), label_config)
        selection = self._parse_split_config(
            key, instruction, "selection", symbol_table, len(settings), label_config)
        assessment, selection = self._update_split_configs(
            assessment, selection, dataset)
        strategy = ReflectionHandler.get_class_by_name(
            instruction["strategy"], "hyperparameter_optimization/")
        metrics = {Metric[metric.upper()] for metric in instruction["metrics"]}
        optimization_metric = Metric[instruction["optimization_metric"].upper(
        )]
        metric_search_criterion = Metric.get_search_criterion(
            optimization_metric)
        path = self._prepare_path(instruction)
        context = self._prepare_context(instruction, symbol_table)
        reports = self._prepare_reports(instruction["reports"], symbol_table)

        hp_instruction = TrainMLModelInstruction(dataset=dataset, hp_strategy=strategy(settings, metric_search_criterion),
                                                 hp_settings=settings, assessment=assessment, selection=selection, metrics=metrics,
                                                 optimization_metric=optimization_metric, refit_optimal_model=instruction[
                                                     'refit_optimal_model'],
                                                 label_configuration=label_config, path=path, context=context,
                                                 number_of_processes=instruction["number_of_processes"], reports=reports, name=key)

        return hp_instruction


In [73]:
instructions_object.__dict__

{'state': TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000023FF7526FA0>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000023FF75009D0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF55A9700>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF3DF26D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF7500DF0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF75004F0>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000023FF75000D0>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a

In [39]:
specification_path = Path(specs_file)
result_path = Path(os.path.relpath(resultpath))
PathBuilder.build(result_path)

cache_path = result_path / "cache"



In [44]:
def set_cache( ):
    os.environ["cache_type"] = "production"
    EnvironmentSettings.set_cache_path(cache_path)

def clear_cache( ):
        shutil.rmtree(cache_path, ignore_errors=True)
        EnvironmentSettings.reset_cache_path()
        del os.environ[Constants.CACHE_TYPE]

In [45]:
set_cache()

2022-04-20 13:41:03.513241: Setting temporary cache path to ..\..\quickstart\machine_learning_analysis\result\cache


### ImmuneMLPraser

In [63]:

from immuneML.dsl.ImmuneMLParser import ImmuneMLParser

from immuneML.caching.CacheType import CacheType
from immuneML.dsl.ImmuneMLParser import ImmuneMLParser
from immuneML.dsl.semantic_model.SemanticModel import SemanticModel
from immuneML.dsl.symbol_table.SymbolType import SymbolType
from immuneML.environment.Constants import Constants
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.ReflectionHandler import ReflectionHandler


In [47]:
symbol_table, specification_path = ImmuneMLParser.parse_yaml_file(
    specification_path, result_path)


2022-04-20 13:44:05.708609: Full specification is available at ..\..\quickstart\machine_learning_analysis\result\full_specs.yaml.



In [49]:
instructions = symbol_table.get_by_type(SymbolType.INSTRUCTION)


In [54]:
instructions[0].__dict__

{'symbol': 'machine_learning_instruction',
 'symbol_type': <SymbolType.INSTRUCTION: 8>,
 'item': <immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x23ff74e6df0>,
 'config': None}

In [55]:
output = symbol_table.get("output")


In [79]:
instruction

{'reports': ['hprep'],
 'strategy': 'GridSearch',
 'number_of_processes': 3,
 'refit_optimal_model': False,
 'metrics': ['accuracy'],
 'assessment': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7,
  'reports': {'data_splits': ['rep1'], 'models': ['coef']}},
 'selection': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7,
  'reports': {'data_splits': ['rep1'], 'models': []}},
 'type': 'TrainMLModel',
 'settings': [{'encoding': 'e1',
   'ml_method': 'simpleLR',
   'preprocessing': None},
  {'encoding': 'e2', 'ml_method': 'simpleLR', 'preprocessing': None}],
 'labels': ['my_signal'],
 'dataset': 'd1',
 'optimization_metric': 'balanced_accuracy'}

In [80]:
instructions_object.__dict__


{'state': TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000023FF7526FA0>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000023FF75009D0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF55A9700>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF3DF26D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF7500DF0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF75004F0>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000023FF75000D0>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a

### Sematic Model

In [81]:
instructions1 = symbol_table.get_by_type(SymbolType.INSTRUCTION)


In [83]:
instructions1[0].__dict__

{'symbol': 'machine_learning_instruction',
 'symbol_type': <SymbolType.INSTRUCTION: 8>,
 'item': <immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x23ff74e6df0>,
 'config': None}

In [85]:
for instruction in instructions1:

    print(instruction.item)


<immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction object at 0x0000023FF74E6DF0>


In [87]:
output = symbol_table.get("output")

output

{'format': 'HTML'}

In [92]:
report_builder = ReflectionHandler.get_class_by_name(
    f"{output['format']}Builder", "presentation/")


In [93]:
report_builder

immuneML.presentation.html.HTMLBuilder.HTMLBuilder

In [89]:
for index, instruction in enumerate([instruction.item]):

    print(index, instruction)

0 <immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction object at 0x0000023FF74E6DF0>


In [99]:
instructions

[<immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x23ff74e6eb0>]

In [100]:
model = SemanticModel(
    [instruction.item for instruction in instructions], result_path, output)


#### TrainMLModelinstruction

In [106]:
 state = instruction.__dict__['state']

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [109]:
state.path = resultpath

In [110]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000023FF7526FA0>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000023FF74E6880>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF74E6CA0>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000023FF74E6820>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF74E6A60>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000023FF74E68B0>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000023FF74E6A90>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result'), c

In [None]:
{'reports': ['hprep'],
 'strategy': 'GridSearch',
 'number_of_processes': 3,
 'refit_optimal_model': False,
 'metrics': ['accuracy'],
 'assessment': {'split_strategy': 'random',
                'split_count': 1,
                'training_percentage': 0.7,
                'reports': {'data_splits': ['rep1'], 'models': ['coef']}},
 'selection': {'split_strategy': 'random',
               'split_count': 1,
               'training_percentage': 0.7,
               'reports': {'data_splits': ['rep1'], 'models': []}},
 'type': 'TrainMLModel',
 'settings': [{'encoding': 'e1',
               'ml_method': 'simpleLR',
               'preprocessing': None},
              {'encoding': 'e2', 'ml_method': 'simpleLR', 'preprocessing': None}],
 'labels': ['my_signal'],
 'dataset': 'd1',
 'optimization_metric': 'balanced_accuracy'}


##### HPA Assement

In [None]:
from immuneML.hyperparameter_optimization.core.HPAssessment import HPAssessment
