In [2]:
import logging
import os
import shutil
import sys
import warnings
from pathlib import Path

import yaml

from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def build_path(path: str = None):
    if path is None:
        path = EnvironmentSettings.root_path / "quickstart/"
        if os.path.isdir(path):
            shutil.rmtree(path)
        PathBuilder.build(path)
    else:
        path = PathBuilder.build(path)
    return path


In [4]:
path = build_path()

In [5]:
path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart')

In [6]:
specs = {
    "definitions": {
        "datasets": {
            "my_synthetic_dataset": {"format": "RandomRepertoireDataset", "params": {"labels": {}}}
        },
        "motifs": {"my_motif": {"seed": "AA", "instantiation": "GappedKmer"}},
        "signals": {"my_signal": {"motifs": ["my_motif"], "implanting": "HealthySequence"}},
        "simulations": {"my_simulation": {"my_implantng": {"signals": ["my_signal"], "dataset_implanting_rate": 0.5,
                                                           "repertoire_implanting_rate": 0.1}}}
    },
    "instructions": {"simulation_instruction": {"type": "Simulation", "dataset": "my_synthetic_dataset", "simulation": "my_simulation",
                                                "export_formats": ["AIRR"]}}
}


In [7]:
specs_file = path / "simulation_specs.yaml"
with specs_file.open("w") as file:
    yaml.dump(specs, file)


In [8]:
app = ImmuneMLApp(specs_file, path / "result")

app.run()


2022-05-24 11:10:12.848760: Setting temporary cache path to ..\..\..\immuneML\quickstart\result\cache
2022-05-24 11:10:12.849760: ImmuneML: parsing the specification...

2022-05-24 11:10:14.074323: Full specification is available at ..\..\..\immuneML\quickstart\result\full_simulation_specs.yaml.

2022-05-24 11:10:14.075326: ImmuneML: starting the analysis...

2022-05-24 11:10:14.076324: Instruction 1/1 has started.




2022-05-24 11:10:24.283321: Instruction 1/1 has finished.
2022-05-24 11:10:24.304320: Generating HTML reports...
2022-05-24 11:10:24.810366: HTML reports are generated.
2022-05-24 11:10:24.812365: ImmuneML: finished analysis.



[SimulationState(signals=[<immuneML.simulation.implants.Signal.Signal object at 0x0000027E0C48F880>], simulation=<immuneML.simulation.Simulation.Simulation object at 0x0000027E0C48F1C0>, dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E0C7601C0>, formats=['AIRR'], paths={'my_synthetic_dataset': {'AIRR': WindowsPath('../../../immuneML/quickstart/result/simulation_instruction/exported_dataset/airr')}}, resulting_dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E7336D820>, result_path=WindowsPath('../../../immuneML/quickstart/result/simulation_instruction'), name='simulation_instruction')]

In [9]:
path = EnvironmentSettings.root_path / "quickstart/"


In [10]:
path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart')

In [11]:
specs = {
    "definitions": {
        "datasets": {
            "d1": {
                "format": "AIRR",
                "params": {
                    "path": str(path / "result/simulation_instruction/exported_dataset/airr/"),
                    "metadata_file": str(path / "result/simulation_instruction/exported_dataset/airr/metadata.csv")
                }
            }
        },
        "encodings": {
            "e1": {
                "KmerFrequency": {
                    "k": 3
                }
            },
            "e2": {
                "KmerFrequency": {
                    "k": 2
                }
            }
        },
        "ml_methods": {
            "simpleLR": {
                "LogisticRegression": {
                    "C": 0.1,
                    "penalty": "l1",
                    "max_iter": 200
                }}
        },
        "reports": {
            "rep1": {
                "SequenceLengthDistribution": {
                    "batch_size": 3
                }
            },
            "hprep": "MLSettingsPerformance",
            "coef": "Coefficients"
        }
    },
    "instructions": {
        "machine_learning_instruction": {
            "type": "TrainMLModel",
            "settings": [
                {
                    "encoding": "e1",
                    "ml_method": "simpleLR"
                },
                {
                    "encoding": "e2",
                    "ml_method": "simpleLR"
                }
            ],
            "assessment": {
                "split_strategy": "random",
                "split_count": 1,
                "training_percentage": 0.7,
                "reports": {
                    "data_splits": ["rep1"],
                    'models': ["coef"]
                }
            },
            "selection": {
                "split_strategy": "random",
                "split_count": 1,
                "training_percentage": 0.7,
                "reports": {
                    "data_splits": ["rep1"],
                    "models": [],
                }
            },
            "labels": ["my_signal"],
            "dataset": "d1",
            "strategy": "GridSearch",
            "metrics": ["accuracy"],
            "reports": ["hprep"],
            "number_of_processes": 3,
            "optimization_metric": "balanced_accuracy",
            "refit_optimal_model": False
        }
    }
}


In [12]:
resultpath = path / "machine_learning_analysis/result"


In [13]:
PathBuilder.build(resultpath)


WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [14]:
PathBuilder.build(path)
specs_file = path / "machine_learning_analysis/specs.yaml"
with specs_file.open("w") as file:
    yaml.dump(specs, file)


In [15]:
specs_file

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/specs.yaml')

In [16]:
from immuneML.dsl.DefaultParamsLoader import DefaultParamsLoader
from immuneML.environment.Constants import Constants
from immuneML.caching.CacheType import CacheType


In [17]:
for key in specs["instructions"]:
    default_params = DefaultParamsLoader.load(
        "instructions/", specs["instructions"][key]["type"])

    print(specs["instructions"][key]["type"])

    print(default_params)


TrainMLModel
{'reports': [], 'strategy': 'GridSearch', 'number_of_processes': 4, 'refit_optimal_model': False, 'metrics': [], 'assessment': {'split_strategy': 'random', 'split_count': 1, 'training_percentage': 0.7}, 'selection': {'split_strategy': 'random', 'split_count': 1, 'training_percentage': 0.7}}


In [18]:
from immuneML.util.ReflectionHandler import ReflectionHandler


In [19]:
parser = ReflectionHandler.get_class_by_name("{}Parser".format(
    specs["instructions"][key]["type"]), "instruction_parsers/")()


In [20]:
parser

<immuneML.dsl.instruction_parsers.TrainMLModelParser.TrainMLModelParser at 0x27e0c760ac0>

In [21]:

from immuneML.dsl.ImmuneMLParser import ImmuneMLParser

from immuneML.caching.CacheType import CacheType
from immuneML.dsl.ImmuneMLParser import ImmuneMLParser
from immuneML.dsl.semantic_model.SemanticModel import SemanticModel
from immuneML.dsl.symbol_table.SymbolType import SymbolType
from immuneML.environment.Constants import Constants
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.ReflectionHandler import ReflectionHandler


In [22]:
from immuneML.ml_methods.LogisticRegression import LogisticRegression

In [23]:
MLMethod = LogisticRegression()

In [24]:
symbol_table, specification_path = ImmuneMLParser.parse_yaml_file(
    specs_file, resultpath)


2022-05-24 11:10:39.546129: Full specification is available at C:\Users\karth\Desktop\PhD projects\immuneML\immuneML\quickstart\machine_learning_analysis\result\full_specs.yaml.



In [25]:
specification_path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/full_specs.yaml')

In [26]:
output = symbol_table.get("output")
output

{'format': 'HTML'}

In [27]:
instructions = symbol_table.get_by_type(SymbolType.INSTRUCTION)


In [28]:
instructions[0].__dict__

{'symbol': 'machine_learning_instruction',
 'symbol_type': <SymbolType.INSTRUCTION: 8>,
 'item': <immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x27e105b51c0>,
 'config': None}

In [29]:
from immuneML.dsl.DefaultParamsLoader import DefaultParamsLoader


In [30]:
for key in specs["instructions"]:

    key = key

In [31]:
key

'machine_learning_instruction'

In [32]:
default_params = DefaultParamsLoader.load(
    "instructions/", specs["instructions"][key]["type"])

default_params


{'reports': [],
 'strategy': 'GridSearch',
 'number_of_processes': 4,
 'refit_optimal_model': False,
 'metrics': [],
 'assessment': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7},
 'selection': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7}}

In [33]:
instruction = {**default_params, **specs["instructions"][key]}


In [34]:
instruction

{'reports': ['hprep'],
 'strategy': 'GridSearch',
 'number_of_processes': 3,
 'refit_optimal_model': False,
 'metrics': ['accuracy'],
 'assessment': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7,
  'reports': {'data_splits': ['rep1'], 'models': ['coef']}},
 'selection': {'split_strategy': 'random',
  'split_count': 1,
  'training_percentage': 0.7,
  'reports': {'data_splits': ['rep1'], 'models': []}},
 'type': 'TrainMLModel',
 'settings': [{'encoding': 'e1', 'ml_method': 'simpleLR'},
  {'encoding': 'e2', 'ml_method': 'simpleLR'}],
 'labels': ['my_signal'],
 'dataset': 'd1',
 'optimization_metric': 'balanced_accuracy'}

In [35]:
resultpath

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [36]:
parser = ReflectionHandler.get_class_by_name("{}Parser".format(
    specs["instructions"][key]["type"]), "instruction_parsers/")()


In [37]:
parser

<immuneML.dsl.instruction_parsers.TrainMLModelParser.TrainMLModelParser at 0x27e0c760190>

In [38]:
instructions_object = parser.parse(key, instruction , symbol_table, resultpath )

In [39]:
instructions_object

<immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x27e105b6b50>

In [40]:
instructions_object.__dict__

{'state': TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a

In [41]:
state = instructions_object.state

In [42]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a635b0547')

In [43]:
state.dataset.get_metadata_fields()

['my_signal', 'subject_id', 'identifier', 'filename']

In [44]:
state.dataset.get_filenames()

[WindowsPath('repertoires/1b0b00b15de940d29cb2b93785bf56b4.npy'),
 WindowsPath('repertoires/e9030a3054e74e809985664108993718.npy'),
 WindowsPath('repertoires/0141afbeefd4427b8ac6808637808210.npy'),
 WindowsPath('repertoires/7e7e75819126449dbc68b7e8e0bb44e1.npy'),
 WindowsPath('repertoires/534cb2c096bb456595925d170f899b3e.npy'),
 WindowsPath('repertoires/a67b0b669bcb44459be9d42c7c8e886d.npy'),
 WindowsPath('repertoires/53ccc6af09cd41a9aa21f59b1ee20aae.npy'),
 WindowsPath('repertoires/a8f621bc294a4497b0bb992af896e672.npy'),
 WindowsPath('repertoires/b65e572a12c749d29de7700f140fa455.npy'),
 WindowsPath('repertoires/4202c9f1332e41f1b23451dac71884dd.npy'),
 WindowsPath('repertoires/e2552214b84b4223b89d78c623accce7.npy'),
 WindowsPath('repertoires/8b58a4183132412d836656416bd9ba21.npy'),
 WindowsPath('repertoires/3d5d0a99e2f445ee837082d1c4742e13.npy'),
 WindowsPath('repertoires/78de5469429c4fa1a51a3a57a4958a98.npy'),
 WindowsPath('repertoires/d2752e8e0cf349b683d99f4b327186be.npy'),
 WindowsPa

In [45]:
state.dataset.get_data()

[<immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0f40>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0a90>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0ac0>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0bb0>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0400>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0850>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0520>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0610>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b09a0>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0640>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0730>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0760>,
 <immuneML.data_model.repertoire.Repertoire.Repertoire at 0x27e105b0f10>,
 <immuneML.data_model.repertoire.Reper

In [46]:
state.hp_settings[0]

<immuneML.hyperparameter_optimization.HPSetting.HPSetting at 0x27e0c7d2820>

In [47]:
state.hp_strategy.hp_settings['e1_simpleLR'].encoder_params

{'normalization_type': 'l2',
 'reads': 'unique',
 'sequence_encoding': 'continuous_kmer',
 'scale_to_unit_variance': True,
 'scale_to_zero_mean': False,
 'k': 3,
 'k_left': 1,
 'k_right': 1,
 'max_gap': 0,
 'min_gap': 0,
 'sequence_type': 'amino_acid',
 'name': 'e1'}

In [48]:
state.hp_strategy

<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch at 0x27e0c495dc0>

In [49]:
state.hp_strategy.hp_settings['e1_simpleLR'].encoder

<immuneML.encodings.kmer_frequency.KmerFreqRepertoireEncoder.KmerFreqRepertoireEncoder at 0x27e7450c340>

In [50]:
state.hp_strategy.hp_settings['e1_simpleLR'].ml_params

{'model_selection_cv': False,
 'model_selection_n_folds': -1,
 'LogisticRegression': {'C': 0.1, 'max_iter': 200, 'penalty': 'l1'}}

In [51]:
state.hp_strategy.hp_settings['e1_simpleLR'].ml_method

<immuneML.ml_methods.LogisticRegression.LogisticRegression at 0x27e0d281070>

In [52]:
state.assessment.split_count

1

In [53]:
state.assessment.split_strategy

<SplitType.RANDOM: 2>

In [54]:
state.assessment.training_percentage

0.7

In [55]:
state.assessment_states

[]

In [56]:
state.selection.reports.data_split_reports['rep1']

<immuneML.reports.data_reports.SequenceLengthDistribution.SequenceLengthDistribution at 0x27e10592e80>

In [57]:

instructions = symbol_table.get_by_type(SymbolType.INSTRUCTION)
output = symbol_table.get("output")


In [58]:
instructions

[<immuneML.dsl.symbol_table.SymbolTableEntry.SymbolTableEntry at 0x27e105b52e0>]

In [59]:
output

{'format': 'HTML'}

In [60]:
symbol_table.add(key, SymbolType.INSTRUCTION, instructions_object)




In [61]:
instructions_object.state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a635b0547')

In [62]:
specs["output"] = {"format": "HTML"}


In [63]:
symbol_table.add("output", SymbolType.OUTPUT, specs["output"])




<h3> TrainMLModelInstruction </h3>

In [64]:

instructions_object
# instructions_object.run(resultpath)


<immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x27e105b6b50>

In [65]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/analysis_runs/7445dd22466e58dfe647730a635b0547')

In [66]:
state.path = resultpath

In [67]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result'), c

In [68]:
state.number_of_processes

3

<h3> Inject Dask Cluster or Client  object </h3>

<h4> Perform Grid Search </h4>
<ol> Tasks </ol>
<li> Hyperparameter optimization
- Grid Search
- Joblib
- Training with large data
- Creating large data 
<li> Onnx 
<li> Prefect
<li>Yaml to operations<li> H5py <li> parquet support
<li> Weights and bias
<li> Deepchecks

In [69]:
state.dataset.get_filenames()


[WindowsPath('repertoires/1b0b00b15de940d29cb2b93785bf56b4.npy'),
 WindowsPath('repertoires/e9030a3054e74e809985664108993718.npy'),
 WindowsPath('repertoires/0141afbeefd4427b8ac6808637808210.npy'),
 WindowsPath('repertoires/7e7e75819126449dbc68b7e8e0bb44e1.npy'),
 WindowsPath('repertoires/534cb2c096bb456595925d170f899b3e.npy'),
 WindowsPath('repertoires/a67b0b669bcb44459be9d42c7c8e886d.npy'),
 WindowsPath('repertoires/53ccc6af09cd41a9aa21f59b1ee20aae.npy'),
 WindowsPath('repertoires/a8f621bc294a4497b0bb992af896e672.npy'),
 WindowsPath('repertoires/b65e572a12c749d29de7700f140fa455.npy'),
 WindowsPath('repertoires/4202c9f1332e41f1b23451dac71884dd.npy'),
 WindowsPath('repertoires/e2552214b84b4223b89d78c623accce7.npy'),
 WindowsPath('repertoires/8b58a4183132412d836656416bd9ba21.npy'),
 WindowsPath('repertoires/3d5d0a99e2f445ee837082d1c4742e13.npy'),
 WindowsPath('repertoires/78de5469429c4fa1a51a3a57a4958a98.npy'),
 WindowsPath('repertoires/d2752e8e0cf349b683d99f4b327186be.npy'),
 WindowsPa

In [70]:
state.dataset

<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x27e105ade80>

In [71]:
state.dataset.metadata_file

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/datasets/d1/d1_metadata.csv')

In [72]:
import dask.dataframe as dd

In [73]:
import pandas as pd

In [74]:
metadata = pd.read_csv(state.dataset.metadata_file , header = 1)


In [75]:
state.label_configuration

<immuneML.environment.LabelConfiguration.LabelConfiguration at 0x27e0c48ff70>

In [76]:
state.path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result')

In [77]:
state.dataset.get_example_ids()

['51c59d7f79454b43b8198f0f48e02cc1',
 '3da090c00ae6471c974e9096305ece3a',
 'a9416fa713f84c3b906552bfc582a66c',
 'cb9af10cefd945a29fdcc8c87b5288b8',
 '8c7a1958e9ea4f2aaa16839d8f99aa84',
 'b599141cbc3746c09bf992987114f146',
 'daaa096d6af74b42ad9e0846f546ca0e',
 'b0c510d0a0c04c5197b4b6bf26ac201f',
 '226da91152b84e158fedf77bf6b96799',
 '555fac30cd134c6e8ed9dbcd4dfd53ac',
 'd0457289ff3b4b52b47f8fc953fa0e37',
 '01338a0a074043eb868168d4d6568cc5',
 'e055f9dfe14942349c42343d25c45d47',
 '04d88a16fa19407fa87d2d745c84c900',
 '10f9a1936a2b41acbbc06910784a8246',
 '91cc39fdb05c46de9ded1d74607d67be',
 '78a5887553bc4d7ea6eb327b40d59cbd',
 'a8fafcacb12e402bb1600c0ccbb03471',
 'e4f69934a02b45c3851d8bd05e9d0e06',
 '5421d59fad2d4b1eb0d5d5e07c2280af',
 '30eefb5dd4e54de7a536c3029989b292',
 '51187592298b43469f27ed42f35867a4',
 'a7075f452c024ae9882498ae95ed6b02',
 '7044a662daac46e5bc14668512ef7c6e',
 '735b3e21c3964a43bb86f72e35a9d308',
 '0463cd303472457292593403a4afbea1',
 'def941375edf4306a2d572bb66c7525d',
 

In [78]:
encoder = state.hp_strategy.hp_settings['e1_simpleLR'].encoder

In [79]:
ml_params = state.hp_strategy.hp_settings['e1_simpleLR'].ml_params

In [80]:
encoder_params = state.hp_strategy.hp_settings['e1_simpleLR'].encoder_params

In [81]:
label_config  = state.label_configuration

In [82]:
label = label_config.get_label_object("my_signal")

In [83]:
label.name
label.values

[False, True]

In [84]:
metadata

Unnamed: 0,my_signal,subject_id,identifier,filename
0,True,rep_0,51c59d7f79454b43b8198f0f48e02cc1,repertoires\1b0b00b15de940d29cb2b93785bf56b4.npy
1,True,rep_1,3da090c00ae6471c974e9096305ece3a,repertoires\e9030a3054e74e809985664108993718.npy
2,True,rep_2,a9416fa713f84c3b906552bfc582a66c,repertoires\0141afbeefd4427b8ac6808637808210.npy
3,True,rep_3,cb9af10cefd945a29fdcc8c87b5288b8,repertoires\7e7e75819126449dbc68b7e8e0bb44e1.npy
4,True,rep_4,8c7a1958e9ea4f2aaa16839d8f99aa84,repertoires\534cb2c096bb456595925d170f899b3e.npy
...,...,...,...,...
95,False,rep_95,ed16409f8355430b9782437eda4de015,repertoires\f148da942b3b4978a9780d778f504240.npy
96,False,rep_96,12cfaaac4c7048afa5d4268374f37283,repertoires\aa4c5bcfc1a344b182511f6807b443d1.npy
97,False,rep_97,94bbe9468fe346778be08763f8ca8b91,repertoires\4768b610335548b98a65e811e69eaa16.npy
98,False,rep_98,60dd92cd1aa346dca7f36a73fea8c456,repertoires\3176d33742364c348703e054cc4eaebb.npy


In [85]:
metadata

Unnamed: 0,my_signal,subject_id,identifier,filename
0,True,rep_0,51c59d7f79454b43b8198f0f48e02cc1,repertoires\1b0b00b15de940d29cb2b93785bf56b4.npy
1,True,rep_1,3da090c00ae6471c974e9096305ece3a,repertoires\e9030a3054e74e809985664108993718.npy
2,True,rep_2,a9416fa713f84c3b906552bfc582a66c,repertoires\0141afbeefd4427b8ac6808637808210.npy
3,True,rep_3,cb9af10cefd945a29fdcc8c87b5288b8,repertoires\7e7e75819126449dbc68b7e8e0bb44e1.npy
4,True,rep_4,8c7a1958e9ea4f2aaa16839d8f99aa84,repertoires\534cb2c096bb456595925d170f899b3e.npy
...,...,...,...,...
95,False,rep_95,ed16409f8355430b9782437eda4de015,repertoires\f148da942b3b4978a9780d778f504240.npy
96,False,rep_96,12cfaaac4c7048afa5d4268374f37283,repertoires\aa4c5bcfc1a344b182511f6807b443d1.npy
97,False,rep_97,94bbe9468fe346778be08763f8ca8b91,repertoires\4768b610335548b98a65e811e69eaa16.npy
98,False,rep_98,60dd92cd1aa346dca7f36a73fea8c456,repertoires\3176d33742364c348703e054cc4eaebb.npy


In [86]:
filename = metadata.filename

In [87]:
signal = metadata.my_signal

<h3> Split data and run Assessment

In [88]:
from immuneML.hyperparameter_optimization.core.HPAssessment import HPAssessment


In [89]:
state = HPAssessment._create_root_path(state)

In [90]:
from immuneML.hyperparameter_optimization.core.HPUtil import HPUtil


In [91]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/mach

In [92]:
state.path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/machine_learning_instruction')

In [93]:
train_val_datasets, test_datasets = HPUtil.split_data(
    state.dataset, state.assessment, state.path, state.label_configuration)


In [94]:
train_val_datasets[0].__dict__

{'encoded_data': None,
 'identifier': '6567663e-db41-11ec-a23b-9cb6d0fe1bec',
 'name': '6567663e-db41-11ec-a23b-9cb6d0fe1bec',
 'labels': {'identifier': ['a3f5e2361e5f45869cf2bb18e061d0f7',
   '60a973e97f2c4fd48d4b7aae952f4e28',
   '32f32b893f6747a598c4e36da5cd3d2f',
   '9d9e841898cb46e3929bd1e86a869572',
   'd991bd8e3eb3496a9859d6dca0ab3a9a',
   'dc086370aa724984b52195c7cb759296',
   'fc62a4511edc4bbcafb598eaf7a1b6c4',
   'e2552214b84b4223b89d78c623accce7',
   '7777c717844647488ef113e9f4f0037c',
   '78de5469429c4fa1a51a3a57a4958a98',
   '27c42bca98f641259642d41ff2dc4562',
   'e9030a3054e74e809985664108993718',
   '5fffb8866c074ab29fde7a3b80b59eec',
   '5d61c1e46f1f4a28808623f32b9ee6aa',
   '96c18bd7e7c441328c18b31e8db5e1f6',
   '8b58a4183132412d836656416bd9ba21',
   'a1779d7cc5c744e1a0151c6fb74ce8c3',
   '1129c66ff15b43f5b5836496d6e0fb42',
   'a476688c246b4a8ca33bdea56cf1cef0',
   'a67b0b669bcb44459be9d42c7c8e886d',
   '1b0b00b15de940d29cb2b93785bf56b4',
   'f18458773861488d9a7cbf5ac7

In [95]:
n_splits = len(train_val_datasets)
n_splits

1

In [96]:
train_val_datasets

[<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x27e7450ce80>]

In [97]:
def create_selection_path(current_path = state.path):
    
    sel_path = current_path / \
        f"slection_{state.selection.split_strategy.name.lower()}"
        
    PathBuilder.build(sel_path)
    
    return sel_path

selection_path = create_selection_path()

In [98]:
train_datasets , test_datasets = HPUtil.split_data(train_val_datasets[0] , state.selection , selection_path , state.label_configuration)

In [99]:
train_dataset = train_datasets[0]

test_dataset = test_datasets[0]

In [108]:
hp_setting = state.hp_strategy.generate_next_setting()

In [109]:
hp_setting

<immuneML.hyperparameter_optimization.HPSetting.HPSetting at 0x27e138dca90>

In [110]:
selection_path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/machine_learning_instruction/slection_random')

<h3>Encode Dataset

In [111]:
encoded_train = HPUtil.encode_dataset(train_dataset, hp_setting , selection_path / "encoded_dataset" , learn_model = True , context = state.context, number_of_processes= state.number_of_processes , label_configuration=state.label_configuration)

2022-05-24 11:26:17.027438: Encoding started...
2022-05-24 11:26:22.281437: Encoding finished.


In [116]:
encoded_train.encoded_data.examples

<49x4597 sparse matrix of type '<class 'numpy.float64'>'
	with 6772 stored elements in Compressed Sparse Row format>

In [169]:
encoded_train_data = encoded_train.encoded_data

In [119]:
from immuneML.ml_methods.util.Util import Util


In [170]:
encoded_train_data.labels[label.name]

[True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True]

<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x27e1114cb80>

In [171]:
class_mapping = Util.make_class_mapping(encoded_train_data.labels[label.name])


In [172]:
class_mapping

{0: False, 1: True}

In [173]:
mapped_y = Util.map_to_new_class_values(encoded_train_data.labels[label.name] , class_mapping)

In [176]:
len(mapped_y)

49

In [177]:
X_train = encoded_train.encoded_data.examples


In [178]:
X_train

<49x4597 sparse matrix of type '<class 'numpy.float64'>'
	with 6772 stored elements in Compressed Sparse Row format>

<h3>Dask workflow

In [179]:
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask.distributed import Client

client = Client()
client.dashboard_link


Perhaps you already have a cluster running?
Hosting the HTTP server on port 60880 instead


'http://127.0.0.1:60880/status'

In [208]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000027E105ADE80>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000027E0C495DC0>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C7D2820>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000027E0C48F6D0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C495CD0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000027E0C3C1160>, metrics={<Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000027E0C48FF70>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/mach

In [205]:
state.DaskClient = client

In [207]:
state.DaskClient

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:60880/status,

0,1
Dashboard: http://127.0.0.1:60880/status,Workers: 4
Total threads: 8,Total memory: 15.86 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60881,Workers: 4
Dashboard: http://127.0.0.1:60880/status,Total threads: 8
Started: 1 day ago,Total memory: 15.86 GiB

0,1
Comm: tcp://127.0.0.1:60909,Total threads: 2
Dashboard: http://127.0.0.1:60913/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:60884,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-iq48po05,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-iq48po05

0,1
Comm: tcp://127.0.0.1:60908,Total threads: 2
Dashboard: http://127.0.0.1:60910/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:60886,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-qp24ijsg,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-qp24ijsg

0,1
Comm: tcp://127.0.0.1:60917,Total threads: 2
Dashboard: http://127.0.0.1:60918/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:60887,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-s0dwsdcg,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-s0dwsdcg

0,1
Comm: tcp://127.0.0.1:60912,Total threads: 2
Dashboard: http://127.0.0.1:60915/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:60885,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-jza0sdyr,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-jza0sdyr


In [None]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.86 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:59837,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.86 GiB

0,1
Comm: tcp://127.0.0.1:59865,Total threads: 2
Dashboard: http://127.0.0.1:59869/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:59840,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-k6ayyl70,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-k6ayyl70

0,1
Comm: tcp://127.0.0.1:59866,Total threads: 2
Dashboard: http://127.0.0.1:59868/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:59842,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-q9cet14p,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-q9cet14p

0,1
Comm: tcp://127.0.0.1:59873,Total threads: 2
Dashboard: http://127.0.0.1:59875/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:59843,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-v0fbfgnt,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-v0fbfgnt

0,1
Comm: tcp://127.0.0.1:59867,Total threads: 2
Dashboard: http://127.0.0.1:59872/status,Memory: 3.96 GiB
Nanny: tcp://127.0.0.1:59841,
Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-0t6mru2q,Local directory: c:\Users\karth\Desktop\PhD projects\immuneML\dev_immuneML\immuneML\Notebook\dask-worker-space\worker-0t6mru2q


In [180]:
from sklearn.model_selection import GridSearchCV


In [181]:
from sklearn.linear_model import LogisticRegression


In [182]:
logreg = LogisticRegression()

In [190]:
parameters = [{'penalty': ['none', 'elasticnet', 'l1', 'l2']},
              {'C': [ 0.01, 0.1, 1, 10]}]


grid_search = GridSearchCV(estimator=logreg,
                           param_grid=parameters,
                           cv=3,
                           verbose=2, n_jobs=-1)




In [203]:
label.__dict__

{'name': 'my_signal',
 'values': [False, True],
 'auxiliary_label_names': None,
 'positive_class': True}

In [192]:
# ml_method = state.hp_settings[0].ml_method.fit(encoded_train.encoded_data , label )

In [193]:
encoded_train.encoded_data.feature_names

['AAA',
 'AAC',
 'AAD',
 'AAE',
 'AAF',
 'AAG',
 'AAH',
 'AAI',
 'AAK',
 'AAL',
 'AAM',
 'AAN',
 'AAP',
 'AAQ',
 'AAR',
 'AAS',
 'AAT',
 'AAW',
 'AAY',
 'ACA',
 'ACC',
 'ACD',
 'ACE',
 'ACI',
 'ACN',
 'ACP',
 'ACQ',
 'ACR',
 'ACS',
 'ACT',
 'ACV',
 'ADD',
 'ADF',
 'ADI',
 'ADM',
 'ADN',
 'ADP',
 'ADQ',
 'ADS',
 'ADT',
 'ADV',
 'AEC',
 'AEE',
 'AEF',
 'AEH',
 'AEK',
 'AEN',
 'AER',
 'AES',
 'AEV',
 'AEW',
 'AFC',
 'AFD',
 'AFE',
 'AFH',
 'AFI',
 'AFK',
 'AFL',
 'AFM',
 'AFN',
 'AFR',
 'AFS',
 'AFV',
 'AGA',
 'AGC',
 'AGE',
 'AGF',
 'AGG',
 'AGH',
 'AGI',
 'AGK',
 'AGN',
 'AGT',
 'AGW',
 'AGY',
 'AHD',
 'AHE',
 'AHF',
 'AHK',
 'AHL',
 'AHM',
 'AHS',
 'AHV',
 'AIA',
 'AIC',
 'AID',
 'AIE',
 'AIF',
 'AIG',
 'AII',
 'AIL',
 'AIN',
 'AIP',
 'AIQ',
 'AIR',
 'AIS',
 'AIT',
 'AIW',
 'AKA',
 'AKC',
 'AKE',
 'AKG',
 'AKI',
 'AKK',
 'AKL',
 'AKM',
 'AKN',
 'AKQ',
 'AKR',
 'AKS',
 'AKV',
 'ALA',
 'ALD',
 'ALF',
 'ALK',
 'ALL',
 'ALM',
 'ALP',
 'ALQ',
 'ALV',
 'ALY',
 'AMA',
 'AMC',
 'AMG',
 'AMH',


In [194]:
import joblib


In [195]:
X_train

<49x4597 sparse matrix of type '<class 'numpy.float64'>'
	with 6772 stored elements in Compressed Sparse Row format>

In [196]:
%%time
with joblib.parallel_backend("dask", scatter=[X_train, mapped_y]):
    grid_search.fit(X_train, mapped_y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
CPU times: total: 578 ms
Wall time: 802 ms


6 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\karth\anaconda3\envs\test\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\karth\anaconda3\envs\test\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\karth\anaconda3\envs\test\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet pe

In [197]:
grid_search.best_params_, grid_search.best_score_

({'penalty': 'l2'}, 0.551470588235294)

In [202]:
grid_search.best_estimator_

LogisticRegression()

In [198]:
pd.DataFrame(grid_search.cv_results_).head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.363006,0.010238,0.00266,0.000479,none,,{'penalty': 'none'},0.588235,0.6875,0.3125,0.529412,0.158643,6
1,0.001335,0.00047,0.0,0.0,elasticnet,,{'penalty': 'elasticnet'},,,,,,7
2,0.002661,0.000947,0.0,0.0,l1,,{'penalty': 'l1'},,,,,,8
3,0.472333,0.035368,0.001332,0.000469,l2,,{'penalty': 'l2'},0.529412,0.5625,0.5625,0.551471,0.015598,1
4,0.308664,0.005908,0.001666,0.000943,,0.01,{'C': 0.01},0.529412,0.5625,0.5625,0.551471,0.015598,1


In [200]:
grid_search.predict(X_train)[:5]


array([1, 0, 1, 0, 1])

In [201]:
mapped_y[:5]

array([1, 0, 1, 0, 1])

In [199]:
grid_search.score(X_train, mapped_y)


1.0

In [None]:
# for index in range(n_splits):
    
#     print(index , n_splits)
#     state = HPAssessment.run_assessment_split(state, train_val_datasets[index], test_datasets[index], index, n_splits)
    

0 1
2022-05-23 15:45:58.919902: Training ML model: running outer CV loop: started split 1/1.

2022-05-23 15:45:59.057899: Hyperparameter optimization: running the inner loop of nested CV: selection for label my_signal (label 1 / 1).

2022-05-23 15:45:59.168905: Evaluating hyperparameter setting: e1_simpleLR...
2022-05-23 15:45:59.171901: Encoding started...
2022-05-23 15:46:03.554422: Encoding finished.
2022-05-23 15:46:03.556422: ML model training started...
2022-05-23 15:46:04.029418: ML model training finished.
2022-05-23 15:46:04.030419: Encoding started...
2022-05-23 15:46:07.572459: Encoding finished.
2022-05-23 15:46:07.588461: Completed hyperparameter setting e1_simpleLR.

2022-05-23 15:46:07.591455: Evaluating hyperparameter setting: e2_simpleLR...
2022-05-23 15:46:07.594463: Encoding started...
2022-05-23 15:46:12.913201: Encoding finished.
2022-05-23 15:46:12.913201: ML model training started...
2022-05-23 15:46:13.054201: ML model training finished.
2022-05-23 15:46:13.0562


The max_iter was reached which means the coef_ did not converge



2022-05-23 15:46:24.220846: ML model training finished.
2022-05-23 15:46:24.222847: Encoding started...
2022-05-23 15:46:28.408949: Encoding finished.
2022-05-23 15:46:28.570829: Completed hyperparameter setting e1_simpleLR.

2022-05-23 15:46:28.573795: Evaluating hyperparameter setting: e2_simpleLR...
2022-05-23 15:46:28.575796: Encoding started...
2022-05-23 15:46:33.734347: Encoding finished.
2022-05-23 15:46:33.734347: ML model training started...
2022-05-23 15:46:33.823348: ML model training finished.
2022-05-23 15:46:33.825348: Encoding started...
2022-05-23 15:46:38.581927: Encoding finished.
2022-05-23 15:46:38.692930: Completed hyperparameter setting e2_simpleLR.

2022-05-23 15:46:38.693929: Training ML model: running the inner loop of nested CV: completed retraining models for label my_signal (label 1 / 1).

2022-05-23 15:46:39.252930: Training ML model: running outer CV loop: finished split 1/1.



In [None]:
state

NameError: name 'state' is not defined

In [None]:
train_val_datasets

[<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x19527e05f70>]

In [None]:
instructions_object

<immuneML.workflows.instructions.TrainMLModelInstruction.TrainMLModelInstruction at 0x19527e050d0>

In [None]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000019527DA19D0>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000019527E05E50>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000019524A40E20>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000019525897AC0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000019527E05BB0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000019527E058E0>, metrics={<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, <Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x00000195258978E0>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/im

Create Selection path

In [None]:
state.path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/machine_learning_instruction')

In [None]:
path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/machine_learning_instruction/slection_random')

In [None]:
train_datasets , val_datasets = HPUtil.split_data(train_val_datasets[0], state.selection , path, state.label_configuration)

In [None]:
train_val_datasets[0].__dict__

{'encoded_data': None,
 'identifier': 'a37b6c1e-da9d-11ec-aa5b-9cb6d0fe1bec',
 'name': 'a37b6c1e-da9d-11ec-aa5b-9cb6d0fe1bec',
 'labels': {'identifier': ['cccfb940720444989d09d7137603cf90',
   '5f8820e20939495896eefc84221b4b46',
   '2e8fceddbb67413da205330fee15d1a8',
   '78fac66301534482aba5d89a35969d6c',
   'e179bf7f6d1d438ba63f48bb52ec90d6',
   'c5f02c99332049b1ab5d93574600efb3',
   '6822f24fbdaf4d05b4682d86d360bec6',
   '09057bbb03014176918a8fe95c65ec15',
   'ba29f0a9fe6843638ff953b2731180ad',
   'a202c923acbc46ef85e93cab675d8a32',
   '3d5b6f81223a4739bc56e3980ceea591',
   '27a932da250a4abb84e881d5acbe4e62',
   '016fa8e9d6014857b62b934db610e248',
   'ac0e5866a240492aa13309ceebb6b190',
   '6eb171c7f07342978f4e07a37820dc90',
   '71a93224d8de420b9cece7043f932317',
   '77072fb76eb2465097c03951e0def51a',
   '452a17b914f34120af1312ebca4a0b7c',
   'bba37eb8cd614f60870be9abebe08b7e',
   '7f53beef82304d8c97b2e668f23099ac',
   '96862ec4c13949cbb4bf07f1cf3869fa',
   '95f6cf386e2d4e0b85230e990d

In [None]:
val_datasets[0].__dict__

{'encoded_data': None,
 'identifier': 'ad5fac06-da9e-11ec-a716-9cb6d0fe1bec',
 'name': 'ad5fac06-da9e-11ec-a716-9cb6d0fe1bec',
 'labels': {'identifier': ['cccfb940720444989d09d7137603cf90',
   '5f8820e20939495896eefc84221b4b46',
   '2e8fceddbb67413da205330fee15d1a8',
   '78fac66301534482aba5d89a35969d6c',
   'e179bf7f6d1d438ba63f48bb52ec90d6',
   'c5f02c99332049b1ab5d93574600efb3',
   '6822f24fbdaf4d05b4682d86d360bec6',
   '09057bbb03014176918a8fe95c65ec15',
   'ba29f0a9fe6843638ff953b2731180ad',
   'a202c923acbc46ef85e93cab675d8a32',
   '3d5b6f81223a4739bc56e3980ceea591',
   '27a932da250a4abb84e881d5acbe4e62',
   '016fa8e9d6014857b62b934db610e248',
   'ac0e5866a240492aa13309ceebb6b190',
   '6eb171c7f07342978f4e07a37820dc90',
   '71a93224d8de420b9cece7043f932317',
   '77072fb76eb2465097c03951e0def51a',
   '452a17b914f34120af1312ebca4a0b7c',
   'bba37eb8cd614f60870be9abebe08b7e',
   '7f53beef82304d8c97b2e668f23099ac',
   '96862ec4c13949cbb4bf07f1cf3869fa',
   '95f6cf386e2d4e0b85230e990d

In [None]:
train_datasets[0].__dict__

{'encoded_data': None,
 'identifier': 'ad572082-da9e-11ec-b20b-9cb6d0fe1bec',
 'name': 'ad572082-da9e-11ec-b20b-9cb6d0fe1bec',
 'labels': {'identifier': ['cccfb940720444989d09d7137603cf90',
   '5f8820e20939495896eefc84221b4b46',
   '2e8fceddbb67413da205330fee15d1a8',
   '78fac66301534482aba5d89a35969d6c',
   'e179bf7f6d1d438ba63f48bb52ec90d6',
   'c5f02c99332049b1ab5d93574600efb3',
   '6822f24fbdaf4d05b4682d86d360bec6',
   '09057bbb03014176918a8fe95c65ec15',
   'ba29f0a9fe6843638ff953b2731180ad',
   'a202c923acbc46ef85e93cab675d8a32',
   '3d5b6f81223a4739bc56e3980ceea591',
   '27a932da250a4abb84e881d5acbe4e62',
   '016fa8e9d6014857b62b934db610e248',
   'ac0e5866a240492aa13309ceebb6b190',
   '6eb171c7f07342978f4e07a37820dc90',
   '71a93224d8de420b9cece7043f932317',
   '77072fb76eb2465097c03951e0def51a',
   '452a17b914f34120af1312ebca4a0b7c',
   'bba37eb8cd614f60870be9abebe08b7e',
   '7f53beef82304d8c97b2e668f23099ac',
   '96862ec4c13949cbb4bf07f1cf3869fa',
   '95f6cf386e2d4e0b85230e990d

In [None]:
n_labels = state.label_configuration.get_label_count()

In [None]:
n_labels

1

In [None]:
index

0

In [None]:
label_config.get_label_object("my_signal").name

'my_signal'

In [None]:
from immuneML.hyperparameter_optimization.states.HPSelectionState import HPSelectionState


In [None]:
selection_state = HPSelectionState(train_datasets, val_datasets , path , state.hp_strategy)

In [None]:
selection_state.__dict__

{'train_datasets': [<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x1953fb5f7f0>],
 'val_datasets': [<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x1954027b670>],
 'path': WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/quickstart/machine_learning_analysis/result/machine_learning_instruction/slection_random'),
 'hp_strategy': <immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch at 0x1953fd9c700>,
 'hp_items': {'e1_simpleLR': [], 'e2_simpleLR': []},
 'train_data_reports': [],
 'val_data_reports': [],
 'data_reports': []}

In [None]:
state.assessment_states[index].label_states[label_config.get_label_object(
    "my_signal").name].selection_state = selection_state


In [None]:
state

TrainMLModelState(dataset=<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset object at 0x0000019527DA19D0>, hp_strategy=<immuneML.hyperparameter_optimization.strategy.GridSearch.GridSearch object at 0x0000019527E05E50>, hp_settings=[<immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000019524A40E20>, <immuneML.hyperparameter_optimization.HPSetting.HPSetting object at 0x0000019525897AC0>], assessment=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000019527E05BB0>, selection=<immuneML.hyperparameter_optimization.config.SplitConfig.SplitConfig object at 0x0000019527E058E0>, metrics={<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, <Metric.ACCURACY: 'accuracy_score'>}, optimization_metric=<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>, label_configuration=<immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x00000195258978E0>, path=WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/im

In [None]:
split_index = index + 1

In [None]:
from immuneML.workflows.instructions.MLProcess import MLProcess


In [None]:
hp_setting = selection_state.hp_strategy.generate_next_setting()

In [None]:
hp_setting

<immuneML.hyperparameter_optimization.HPSetting.HPSetting at 0x1954028bf40>

In [None]:
hp_item = MLProcess(train_dataset=train_datasets[0], test_dataset=val_datasets[0], encoding_reports=state.selection.reports.encoding_reports.values(), label_config=LabelConfiguration([label]), report_context=state.context,
                    number_of_processes=state.number_of_processes, metrics=state.metrics, optimization_metric=state.optimization_metric,
                    ml_reports=state.selection.reports.model_reports.values(), label=label, path=path, hp_setting=hp_setting)\
    .run(split_index)


2022-05-24 06:03:53.440487: Evaluating hyperparameter setting: e1_simpleLR...
2022-05-24 06:03:53.450484: Encoding started...
2022-05-24 06:03:53.513482: Encoding finished.
2022-05-24 06:03:53.513482: ML model training started...
2022-05-24 06:03:54.653781: ML model training finished.
2022-05-24 06:03:54.657967: Encoding started...
2022-05-24 06:03:54.678778: Encoding finished.
2022-05-24 06:03:54.692777: Completed hyperparameter setting e1_simpleLR.



In [None]:
state.assessment_states[index].label_states[label_config.get_label_object(
    "my_signal").name].selection_state.hp_items[hp_setting.get_key(
)].append(hp_item)


In [None]:
performance = hp_item.performance[state.optimization_metric.name.lower(
)] if hp_item.performance is not None else None


In [None]:
performance

NameError: name 'performance' is not defined

<h3> Load Npy files into paraquet format

AttributeError: 'RepertoireDataset' object has no attribute 'get_'

<h3> Load Npy files into h5py format

<h3> Split data into train and test

<h3> Distributed Grid search using joblib

In [None]:
import joblib

import dask.distributed

c = dask.distributed.Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 64643 instead


<h3> Distributed training using joblib

<h3> Testing 

In [None]:
import os
import shutil
from unittest import TestCase

import pandas as pd

from immuneML.analysis.data_manipulation.NormalizationType import NormalizationType
from immuneML.caching.CacheType import CacheType
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.kmer_frequency.KmerFreqRepertoireEncoder import KmerFreqRepertoireEncoder
from immuneML.util.ReadsType import ReadsType
from immuneML.encodings.kmer_frequency.sequence_encoding.SequenceEncodingType import SequenceEncodingType
from immuneML.environment.Constants import Constants
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.environment.Label import Label
from immuneML.environment.LabelConfiguration import LabelConfiguration
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.LogisticRegression import LogisticRegression
from immuneML.simulation.dataset_generation.RandomDatasetGenerator import RandomDatasetGenerator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.ml_model_application.MLApplicationInstruction import MLApplicationInstruction




def setUp():
    os.environ[Constants.CACHE_TYPE] = CacheType.TEST.name



  
        


In [None]:
path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
PathBuilder.build(path)

path

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/test/tmp/mlapplicationtest')

In [None]:

dataset = RandomDatasetGenerator.generate_repertoire_dataset(
    50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')


In [None]:
ml_method = LogisticRegression()


In [None]:
encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3,
                                    scale_to_zero_mean=True, scale_to_unit_variance=True)

In [None]:
label = Label("l1", [1, 2])
label_config = LabelConfiguration([label])

In [None]:
enc_dataset = encoder.encode(dataset, EncoderParams(
    result_path=path, label_config=label_config, filename="tmp_enc_dataset.pickle", pool_size=4))



In [None]:
enc_dataset.metadata_file

WindowsPath('C:/Users/karth/Desktop/PhD projects/immuneML/immuneML/test/tmp/mlapplicationtest/dataset/metadata.csv')

In [None]:
label.values

[1, 2]

In [None]:
enc_dataset.encoded_data.examples

array([[-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714],
       [-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714],
       [-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714],
       ...,
       [-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714],
       [-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714],
       [-0.14285714, -0.14285714, -0.14285714, ..., -0.14285714,
        -0.14285714, -0.14285714]])

In [None]:
ml_method.fit(enc_dataset.encoded_data, label)


In [None]:

hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer",
                                 "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1')


In [None]:

PathBuilder.build(path / 'result/instr1/')

ml_app = MLApplicationInstruction(
    dataset, label_config, hp_setting, 4, "instr1")
ml_app.run(path / 'result/')

predictions_path = path / "result/instr1/predictions.csv"
df = pd.read_csv(predictions_path)

shutil.rmtree(path)
