In [1]:

from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.dataset.SequenceDataset import SequenceDataset
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.simulation.dataset_generation.RandomDatasetGenerator import RandomDatasetGenerator


In [2]:
path = EnvironmentSettings.tmp_test_path / \
    "random_repertoire_dataset_generation/"


In [3]:
dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=100,
                                                             sequence_count_probabilities={
                                                                 100: 0.5, 120: 0.5},
                                                             sequence_length_probabilities={12: 0.33, 14: 0.33, 15: 0.33}, labels={}, path=path)


In [4]:
type(dataset)
path


WindowsPath('C:/Users/Karthik/.conda/envs/uni/lib/site-packages/test/tmp/random_repertoire_dataset_generation')

In [5]:
from immuneML.workflows.steps.SignalImplanter import SignalImplanter
from immuneML.simulation.SimulationState import SimulationState
from immuneML.simulation.Implanting import Implanting
from immuneML.simulation.Simulation import Simulation

from immuneML.simulation.implants.Motif import Motif
from immuneML.simulation.implants.Signal import Signal
from immuneML.simulation.motif_instantiation_strategy.GappedKmerInstantiation import GappedKmerInstantiation
from immuneML.simulation.sequence_implanting.GappedMotifImplanting import GappedMotifImplanting
from immuneML.simulation.signal_implanting_strategy.HealthySequenceImplanting import HealthySequenceImplanting
from immuneML.simulation.signal_implanting_strategy.ImplantingComputation import ImplantingComputation


In [6]:
result_path = path  /  "simulation_data"

In [7]:
signal = Signal("my_signal", [Motif(
    "my_motif", GappedKmerInstantiation(), "AA")],   implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND))


In [8]:
simulation = Simulation([Implanting(dataset_implanting_rate=0.5,
                        repertoire_implanting_rate=0.1, signals=[signal], name="my_simulation")])


In [9]:
input_params = SimulationState(dataset=dataset, result_path=result_path,
                               simulation=simulation, signals=[signal], formats=["AIRR"])


In [11]:
new_dataset = SignalImplanter.run(input_params)




In [12]:
from immuneML.IO.dataset_export.AIRRExporter import AIRRExporter


In [13]:
path_exported = path / "exported"


In [14]:
path_exported

WindowsPath('C:/Users/Karthik/.conda/envs/uni/lib/site-packages/test/tmp/random_repertoire_dataset_generation/exported')

In [15]:
exported_dataset = AIRRExporter.export(new_dataset, path_exported)


In [16]:
type(exported_dataset)

NoneType

In [17]:
from immuneML.IO.dataset_import.AIRRImport import AIRRImport


In [18]:
column_mapping = {
    "junction": "sequences",
    "junction_aa": "sequence_aas",
    "v_call": "v_alleles",
    "j_call": "j_alleles",
    "locus": "chains",
    "duplicate_count": "counts",
    "sequence_id": "sequence_identifiers"
}


In [19]:
params = {"is_repertoire": True, "result_path": path_exported / "results", "path": path_exported, "metadata_file": path_exported / "metadata.csv",
          "import_out_of_frame": False, "import_with_stop_codon": False, "import_illegal_characters": False,
          "import_productive": True, "region_type": "IMGT_CDR3", "import_empty_nt_sequences": True, "import_empty_aa_sequences": False,
          "column_mapping": column_mapping,
          "separator": "\t"}


In [20]:
dataset = AIRRImport.import_dataset(params, "airr_repertoire_dataset_1")


In [21]:
type(dataset)

immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset

In [22]:
path_ml = path / "ML"

In [23]:
from immuneML.util.PathBuilder import PathBuilder


In [24]:
path = PathBuilder.build(path_ml)


    """
    ML process
    Class that implements the machine learning process:
        1. encodes the training dataset
        2. encodes the test dataset (using parameters learnt in step 1 if there are any such parameters)
        3. trains the ML method on encoded training dataset
        4. assesses the method's performance on encoded test dataset
    It performs the task for a given label configuration, and given list of metrics (used only in the assessment step).
    """

In [25]:
from immuneML.hyperparameter_optimization.core.HPUtil import HPUtil


In [26]:
path

WindowsPath('C:/Users/Karthik/.conda/envs/uni/lib/site-packages/test/tmp/random_repertoire_dataset_generation/ML')

In [27]:
processed_dataset = HPUtil.preprocess_dataset(dataset= dataset , preproc_sequence= [] ,path = path / "preprocessed_train_data" )

In [28]:
processed_dataset.get_example_count()

100

In [29]:
import datetime
from collections import Counter
from pathlib import Path

import pandas as pd

from immuneML.IO.ml_method.MLExporter import MLExporter
from immuneML.environment.LabelConfiguration import LabelConfiguration
from immuneML.hyperparameter_optimization.config.SplitConfig import SplitConfig
from immuneML.hyperparameter_optimization.config.SplitType import SplitType
from immuneML.hyperparameter_optimization.core.HPAssessment import HPAssessment
from immuneML.hyperparameter_optimization.core.HPUtil import HPUtil
from immuneML.hyperparameter_optimization.states.TrainMLModelState import TrainMLModelState
from immuneML.hyperparameter_optimization.strategy.HPOptimizationStrategy import HPOptimizationStrategy
from immuneML.ml_metrics.Metric import Metric
from immuneML.reports.train_ml_model_reports.TrainMLModelReport import TrainMLModelReport
from immuneML.util.ReflectionHandler import ReflectionHandler
from immuneML.workflows.instructions.Instruction import Instruction
from immuneML.workflows.instructions.MLProcess import MLProcess
import os
import shutil
from unittest import TestCase

import numpy as np

from immuneML.caching.CacheType import CacheType
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.environment.Constants import Constants
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.ml_methods.LogisticRegression import LogisticRegression
from immuneML.workflows.steps.MLMethodTrainer import MLMethodTrainer
from immuneML.workflows.steps.MLMethodTrainerParams import MLMethodTrainerParams
from immuneML.encodings.kmer_frequency.KmerFrequencyEncoder import KmerFrequencyEncoder
from immuneML.encodings.kmer_frequency.KmerFreqRepertoireEncoder import KmerFreqRepertoireEncoder


In [None]:
ml_method = LogisticRegression({"C": 0.1,
                               "penalty": "l1",
                                "max_iter": 200})

dataset = dataset

encoder = KmerFrequencyEncoder


In [72]:
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.encodings.kmer_frequency.KmerFreqRepertoireEncoder import KmerFreqRepertoireEncoder
from immuneML.encodings.kmer_frequency.KmerFrequencyEncoder import KmerFrequencyEncoder

from immuneML.encodings.kmer_frequency.ReadsType import ReadsType
from immuneML.encodings.kmer_frequency.sequence_encoding.SequenceEncodingType import SequenceEncodingType
from immuneML.analysis.data_manipulation.NormalizationType import NormalizationType
Hpsetting1 = HPSetting(encoder=KmerFreqRepertoireEncoder(reads=ReadsType.UNIQUE, sequence_encoding=SequenceEncodingType.GAPPED_KMER, normalization_type=NormalizationType.RELATIVE_FREQUENCY), encoder_params={"k": 2, "reads": ReadsType.UNIQUE, "sequence_encoding": SequenceEncodingType.GAPPED_KMER, "normalization_type": NormalizationType.RELATIVE_FREQUENCY}, ml_method=LogisticRegression(
    {"C": 0.1, "penalty": "l1", "max_iter": 200}), ml_params={"model_selection_cv": False,
                                                              "model_selection_n_folds": 0}, preproc_sequence=[], encoder_name="e1", ml_method_name="simpleLR")


In [70]:
Hpsetting2 = HPSetting(encoder=KmerFreqRepertoireEncoder(reads=ReadsType.UNIQUE, sequence_encoding=SequenceEncodingType.GAPPED_KMER, normalization_type=NormalizationType.RELATIVE_FREQUENCY), encoder_params={
                       "k": 2, "reads": ReadsType.UNIQUE, "sequence_encoding": SequenceEncodingType.GAPPED_KMER, "normalization_type": NormalizationType.RELATIVE_FREQUENCY}, ml_method=LogisticRegression({"C": 0.1, "penalty": "l1", "max_iter": 200}), ml_params={"model_selection_cv": True,
                                                                                                                                                                                                                                                                     "model_selection_n_folds": 5}, preproc_sequence=[], encoder_name="e2", ml_method_name="simpleLR")


In [34]:
from immuneML.hyperparameter_optimization.core.HPUtil import HPUtil
from immuneML.hyperparameter_optimization.strategy.GridSearch import GridSearch

HPStrategy = GridSearch(hp_settings=[Hpsetting1, Hpsetting2])


In [35]:
from immuneML.hyperparameter_optimization.config.ReportConfig import ReportConfig


In [36]:
datasplits = {
    "SequenceLengthDistribution": {
        "batch_size": 3
    }
}
models1 = {"coef": "Coefficients"}


In [37]:
reportconfig1 = ReportConfig(
    data_splits=datasplits, models=models1)


In [39]:
from immuneML.hyperparameter_optimization.config.SplitConfig import SplitConfig


In [40]:
split_config_assesment = SplitConfig(
    split_strategy=SplitType.RANDOM, split_count=1, training_percentage=0.7, reports=reportconfig1)


In [41]:
split_config_selection = SplitConfig(
    split_strategy=SplitType.RANDOM, split_count=1, training_percentage=.07, reports=reportconfig2)


In [42]:
from immuneML.environment.LabelConfiguration import LabelConfiguration
from immuneML.environment.Label import Label
label = LabelConfiguration(labels=[Label("my_signal")])


In [43]:
from immuneML.ml_metrics.Metric import Metric
metrics = Metric.ACCURACY
optimisatization_metric = Metric.BALANCED_ACCURACY


In [44]:
from immuneML.reports.train_ml_model_reports.MLSettingsPerformance import MLSettingsPerformance
reports = [MLSettingsPerformance]
number_of_processes = 3

refit_optimal_model = True


In [45]:
train_val_datasets, test_datasets = HPUtil.split_data(
    processed_dataset, split_config_assesment, path, label)


In [46]:
len(train_val_datasets)

1

In [47]:
processed_dataset.get_example_count()

100

In [48]:
processed_dataset = HPUtil.preprocess_dataset(
    dataset=processed_dataset, preproc_sequence=[], path=path / "preprocessed_train_data")


In [49]:
Hpsetting1.encoder_params

{'k': 2,
 'reads': <ReadsType.UNIQUE: 'unique'>,
 'sequence_encoding': <SequenceEncodingType.GAPPED_KMER: 'GappedKmerSequenceEncoder'>,
 'normalization_type': <NormalizationType.RELATIVE_FREQUENCY: 'l1'>}

In [50]:
encoded_train_dataset = HPUtil.encode_dataset(processed_dataset , hp_setting= Hpsetting1 , path= path / "encoded_datasets" , learn_model = True , context= {} , number_of_processes= number_of_processes , label_configuration= label)

2022-01-11 18:32:14.298089: Encoding started...
2022-01-11 18:32:16.660777: Encoding finished.


In [51]:
Hpsetting1.encoder

<immuneML.encodings.kmer_frequency.KmerFreqRepertoireEncoder.KmerFreqRepertoireEncoder at 0x22a5f7e94c0>

In [52]:
Hpsetting1.encoder_params

{'k': 2,
 'reads': <ReadsType.UNIQUE: 'unique'>,
 'sequence_encoding': <SequenceEncodingType.GAPPED_KMER: 'GappedKmerSequenceEncoder'>,
 'normalization_type': <NormalizationType.RELATIVE_FREQUENCY: 'l1'>}

In [53]:
processed_dataset

<immuneML.data_model.dataset.RepertoireDataset.RepertoireDataset at 0x22a4c00d070>

In [55]:
from immuneML.workflows.steps.DataEncoder import DataEncoder
from immuneML.workflows.steps.DataEncoderParams import DataEncoderParams
from immuneML.encodings.EncoderParams import EncoderParams


In [56]:
label.get_label_values()

TypeError: get_label_values() missing 1 required positional argument: 'label'

In [57]:
encoded_path = path / "encoded_datasets"

In [None]:
encoded_path

In [None]:
type(label)

In [None]:
learn_model = True
encode_labels = True

In [None]:
Hpsetting1.encoder

In [None]:
Hpsetting1.encoder_params

In [None]:
encoded_datasets = DataEncoder.run(DataEncoderParams(
    dataset=processed_dataset,
    encoder=Hpsetting1.encoder,
    encoder_params=EncoderParams(
        model = Hpsetting1.encoder_params,
        result_path = encoded_path,
        pool_size = number_of_processes,
        label_config = label,
        learn_model=learn_model,
        filename="train_dataset.pkl" if learn_model else "test_dataset.pkl",
        encode_labels=encode_labels

    
    )
))


In [None]:
ml_method = LogisticRegression()

In [None]:
dataset

In [None]:
encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3,
                                    scale_to_zero_mean=True, scale_to_unit_variance=True)


In [None]:
label_config = LabelConfiguration([Label("my_signal")])


In [None]:
enc_dataset = encoder.encode(processed_dataset, EncoderParams(
    result_path=encoded_path, label_config=label_config, filename="tmp_enc_dataset.pkl", pool_size=4))


In [None]:
ml_method.fit(enc_dataset.encoded_data, 'my_signal')


In [None]:
hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer",
                                 "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1')


In [None]:
path

In [None]:
PathBuilder.build(path / 'result/instr1/')


In [None]:
from immuneML.workflows.instructions.ml_model_application.MLApplicationInstruction import MLApplicationInstruction


In [None]:
ml_app = MLApplicationInstruction(
    dataset, label_config, hp_setting, 4, "instr1")


In [None]:
ml_app.run(path / 'result/')


In [1]:
!pip install snoop

Collecting snoop
  Downloading snoop-0.4.1-py2.py3-none-any.whl (27 kB)
Collecting cheap-repr>=0.4.0
  Downloading cheap_repr-0.5.1-py2.py3-none-any.whl (12 kB)
Collecting asttokens
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting executing
  Downloading executing-0.8.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: executing, cheap-repr, asttokens, snoop
Successfully installed asttokens-2.0.5 cheap-repr-0.5.1 executing-0.8.2 snoop-0.4.1


In [9]:
import logging
import os
import shutil
import sys
import warnings
from pathlib import Path
import snoop

import yaml

from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder
import traceback

class Quickstart:

    def create_specfication(self, path: Path):

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "AIRR",
                        "params": {
                            "path": str(path / "../synthetic_dataset/result/simulation_instruction/exported_dataset/airr/"),
                            "metadata_file": str(path / "../synthetic_dataset/result/simulation_instruction/exported_dataset/airr/metadata.csv")
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "KmerFrequency": {
                            "k": 3
                        }
                    },
                    "e2": {
                        "KmerFrequency": {
                            "k": 2
                        }
                    }
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "C": 0.1,
                            "penalty": "l1",
                            "max_iter": 200
                        }}
                },
                "reports": {
                    "rep1": {
                        "SequenceLengthDistribution": {
                            "batch_size": 3
                        }
                    },
                    "hprep": "MLSettingsPerformance",
                    "coef": "Coefficients"
                }
            },
            "instructions": {
                "machine_learning_instruction": {
                    "type": "TrainMLModel",
                    "settings": [
                        {
                            "encoding": "e1",
                            "ml_method": "simpleLR"
                        },
                        {
                            "encoding": "e2",
                            "ml_method": "simpleLR"
                        }
                    ],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                        "reports": {
                            "data_splits": ["rep1"],
                            'models': ["coef"]
                        }
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                        "reports": {
                            "data_splits": ["rep1"],
                            "models": [],
                        }
                    },
                    "labels": ["my_signal"],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "reports": ["hprep"],
                    "number_of_processes": 3,
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": False
                }
            }
        }
        PathBuilder.build(path)
        specs_file = path / "specs.yaml"
        with specs_file.open("w") as file:
            yaml.dump(specs, file)

        return specs_file

    def build_path(self, path: str = None):
        if path is None:
            path = EnvironmentSettings.root_path / "quickstart/"
            if os.path.isdir(path):
                shutil.rmtree(path)
            PathBuilder.build(path)
        else:
            path = PathBuilder.build(path)
        return path

    def _simulate_dataset_with_signals(self, path: Path):

        print("immuneML quickstart: generating a synthetic dataset...")

        PathBuilder.build(path)

        specs = {
            "definitions": {
                "datasets": {
                    "my_synthetic_dataset": {"format": "RandomRepertoireDataset", "params": {"labels": {}}}
                },
                "motifs": {"my_motif": {"seed": "AA", "instantiation": "GappedKmer"}},
                "signals": {"my_signal": {"motifs": ["my_motif"], "implanting": "HealthySequence"}},
                "simulations": {"my_simulation": {"my_implantng": {"signals": ["my_signal"], "dataset_implanting_rate": 0.5,
                                                                   "repertoire_implanting_rate": 0.1}}}
            },
            "instructions": {"simulation_instruction": {"type": "Simulation", "dataset": "my_synthetic_dataset", "simulation": "my_simulation",
                                                        "export_formats": ["AIRR"]}}
        }

        specs_file = path / "simulation_specs.yaml"
        with specs_file.open("w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path / "result")
        app.run()

        print("immuneML quickstart: finished generating a synthetic dataset.")

    def run(self, result_path: str):

        result_path = self.build_path(result_path)

        logging.basicConfig(filename=Path(result_path) / "log.txt",
                            level=logging.ERROR, format='%(asctime)s %(levelname)s: %(message)s')
        warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: logging.warning(
            message)

        self._simulate_dataset_with_signals(result_path / "synthetic_dataset")

        print("immuneML quickstart: training a machine learning model...")
        specs_file = self.create_specfication(
            result_path / "machine_learning_analysis")
        app = ImmuneMLApp(specs_file, result_path /
                          "machine_learning_analysis/result")
        app.run()

        print("immuneML quickstart: finished training a machine learning model.")


def main():
    path = EnvironmentSettings.tmp_test_path / \
        "random_repertoire_dataset_generation/"

    quickstart = Quickstart()
 
    quickstart.run(path)
    







In [10]:
path

NameError: name 'path' is not defined

In [11]:

main()
    

SyntaxError: invalid syntax (Temp/ipykernel_25168/596146001.py, line 2)

In [None]:
%run quickstart.py

In [None]:
encoded_train_dataset


In [59]:
label._labels

{'my_signal': <immuneML.environment.Label.Label at 0x22a61624b50>}

In [60]:
path

WindowsPath('C:/Users/Karthik/.conda/envs/uni/lib/site-packages/test/tmp/random_repertoire_dataset_generation/ML')

In [62]:
optimisatization_metric

<Metric.BALANCED_ACCURACY: 'balanced_accuracy_score'>

In [79]:
method = HPUtil.train_method(
    label,encoded_train_dataset, Hpsetting1, path, path / "train_predictions.csv", path / "ml_details.yaml" , number_of_processes , optimisatization_metric)


2022-01-11 18:51:59.432865: ML model training started...


KeyError: <immuneML.environment.LabelConfiguration.LabelConfiguration object at 0x0000022A61624490>

In [68]:
Hpsetting1.ml_params

{}

In [78]:
encoded_train_dataset.labels["my_signal"]

[False, True]