diff --git a/bert_squeeze/assistants/__init__.py b/bert_squeeze/assistants/__init__.py new file mode 100644 index 0000000..4038508 --- /dev/null +++ b/bert_squeeze/assistants/__init__.py @@ -0,0 +1 @@ +from .train_assistant import TrainAssistant diff --git a/bert_squeeze/assistants/configs/train_bert.yaml b/bert_squeeze/assistants/configs/train_bert.yaml new file mode 100644 index 0000000..5eaf467 --- /dev/null +++ b/bert_squeeze/assistants/configs/train_bert.yaml @@ -0,0 +1,47 @@ +general: + debug: false + do_train: true + do_eval: false + get_mismatched: true + evaluate_during_training: true + labels: [ 0, 1 ] + output_dir: outputs + save_steps: 500 + validation_every_n_epoch: 1 + +train: + adam_eps: 1e-8 + accumulation_steps: 1 + auto_lr: false + discriminative_learning: true + dropout: 0.2 + layer_lr_decay: 0.95 + learning_rates: [ 2e-5 ] + logging_steps: 50 + lr_scheduler: true + max_grad_norm: 1.0 + num_epochs: 10 + optimizer: bertadam + objective: lsl + smoothing: 0.1 + warmup_ratio: 0.06 + warmup_steps: true + weight_decay: 0.01 + +model: + _target_: bert_squeeze.models.lt_bert.LtCustomBert + num_labels: 2 + pretrained_model: "bert-base-cased" + training_config: ${train} + +data: + _target_: bert_squeeze.data.modules.transformer_module.TransformerDataModule + dataset_config: + is_local: false + label_col: label + path: + split: + text_col: text + truncate_mode: head + max_length: 256 + tokenizer_name: ${model.pretrained_model} diff --git a/bert_squeeze/assistants/configs/train_deebert.yaml b/bert_squeeze/assistants/configs/train_deebert.yaml new file mode 100644 index 0000000..c6eed11 --- /dev/null +++ b/bert_squeeze/assistants/configs/train_deebert.yaml @@ -0,0 +1,51 @@ +general: + debug: false + do_train: true + do_eval: false + get_mismatched: true + evaluate_during_training: true + labels: [ 0,1 ] + num_labels: 2 + output_dir: outputs + save_steps: 500 + validation_every_n_epoch: 1 + +train: + adam_eps: 1e-8 + accumulation_steps: 1 + auto_lr: false + discriminative_learning: true + dropout: 0.2 + layer_lr_decay: 0.95 + learning_rates: [ 2e-5 ] + logging_steps: 100 + lr_scheduler: true + max_grad_norm: 1.0 + num_epochs: 10 + optimizer: bertadam + objective: lsl + smoothing: 0.1 + warmup_ratio: 0.06 + warmup_steps: true + weight_decay: 0.01 + + train_highway: true + early_exit_entropy: -1 + +model: + _target_: bert_squeeze.models.lt_deebert.LtDeeBert + training_config: ${train} + pretrained_model: "bert-base-cased" + num_labels: ${general.num_labels} + +data: + _target_: bert_squeeze.data.modules.transformer_module.TransformerDataModule + dataset_config: + is_local: false + path: emotion + split: + text_col: text + label_col: label + truncate_mode: head + tokenizer_name: ${model.pretrained_model} + max_length: 256 \ No newline at end of file diff --git a/examples/configs/train_labse.yaml b/bert_squeeze/assistants/configs/train_fastbert.yaml similarity index 50% rename from examples/configs/train_labse.yaml rename to bert_squeeze/assistants/configs/train_fastbert.yaml index fd3ceff..bb1474b 100644 --- a/examples/configs/train_labse.yaml +++ b/bert_squeeze/assistants/configs/train_fastbert.yaml @@ -1,53 +1,45 @@ -task: - name: train - general: + debug: false do_train: true do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise + get_mismatched: true + evaluate_during_training: true + labels: [ 0, 1 ] output_dir: outputs save_steps: 500 validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - model_path: 
../checkpoints/BER-511/ + +callbacks: + - _target_: bert_squeeze.utils.callbacks.fastbert_logic.FastBertLogic train: - training_batch_size: 8 - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] + adam_eps: 1e-8 + accumulation_steps: 1 auto_lr: false + discriminative_learning: true + dropout: 0.2 layer_lr_decay: 0.95 + learning_rates: [ 2e-5 ] + logging_steps: 50 lr_scheduler: true - adam_eps: 1e-8 + max_grad_norm: 1.0 + num_epochs: 10 + optimizer: bertadam + objective: lsl + smoothing: 0.1 warmup_ratio: 0.06 warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 16 - logging_steps: 50 + weight_decay: 0.01 model: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse + _target_: bert_squeeze.models.lt_fastbert.LtFastBert training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 + pretrained_model: "bert-base-cased" + num_labels: 2 + scorer_type: "fast" data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule + _target_: bert_squeeze.data.modules.transformer_module.TransformerDataModule dataset_config: is_local: false path: emotion @@ -56,16 +48,4 @@ data: label_col: label truncate_mode: head tokenizer_name: ${model.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - + max_length: 256 \ No newline at end of file diff --git a/bert_squeeze/assistants/configs/train_lr.yaml b/bert_squeeze/assistants/configs/train_lr.yaml new file mode 100644 index 0000000..d926660 --- /dev/null +++ b/bert_squeeze/assistants/configs/train_lr.yaml @@ -0,0 +1,42 @@ +general: + debug: false + do_train: true + do_eval: false + get_mismatched: true + evaluate_during_training: true + labels: [ 0,1 ] + num_labels: 2 + output_dir: outputs + save_steps: 500 + validation_every_n_epoch: 1 + +train: + accumulation_steps: 1 + auto_lr: false + discriminative_learning: false + eval_batch_size: 16 + learning_rates: [ 2e-1 ] + logging_steps: 100 + max_grad_norm: 1.0 + num_epochs: 10 + objective: ce + smoothing: 0.1 + training_batch_size: 16 + +model: + _target_: bert_squeeze.models.lr.BowLogisticRegression + training_config: ${train} + vocab_size: 5000 + embed_dim: 256 + num_labels: ${general.num_labels} + name: "bow_lr" + +data: + _target_: bert_squeeze.data.modules.lr_module.LrDataModule + dataset_config: + is_local: false + label_col: label + path: + split: + text_col: text + max_features: ${model.vocab_size} diff --git a/bert_squeeze/assistants/configs/train_lstm.yaml b/bert_squeeze/assistants/configs/train_lstm.yaml new file mode 100644 index 0000000..033af78 --- /dev/null +++ b/bert_squeeze/assistants/configs/train_lstm.yaml @@ -0,0 +1,40 @@ +general: + debug: false + do_train: true + do_eval: false + get_mismatched: true + evaluate_during_training: true + labels: [ 0,1 ] + num_labels: 2 + output_dir: outputs + save_steps: 500 + validation_every_n_epoch: 1 + +train: + accumulation_steps: 1 + auto_lr: false + dropout: 0.2 + learning_rates: [ 2e-3 ] + logging_steps: 100 + max_grad_norm: 1.0 + num_epochs: 10 + objective: ce + smoothing: 0.1 + +model: + _target_: bert_squeeze.models.lstm.LtLSTM + hidden_dim: 128 + name: lstm + 
num_labels: ${general.num_labels} + training_config: ${train} + vocab_size: 20000 + +data: + _target_: bert_squeeze.data.modules.lstm_module.LSTMDataModule + dataset_config: + is_local: false + path: emotion + split: + text_col: text + label_col: label + max_features: ${model.vocab_size} \ No newline at end of file diff --git a/examples/configs/train_theseus_labse.yaml b/bert_squeeze/assistants/configs/train_theseus_bert.yaml similarity index 54% rename from examples/configs/train_theseus_labse.yaml rename to bert_squeeze/assistants/configs/train_theseus_bert.yaml index 2f174a5..b3f883e 100644 --- a/examples/configs/train_theseus_labse.yaml +++ b/bert_squeeze/assistants/configs/train_theseus_bert.yaml @@ -1,57 +1,45 @@ -task: - name: train - general: + debug: false do_train: true do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise + get_mismatched: true + evaluate_during_training: true + labels: [ 0, 1 ] output_dir: outputs save_steps: 500 validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - model_path: ../checkpoints/BER-511/ train: - training_batch_size: 8 - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] + adam_eps: 1e-8 + accumulation_steps: 1 auto_lr: false + discriminative_learning: true + dropout: 0.2 layer_lr_decay: 0.95 + learning_rates: [ 2e-5 ] + logging_steps: 50 lr_scheduler: true - adam_eps: 1e-8 + max_grad_norm: 1.0 + num_epochs: 10 + optimizer: bertadam + objective: lsl + smoothing: 0.1 warmup_ratio: 0.06 warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 16 - logging_steps: 50 + weight_decay: 0.01 model: - _target_: bert-squeeze.models.lt_theseus_bert.LtTheseusBert + _target_: bert_squeeze.models.lt_theseus_bert.LtTheseusBert training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 + pretrained_model: "bert-base-cased" + num_labels: 2 replacement_scheduler: type: "linear" base_replacing_rate: 0.3 coefficient: 0.0006 data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule + _target_: bert_squeeze.data.modules.transformer_module.TransformerDataModule dataset_config: is_local: false path: emotion @@ -61,15 +49,3 @@ data: truncate_mode: head tokenizer_name: ${model.pretrained_model} max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/bert_squeeze/assistants/train_assistant.py b/bert_squeeze/assistants/train_assistant.py new file mode 100644 index 0000000..ffca42e --- /dev/null +++ b/bert_squeeze/assistants/train_assistant.py @@ -0,0 +1,152 @@ +import logging +import os +from typing import Any, Dict, List, Optional + +from hydra.utils import instantiate +from omegaconf import OmegaConf +from pkg_resources import resource_filename +from pydantic.utils import deep_update +from pytorch_lightning.callbacks.callback import Callback +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.logger import Logger + +CONFIG_MAPPER = { + "lr": "train_lr.yaml", + "bert": "train_bert.yaml", + "lstm": "train_lstm.yaml", + "deebert": "train_deebert.yaml", + "fastbert": 
"train_fastbert.yaml", + "theseus-bert": "train_theseus_bert.yaml" +} + + +class TrainAssistant(object): + """ + Helper object that holds and instantiate the needed for training. + + For every available model for fine-tuning it will load a default configuration that + can be overwritten by passing some keyword arguments. + It contains four main sub-configurations: + - general: various high level parameters unrelated to the training procedure + - train: training related parameters + - model: parameters necessary to build and define the model + - data: parameters necessary to define the dataset and featurize it + + Args: + name (str): + name of the base model to fine-tune + dataset_path (str): + path of the dataset to use + general_kwargs (Dict[str, Any]): + keyword arguments that can be added or overwrite the default 'general' configuration + train_kwargs (Dict[str, Any]): + keyword arguments that can be added or overwrite the default 'train' configuration + model_kwargs (Dict[str, Any]): + keyword arguments that can be added or overwrite the default 'model' configuration + data_kwargs (Dict[str, Any]): + keyword arguments that can be added or overwrite the default 'data' configuration + logger_kwargs (Dict[str, Any]): + keyword arguments that can be added or overwrite the default 'logger' configuration + callbacks (List[Callback]): + list of callbacks to use during training + """ + + def __init__( + self, + name: str, + dataset_path: str, + general_kwargs: Dict[str, Any] = None, + train_kwargs: Dict[str, Any] = None, + model_kwargs: Dict[str, Any] = None, + data_kwargs: Dict[str, Any] = None, + logger_kwargs: Dict[str, Any] = None, + callbacks: List[Callback] = None + ): + conf = OmegaConf.load( + resource_filename("bert_squeeze", os.path.join("assistants/configs", CONFIG_MAPPER[name])) + ) + if data_kwargs is not None and data_kwargs.get("dataset_config", {}).get("path") is not None: + logging.warning("Found value for `dataset_config.path` which conflicts with parameter `dataset_path`, using" + "value from the later.") + conf["data"]["dataset_config"]["path"] = dataset_path + + for name, kws in zip(["general", "train", "model", "data", "logger", "callbacks"], + [general_kwargs, train_kwargs, model_kwargs, data_kwargs, logger_kwargs, callbacks]): + if kws is not None: + conf[name] = deep_update(conf[name], kws) + + self.name = name + self.general = conf["general"] + self.train = conf["train"] + self._model_conf = conf["model"] + self._data_conf = conf["data"] + self._logger_conf = conf.get("logger") + self._callbacks_conf = conf.get("callbacks", []) + + self._model = None + self._data = None + self._logger = None + self._callbacks = None + + @property + def model(self) -> Any: + """""" + if self._model is None: + self.model = instantiate(self._model_conf) + return self._model + + @model.setter + def model(self, value: Any) -> None: + """""" + self._model = value + + @property + def data(self) -> Any: + """""" + if self._data is None: + data = instantiate(self._data_conf) + data.prepare_data() + data.setup() + self.data = data + return self._data + + @data.setter + def data(self, value: Any) -> None: + """""" + self._data = value + + @property + def logger(self) -> Logger: + """""" + if self._logger is None: + if self._logger_conf is not None: + self.logger = instantiate(self._logger_conf) + else: + self.logger = TensorBoardLogger(self.general["output_dir"]) + return self._logger + + @logger.setter + def logger(self, value: Logger) -> None: + """""" + self._logger = value + + @property + 
def callbacks(self) -> List[Callback]: + """""" + if self._callbacks is None: + if self._callbacks_conf is not None: + self.callbacks = [instantiate(callback) for callback in self._callbacks_conf] + else: + self.callbacks = [] + return self._callbacks + + @callbacks.setter + def callbacks(self, value: List[Callback]) -> None: + """""" + self._callbacks = value + + def __repr__(self): + return f"" + + def __str__(self): + return f"Assistant_{self.name}" diff --git a/examples/configs/distil_config.yaml b/examples/configs/distil_config.yaml deleted file mode 100644 index 1fd0088..0000000 --- a/examples/configs/distil_config.yaml +++ /dev/null @@ -1,109 +0,0 @@ -task: - name: distil - strategy: t2lr - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 1000 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - - -train: - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: false - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 2 - training_batch_size: 2 - num_labels: 6 - alpha: 0.5 - logging_steps: 100 - - -model: - _target_: bert-squeeze.distillation.distiller.Distiller - teacher_config: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "sentence-transformers/LaBSE" - num_labels: ${train.num_labels} - checkpoint_path: ../checkpoints/BER-769/N-Step-Checkpoint_1_2000.ckpt - name: labse - student_config: - _target_: bert-squeeze.models.lr.BowLogisticRegression - architecture: "lr" - vocab_size: 10000 - num_labels: ${train.num_labels} - name: bow_lr - training_config: ${train} - training_config: ${train} - - -data: - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - _target_: bert-squeeze.data.modules.distillation_module.DistillationDataModule - teacher_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - name: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.teacher_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - student_module: - _target_: bert-squeeze.data.modules.lr_module.LrDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - max_features: 30000 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - - -hydra: - run: - dir: ./outputs/${task.name}/${task.strategy}/${now:%Y-%m-%d_%H-%M-%S} \ No newline at end of file diff --git a/examples/configs/distil_t2t_config.yaml b/examples/configs/distil_t2t_config.yaml deleted file mode 100644 index a04db0f..0000000 --- a/examples/configs/distil_t2t_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -task: - name: distil - strategy: t2t - -general: - do_train: true 
- do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 1000 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - - -train: - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: adamw - weight_decay: 0.01 - discriminative_learning: false - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 2 - training_batch_size: 2 - num_labels: 6 - alpha: 0.5 - logging_steps: 100 - - -model: - _target_: bert-squeeze.distillation.distiller.Distiller - teacher_config: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "sentence-transformers/LaBSE" - num_labels: ${train.num_labels} - checkpoint_path: ../checkpoints/BER-511/checkpoints/N-Step-Checkpoint_9_4500.ckpt - name: labse - student_config: - _target_: bert-squeeze.models.lt_bert.LtCustomBert - architecture: "transformer" - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "bert-base-cased" - num_labels: ${train.num_labels} - name: bert - training_config: ${train} - - -data: - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - _target_: bert-squeeze.data.modules.distillation_module.DistillationDataModule - teacher_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - name: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.teacher_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - student_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.student_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - - -hydra: - run: - dir: ./outputs/${task.name}/${task.strategy}/${now:%Y-%m-%d_%H-%M-%S} \ No newline at end of file diff --git a/examples/configs/hard_distil_t2t_config.yaml b/examples/configs/hard_distil_t2t_config.yaml deleted file mode 100644 index e8bc69e..0000000 --- a/examples/configs/hard_distil_t2t_config.yaml +++ /dev/null @@ -1,124 +0,0 @@ -task: - name: distil - strategy: t2t - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 1000 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - - -train: - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: adamw - weight_decay: 0.01 - discriminative_learning: false - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 
1.0 - accumulation_steps: 1 - eval_batch_size: 2 - training_batch_size: 2 - num_labels: 6 - alpha: 0.5 - logging_steps: 100 - - -model: - _target_: bert-squeeze.distillation.distiller.Distiller - teacher_config: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "sentence-transformers/LaBSE" - num_labels: ${train.num_labels} - checkpoint_path: ../checkpoints/BER-511/checkpoints/N-Step-Checkpoint_9_4500.ckpt - name: labse - student_config: - _target_: bert-squeeze.models.lt_bert.LtCustomBert - architecture: "transformer" - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "bert-base-cased" - num_labels: ${train.num_labels} - name: bert - training_config: ${train} - - -data: - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - _target_: bert-squeeze.data.modules.distillation_module.DistillationDataModule - teacher_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.teacher_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - student_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.student_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - hard_labeler: - _target_: bert-squeeze.distillation.utils.labeler.HardLabeler - labeler_config: - teacher: ${model.teacher_config} - pretrained_model: ${model.teacher_config.pretrained_model} - num_labels: ${model.teacher_config.num_labels} - checkpoint_path: ${model.teacher_config.checkpoint_path} - max_length: ${data.teacher_module.max_length} - dataset_config: - is_local: false - name: go_emotions - split: raw - text_col: text - max_samples: 10000 - max_length: ${data.teacher_module.max_length} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - - -hydra: - run: - dir: ./outputs/${task.name}/${task.strategy}/${now:%Y-%m-%d_%H-%M-%S} \ No newline at end of file diff --git a/examples/configs/soft_distil_t2t_config.yaml b/examples/configs/soft_distil_t2t_config.yaml deleted file mode 100644 index a5aa088..0000000 --- a/examples/configs/soft_distil_t2t_config.yaml +++ /dev/null @@ -1,116 +0,0 @@ -task: - name: distil - strategy: t2t - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 1000 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - - -train: - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: adamw - weight_decay: 0.01 - discriminative_learning: false - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - 
eval_batch_size: 2 - training_batch_size: 2 - num_labels: 6 - alpha: 0.5 - logging_steps: 100 - - -model: - _target_: bert-squeeze.distillation.distiller.Distiller - teacher_config: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "sentence-transformers/LaBSE" - num_labels: ${train.num_labels} - checkpoint_path: ../checkpoints/BER-511/checkpoints/N-Step-Checkpoint_9_4500.ckpt - name: labse - student_config: - _target_: bert-squeeze.models.lt_bert.LtCustomBert - architecture: "transformer" - training_config: ${train} # for the sake of compatibility but useless as model won't be finetuned - pretrained_model: "bert-base-cased" - num_labels: ${train.num_labels} - name: bert - training_config: ${train} - - -data: - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - _target_: bert-squeeze.data.modules.distillation_module.DistillationDataModule - teacher_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.teacher_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - student_module: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.student_config.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - soft_data_config: - is_local: false - name: go_emotions - split: raw - text_col: text - max_samples: 10000 - - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - - -hydra: - run: - dir: ./outputs/${task.name}/${task.strategy}/${now:%Y-%m-%d_%H-%M-%S} \ No newline at end of file diff --git a/examples/configs/train_deebert.yaml b/examples/configs/train_deebert.yaml deleted file mode 100644 index f5cda78..0000000 --- a/examples/configs/train_deebert.yaml +++ /dev/null @@ -1,75 +0,0 @@ -task: - name: train - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - logging_steps: 50 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - -train: - train_highway: true - early_exit_entropy: -1 - - dropout: 0.2 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 1 - training_batch_size: 1 - num_epochs: 10 - logging_steps: 50 - -model: - _target_: bert-squeeze.models.exit_berts.deebert.DeeBert - training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 - -data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - 
tokenizer_name: ${model.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - -hydra: - run: - dir: ./outputs/${task.name}/${now:%Y-%m-%d_%H-%M-%S} \ No newline at end of file diff --git a/examples/configs/train_fastlabse.yaml b/examples/configs/train_fastlabse.yaml deleted file mode 100644 index 1544d4f..0000000 --- a/examples/configs/train_fastlabse.yaml +++ /dev/null @@ -1,74 +0,0 @@ -task: - name: train - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - -finetuning_callback: - _target_: bert-squeeze.utils.callbacks.fastbert_logic.FastBertLogic - -train: - training_batch_size: 8 - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 16 - logging_steps: 50 - -model: - _target_: bert-squeeze.models.lt_fastbert.LtFastBert - training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 - scorer_type: "fast" - -data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/examples/configs/train_labse_pruning.yaml b/examples/configs/train_labse_pruning.yaml deleted file mode 100644 index 0f249c8..0000000 --- a/examples/configs/train_labse_pruning.yaml +++ /dev/null @@ -1,73 +0,0 @@ -task: - name: train - -general: - do_train: true - do_eval: false - debug: false - pruning: - _target_: bert-squeeze.utils.callbacks.lottery_ticket.LotteryTicket - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - model_path: ../checkpoints/BER-511/ - -train: - training_batch_size: 2 - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 2 - logging_steps: 50 - -model: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 - -data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false 
- path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/examples/configs/train_labse_quantize.yaml b/examples/configs/train_labse_quantize.yaml deleted file mode 100644 index 1e4bafc..0000000 --- a/examples/configs/train_labse_quantize.yaml +++ /dev/null @@ -1,74 +0,0 @@ -task: - name: train - -general: - quantization: - _target_: bert-squeeze.utils.callbacks.quantization.DynamicQuantization - layers_to_quantize: { 'torch.nn.Linear' } - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - model_path: ../checkpoints/BER-511/ - -train: - training_batch_size: 8 - num_epochs: 10 - dropout: 0.2 - objective: lsl - smoothing: 0.1 - optimizer: bertadam - weight_decay: 0.01 - discriminative_learning: true - learning_rates: [ 2e-5 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - eval_batch_size: 16 - logging_steps: 50 - -model: - _target_: bert-squeeze.models.lt_labse.LtCustomLabse - training_config: ${train} - pretrained_model: "sentence-transformers/LaBSE" - num_labels: 6 - -data: - _target_: bert-squeeze.data.modules.transformer_module.TransformerDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - truncate_mode: head - tokenizer_name: ${model.pretrained_model} - max_length: 256 - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/examples/configs/train_lr.yaml b/examples/configs/train_lr.yaml deleted file mode 100644 index a1ff841..0000000 --- a/examples/configs/train_lr.yaml +++ /dev/null @@ -1,70 +0,0 @@ -task: - name: train - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismatched: true - -train: - training_batch_size: 16 - eval_batch_size: 16 - num_epochs: 10 - dropout: 0.2 - objective: ce - smoothing: 0.1 - optimizer: sgd - weight_decay: 0.01 - discriminative_learning: false - learning_rates: [ 2e-1 ] - auto_lr: false - layer_lr_decay: 0.95 - lr_scheduler: true - adam_eps: 1e-8 - warmup_ratio: 0.06 - warmup_steps: true - max_grad_norm: 1.0 - accumulation_steps: 1 - logging_steps: 100 - -model: - _target_: bert-squeeze.models.lr.BowLogisticRegression - training_config: ${train} - vocab_size: 5000 - embed_dim: 256 - num_labels: 6 - name: bow_lr - -data: - _target_: bert-squeeze.data.modules.lr_module.LrDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - max_features: ${model.vocab_size} - 
train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/examples/configs/train_lstm.yaml b/examples/configs/train_lstm.yaml deleted file mode 100644 index 95e2745..0000000 --- a/examples/configs/train_lstm.yaml +++ /dev/null @@ -1,64 +0,0 @@ -task: - name: train - -general: - do_train: true - do_eval: false - debug: false - labels: - - sadness - - joy - - love - - anger - - fear - - surprise - output_dir: outputs - save_steps: 500 - validation_every_n_epoch: 1 - evaluate_during_training: true - get_mismated: true - -train: - training_batch_size: 16 - eval_batch_size: 16 - num_epochs: 10 - dropout: 0.2 - objective: ce - smoothing: 0.1 - optimizer: sgd - learning_rates: [ 2e-3 ] - auto_lr: false - lr_scheduler: true - max_grad_norm: 1.0 - accumulation_steps: 1 - logging_steps: 100 - -model: - _target_: bert-squeeze.models.lstm.LtLSTM - training_config: ${train} - vocab_len: 20000 - hidden_dim: 128 - num_labels: 6 - name: lstm - -data: - _target_: bert-squeeze.data.modules.lstm_module.LSTMDataModule - dataset_config: - is_local: false - path: emotion - split: - text_col: text - label_col: label - max_features: ${model.vocab_len} - train_batch_size: ${train.training_batch_size} - eval_batch_size: ${train.eval_batch_size} - -neptune: - user_name: julesbelveze - project: bert-tricks - tags: [ ] - logger: - _target_: neptune.new.integrations.pytorch_lightning.NeptuneLogger - project: ${neptune.user_name}/${neptune.project} - name: ${task.name} - diff --git a/examples/main.py b/examples/main.py deleted file mode 100644 index 211abe9..0000000 --- a/examples/main.py +++ /dev/null @@ -1,97 +0,0 @@ -# To run such a program one can run it the following way: -# python3 -m bert-squeeze.main -cp=configs -cn=training_config -# -# To override arguments of the config file run as follow: -# python3 -m bert_squeeze.main -cp=configs -cn=training_config --task=test +new_attr=test - -import hydra -import logging -import sys -import torch -from dotenv import load_dotenv -from hydra.utils import instantiate -from pkg_resources import resource_filename -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import LearningRateMonitor - -from bert_squeeze.utils import get_neptune_tags, load_model_from_exp -from bert_squeeze.utils.callbacks import CheckpointEveryNSteps -from bert_squeeze.utils.errors import ConfigurationException - -load_dotenv() -logging.basicConfig(stream=sys.stdout, level=logging.INFO) - - -@hydra.main(config_path="./configs/", config_name="training_config") -def run(args): - logging.info(f"Using config: {args}") - - data = instantiate(args.data) - data.prepare_data() - data.setup() - - if args.general.do_train: - neptune_logger = instantiate(args.neptune.logger) - neptune_logger.experiment["sys/tags"].add(get_neptune_tags(args)) - neptune_logger.log_hyperparams(args) - - model = instantiate(args.model, _recursive_=False) - - callbacks = [CheckpointEveryNSteps(args.general.save_steps)] - if args.train.get("lr_scheduler", False): - callbacks.append(LearningRateMonitor(logging_interval='epoch')) - - if args.general.get("quantization", None) is not None: - quantization_callback = instantiate(args.general.quantization) - callbacks.append(quantization_callback) - - if args.general.get("pruning", None) is not None: - 
pruning_callback = instantiate(args.general.pruning) - callbacks.append(pruning_callback) - - if "fast" in args.model._target_: - callbacks.append(instantiate(args.finetuning_callback)) - - # NOTE: when performing manual optimization the 'gradient_clip_val' flag needs - # to be set to None. - # Issue here: https://github.com/PyTorchLightning/pytorch-lightning/issues/7698 - trainer = Trainer( - gpus=torch.cuda.device_count(), - accumulate_grad_batches=args.train.accumulation_steps, - gradient_clip_val=args.train.max_grad_norm, - accelerator='ddp', - auto_lr_find=args.train.auto_lr, - logger=neptune_logger, - callbacks=callbacks, - check_val_every_n_epoch=args.general.validation_every_n_epoch - ) - - logging.info(f"Starting training: {model}") - - trainer.fit( - model=model, - train_dataloaders=data.train_dataloader(), - val_dataloaders=data.val_dataloader() - ) - - # exporting trained model to ONNX - input_sample = iter(data.test_dataloader).next() - model.to_onnx(f"{args.general.output_dir}/model.onnx", input_sample, export_params=True) - - if args.general.do_eval: - if not hasattr(args.general, "model_path"): - raise ConfigurationException("You are on 'eval' mode you need to specify path to model checkpoint.") - args.general.model_path = resource_filename("bert-squeeze", args.general.model_path) - - model = load_model_from_exp(path_to_folder=args.general.model_path, module=args.model._target_) - - model.eval() - trainer = Trainer( - gpus=torch.cuda.device_count(), - accelerator='ddp' - ) - trainer.test(model, datamodule=data) - - -if __name__ == "__main__": - run() diff --git a/tests/assistants/test_train_assistant.py b/tests/assistants/test_train_assistant.py new file mode 100644 index 0000000..9f0e8f1 --- /dev/null +++ b/tests/assistants/test_train_assistant.py @@ -0,0 +1,94 @@ +import pytest +from pytorch_lightning.loggers import TensorBoardLogger +from torch.utils.data import DataLoader + +from bert_squeeze.assistants.train_assistant import TrainAssistant +from bert_squeeze.data.modules import LSTMDataModule, LrDataModule, TransformerDataModule +from bert_squeeze.models import BowLogisticRegression, LtCustomBert, LtDeeBert, LtFastBert, LtLSTM, LtTheseusBert + + +@pytest.fixture +def lr_assistant(): + return TrainAssistant( + "lr", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6} + ) + + +class TestTrainAssistant: + def test_sanity_assistant(self, lr_assistant): + """""" + assert lr_assistant.general.num_labels == 6 + assert isinstance(lr_assistant.model, BowLogisticRegression) + assert isinstance(lr_assistant.data, LrDataModule) + assert isinstance(lr_assistant.logger, TensorBoardLogger) + + def test_data(self, lr_assistant): + """""" + assert isinstance(lr_assistant.data.train_dataloader(), DataLoader) + assert len(lr_assistant.data.train_dataloader()) == 1000 + + def test_bert_assistant(self): + """""" + bert_assistant = TrainAssistant( + "bert", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6}, + model_kwargs={"pretrained_model": "bert-base-uncased"} + ) + assert bert_assistant.general.num_labels == 6 + assert isinstance(bert_assistant.model, LtCustomBert) + assert bert_assistant.model.encoder.config._name_or_path == "bert-base-uncased" + assert isinstance(bert_assistant.data, TransformerDataModule) + + def test_lstm_assistant(self): + """""" + lstm_assistant = TrainAssistant( + "lstm", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6} + ) + assert 
lstm_assistant.general.num_labels == 6 + assert isinstance(lstm_assistant.model, LtLSTM) + assert isinstance(lstm_assistant.data, LSTMDataModule) + + def test_deebert_assistant(self): + """""" + deebert_assistant = TrainAssistant( + "deebert", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6}, + model_kwargs={"pretrained_model": "bert-base-uncased"} + ) + assert deebert_assistant.general.num_labels == 6 + assert isinstance(deebert_assistant.model, LtDeeBert) + assert deebert_assistant.model.bert.config._name_or_path == "bert-base-uncased" + assert isinstance(deebert_assistant.data, TransformerDataModule) + + def test_fastbert_assistant(self): + """""" + fastbert_assistant = TrainAssistant( + "fastbert", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6}, + model_kwargs={"pretrained_model": "bert-base-uncased"} + ) + assert fastbert_assistant.general.num_labels == 6 + assert isinstance(fastbert_assistant.model, LtFastBert) + assert fastbert_assistant.model.encoder.config._name_or_path == "bert-base-uncased" + assert isinstance(fastbert_assistant.data, TransformerDataModule) + assert len(fastbert_assistant.callbacks) > 0 + + def test_theseusbert_assistant(self): + """""" + fastbert_assistant = TrainAssistant( + "theseus-bert", + dataset_path="emotion", + general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6}, + model_kwargs={"pretrained_model": "bert-base-uncased"} + ) + assert fastbert_assistant.general.num_labels == 6 + assert isinstance(fastbert_assistant.model, LtTheseusBert) + assert fastbert_assistant.model.encoder.config._name_or_path == "bert-base-uncased" + assert isinstance(fastbert_assistant.data, TransformerDataModule)
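
For reference, below is a minimal usage sketch of the new `TrainAssistant`. It is not part of this diff: the constructor arguments mirror the new tests (the "emotion" dataset and "bert-base-uncased" model), while the `pytorch_lightning.Trainer` wiring and its flags are illustrative assumptions only.

```python
# Minimal sketch: wire the assistant's lazily instantiated components into a
# standard pytorch_lightning.Trainer. Dataset and Trainer flags are assumptions.
from pytorch_lightning import Trainer

from bert_squeeze.assistants import TrainAssistant

assistant = TrainAssistant(
    "bert",                                   # key into CONFIG_MAPPER -> train_bert.yaml
    dataset_path="emotion",                   # fills data.dataset_config.path
    general_kwargs={"labels": [0, 1, 2, 3, 4, 5], "num_labels": 6},
    model_kwargs={"pretrained_model": "bert-base-uncased"},
)

# Accessing the properties triggers instantiation (and data preparation/setup).
model = assistant.model
data = assistant.data

trainer = Trainer(
    max_epochs=assistant.train["num_epochs"],
    accumulate_grad_batches=assistant.train["accumulation_steps"],
    logger=assistant.logger,                  # TensorBoardLogger unless a 'logger' config is given
    callbacks=assistant.callbacks,
)
trainer.fit(
    model,
    train_dataloaders=data.train_dataloader(),
    val_dataloaders=data.val_dataloader(),
)
```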