# Testing `random1000_10` dataset

**Authorship:**
Adam Klie, *07/26/2022*
***
**Description:**
Notebook to test working with the `random1000_10` dataset.
***

In [2]:
import pandas as pd
import numpy as np

# Load the IPython autoreload extension unless this session already has it
# loaded, then switch it to mode 2 so imported modules are reloaded before
# each cell executes.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Import the package under test and display its version as the cell output
import eugene as eu
eu.__version__

'0.1.0'

# Load and preprocess the data

In [3]:
# Load the built-in random1000_10 dataset and display the resulting object
# (the bare `sdata` on the last line renders its summary as the cell output)
sdata = eu.datasets.random1000_10()
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9', 'ACTIVITY_0', 'ACTIVITY_1', 'ACTIVITY_2', 'ACTIVITY_3', 'ACTIVITY_4', 'ACTIVITY_5', 'ACTIVITY_6', 'ACTIVITY_7', 'ACTIVITY_8', 'ACTIVITY_9'
pos_annot: None
seqsm: None
uns: None

In [4]:
# Store the length of every sequence as a "SEQ_LEN" entry on sdata
sdata["SEQ_LEN"] = list(map(len, sdata.seqs))

In [7]:
# Prepare data for training. Per the summary printed below, this fills in
# rev_seqs, ohe_seqs and ohe_rev_seqs and adds a TRAIN column to seqs_annot.
eu.pp.prepare_data(sdata)

  0%|          | 0/3 [00:00<?, ?it/s]

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + TRAIN


In [17]:
# Add positional annotation to sdata, built from the sequence names and sequences
sdata.pos_annot = eu.pp.convert2pyRanges(sdata.names, sdata.seqs)

# An assignment displays nothing, so end the cell with `sdata` to render the
# updated object summary (including the new pos_annot entry) as the cell output.
sdata

0it [00:00, ?it/s]

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9', 'ACTIVITY_0', 'ACTIVITY_1', 'ACTIVITY_2', 'ACTIVITY_3', 'ACTIVITY_4', 'ACTIVITY_5', 'ACTIVITY_6', 'ACTIVITY_7', 'ACTIVITY_8', 'ACTIVITY_9', 'SEQ_LEN', 'TRAIN'
pos_annot: PyRanges object with 1436 features
seqsm: None
uns: None

# Train a model on this data

In [18]:
# Single-task DeepBind architecture: the input length is the longest sequence
# in sdata and there is one output. Details such as the loss function are not
# important for this test.
model = eu.models.DeepBind(
    input_len=sdata["SEQ_LEN"].max(),
    output_dim=1,
)

In [20]:
# Train the model on the random data for 5 epochs, using ACTIVITY_0 as the target
eu.train.fit(
    model,
    sdata=sdata,
    target_label="ACTIVITY_0",
    epochs=5,
    log_dir="../_logs",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | avg_pool  | AvgPool1d                 | 0     
3 | convnet   | BasicConv1D               | 272   
4 | fcn       | BasicFullyConnectedModule | 146 K 
--------------------------------------------------------
147 K     Trainable params
0         Non-trainable params
147 K     Total params
0.588     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 13
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [22]:
# Predict ACTIVITY_0 on the random data, using the TRAIN column as the
# train index label; per the summary printed below, an
# ACTIVITY_0_PREDICTIONS column is added to seqs_annot.
eu.predict.train_val_predictions(
    model,
    sdata=sdata,
    target_label="ACTIVITY_0",
    train_idx_label="TRAIN",
    out_dir="../_out/random1000_10",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + ACTIVITY_0_PREDICTIONS


In [24]:
# Keep training the same `model` object for 5 more epochs, now with
# ACTIVITY_1 as the target label
eu.train.fit(
    model,
    sdata=sdata,
    target_label="ACTIVITY_1",
    epochs=5,
    log_dir="../_logs",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | avg_pool  | AvgPool1d                 | 0     
3 | convnet   | BasicConv1D               | 272   
4 | fcn       | BasicFullyConnectedModule | 146 K 
--------------------------------------------------------
147 K     Trainable params
0         Non-trainable params
147 K     Total params
0.588     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 13
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [25]:
# Predict ACTIVITY_1 on the random data; per the summary printed below, an
# ACTIVITY_1_PREDICTIONS column is added to seqs_annot.
eu.predict.train_val_predictions(
    model,
    sdata=sdata,
    target_label="ACTIVITY_1",
    train_idx_label="TRAIN",
    out_dir="../_out/random1000_10",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + ACTIVITY_1_PREDICTIONS


In [27]:
# Rebind `model` to a fresh multi-task DeepBind with 10 outputs
# (this replaces the single-task model used in the cells above)
model = eu.models.DeepBind(
    input_len=sdata["SEQ_LEN"].max(),
    output_dim=10,
)

In [28]:
# Column names ACTIVITY_0 .. ACTIVITY_9, one per output of the multi-task model
targets = ["ACTIVITY_" + str(task) for task in range(10)]

# Fit the new multi-tasked model on all ten targets for 5 epochs
eu.train.fit(
    model,
    sdata=sdata,
    target_label=targets,
    epochs=5,
    log_dir="../_logs",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | avg_pool  | AvgPool1d                 | 0     
3 | convnet   | BasicConv1D               | 272   
4 | fcn       | BasicFullyConnectedModule | 146 K 
--------------------------------------------------------
147 K     Trainable params
0         Non-trainable params
147 K     Total params
0.589     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 13
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [34]:
# Predict all ten activity targets on the random data with the multi-task model
eu.predict.train_val_predictions(
    model,
    sdata=sdata,
    target_label=targets,
    train_idx_label="TRAIN",
    out_dir="../_out/random1000_10",
)

# Interpret the trained model

In [38]:
# Generate PFMs from the model and the random data, passing all ten targets
eu.interpret.generate_pfms(
    model,
    sdata,
    target_label=targets,
)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [61]:
# Compute per-nucleotide feature importances for target 0.
# NOTE(review): the cells below read uns_key="InputXGradient_imps", which is
# presumably what this call stores — confirm.
eu.interpret.feature_attribution(
    model,
    sdata,
    target=0,
)

0
No transforms given, assuming just need to tensorize).


  0%|          | 0/31 [00:00<?, ?it/s]

In [63]:
# Aggregate the feature importances stored under the "InputXGradient_imps"
# key and add them to the positional annotation
eu.interpret.aggregate_importance(sdata, uns_key="InputXGradient_imps")

In [67]:
# Run PCA on the feature importances stored under "InputXGradient_imps".
# NOTE(review): no figure appears in this cell's output; the save step at the
# end of the notebook mentions an "InputXGradient_imps_pca" entry, so the
# result is presumably stored on sdata rather than plotted — confirm.
eu.interpret.pca(sdata, uns_key="InputXGradient_imps")

Make sure your matrix is sample by feature


In [68]:
# Run UMAP on the feature importances stored under "InputXGradient_imps".
# NOTE(review): no figure appears in this cell's output; the save step at the
# end of the notebook mentions an "InputXGradient_imps_umap" entry, so the
# result is presumably stored on sdata rather than plotted — confirm.
eu.interpret.umap(sdata, uns_key="InputXGradient_imps")

Make sure your matrix is sample by feature


# Save the processed data object

In [74]:
# Write the processed sdata object to an h5sd file; the path is relative, so
# it resolves against the directory the notebook is run from.
# NOTE(review): the output below reports "Unsupported type" for the
# InputXGradient_imps_pca and InputXGradient_imps_umap entries; presumably
# those are not written to the file — confirm.
eu.dl.write_h5sd(sdata, "../../eugene/datasets/random1000_10/random1000_10_processed.h5sd")

Unsupported type for InputXGradient_imps_pca
Unsupported type for InputXGradient_imps_umap


---

# Scratch