After rewriting/refactoring the code base, run a test pipeline from start to end to check that everything works in principle.

In [1]:
import json
import logging
import numpy as np
import os
from pathlib import Path
import tensorflow as tf
from time import time

import importlib

2024-06-20 16:34:28.417931: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-20 16:34:28.425280: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-20 16:34:28.540415: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed handed to the training call further down (rand_seed=SEED).
SEED = 42

# Root logger at DEBUG level so that debug messages show up in the cell outputs.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
from modules import SequenceRepresentation as sr
from modules import ModelDataSet
from modules import ProfileFindingSetup
from modules import plotting
from modules import model_new as model
from modules import training_new as training
from modules import Links

# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
importlib.reload(sr)
importlib.reload(ModelDataSet)
importlib.reload(ProfileFindingSetup)
importlib.reload(plotting)
importlib.reload(model)
# `training` is imported together with the other modules but was missing here,
# so changes to it were never picked up. It is reloaded after `model`
# (NOTE(review): presumably `training` depends on `model` -- confirm the order).
importlib.reload(training)
# Keep `Links` last: the value of the final expression is what the cell displays.
importlib.reload(Links)

<module 'modules.Links' from '/home/ebelm/genomegraph/learn_specific_profiles/modules/Links.py'>

In [4]:
# Input directory that holds the sequence data for one exon.
datapath = Path("/home/ebelm/genomegraph/data/241_species/20231123_subset150_NM_RefSeqBest/20240605_fixed_out_subset150_withEnforced_20_15_20_50_15_20_15_20_mammals/exon_chr10_100356531_100356764/")

# Output directory for this test run; created (including parents) if it is missing.
outpath = Path("/home/ebelm/genomegraph/runs/20240605_testNewModel/")
outpath.mkdir(parents=True, exist_ok=True)

In [5]:
# Load the sequence list from the JSON file inside the data directory.
# The result is iterated over below to build one genome per sequence.
sequences = sr.loadJSONSequenceList(datapath / "profile_finding_sequence_data.json")

In [6]:
# Build one genome per loaded sequence (each genome holds exactly one sequence).
genomes: list[sr.Genome] = []
for sequence in sequences:
    # clean up the annotations first: make them unique, then select the
    # longest transcript (both helpers are called on the sequence itself)
    sr.makeAnnotationsUnique(sequence)
    sr.selectLongestTranscript(sequence)

    genome = sr.Genome()
    genome.addSequence(sequence)
    genomes.append(genome)

INFO:root:[makeAnnotationsUnique] Sequence Fukomys_damarensis:KN123008.1:2,409,178-2,440,546: Found and uniq-ed 0 redundant annotations; removed total of 0 redundant annotations
INFO:root:[selectLongestTranscript] Found and removed subsequence annotations in {sequence}. Removed total of 0 subsequence annotations
INFO:root:[makeAnnotationsUnique] Sequence Cercocebus_atys:KQ012652.1:5,274,598-5,280,660: Found and uniq-ed 1 redundant annotations; removed total of 1 redundant annotations
INFO:root:[makeAnnotationsUnique] Sequence Saimiri_boliviensis:JH378110.1:32,223,310-32,229,802: Found and uniq-ed 1 redundant annotations; removed total of 1 redundant annotations
INFO:root:[makeAnnotationsUnique] Sequence Hipposideros_armiger:JXIK01000029.1:2,457,613-2,462,101: Found and uniq-ed 0 redundant annotations; removed total of 0 redundant annotations
INFO:root:[selectLongestTranscript] Found and removed subsequence annotations in {sequence}. Removed total of 1 subsequence annotations


In [7]:
# Reduce the amount of data so that the test pipeline runs faster.
maxgenomes = 5     # keep only the first `maxgenomes` genomes
maxseqlen = 3000   # sequences longer than this are stripped down to this length

genomes = genomes[:maxgenomes]
for genome in genomes:
    for seq in genome.sequences:
        excess = len(seq) - maxseqlen
        if excess > 0:
            # NOTE(review): from_start=False presumably strips from the sequence end -- confirm
            seq.stripSequence(excess, from_start=False)

In [8]:
# store shortened data as test genomes for unittests
# Serialize completely before touching the file: opening in 'w' mode truncates it
# right away, so an error in toList() or in the JSON encoding would otherwise
# leave an empty or partial fixture behind for the unittests.
testdata_json = json.dumps([g.toList() for g in genomes], indent=2)
with open("/home/ebelm/genomegraph/learn_specific_profiles/tests/testdata.json", 'wt') as fh:
    fh.write(testdata_json)

In [9]:
runID = "0000"
starttime = time()
logging.info(f"[main] Start training and evaluation for run {runID}")

# Set up data set, training setup and evaluator for a run in DNA mode.
logging.info(f"[main] Start training and evaluation on model for {runID}")
data = ModelDataSet.ModelDataSet(
    genomes,
    ModelDataSet.DataMode.DNA,
    tiles_per_X=7,
    tile_size=334,
)
trainsetup = ProfileFindingSetup.ProfileFindingTrainingSetup(
    data,
    U=200,
    n_best_profiles=1,
)
trainsetup.initializeProfiles_kmers(enforceU=False, plot=False, overlapTilesize=6)
evaluator = training.MultiTrainingEvaluation()

# NOTE(review): a try/except around this call (logging the failure via logging.error)
# was commented out earlier; as written, any exception propagates to the notebook
# and the evaluator dump below is then not reached.
training.trainAndEvaluate(
    runID,
    trainsetup,
    evaluator,
    outpath,
    outprefix=f"{runID}_",
    trainingWithReporting=True,
    rand_seed=SEED,
)

# Persist the evaluation results next to the other run outputs.
evaluator.dump(outpath / "evaluator.json")

INFO:root:[main] Start training and evaluation for run 0000
INFO:root:[main] Start training and evaluation on model for 0000


INFO:root:[ProfileFindingSetup.ProfileFindingTrainingSetup.initializeProfiles] >>> Number of profiles: 10
DEBUG:root:[model.__init__] >>> setting tf global seed to 42
DEBUG:root:[model.__init__] >>> Using initProfiles from training setup instead of random
DEBUG:root:[ModelDataSet.createBatch] >>> [[[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]]]
DEBUG:root:[ModelDataSet.createBatch] >>> genomes[0][0][0][:min(10, len(genomes[0][0][0]))]='CCCTGATGGA'
DEBUG:root:[ModelDataSet.createBatch] >>> alphabet=['A', 'C', 'G', 'T']
2024-06-20 16:34:39.137969: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
DEBUG:root:[ModelDataSet.createBatch] >>> [[[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]]]
DEBUG:root:[ModelDataSet.createBatch] >>> genomes[0][0][0][:min(10, len(genomes[0][0][0]))]='CCCTGATGGA'
DEBUG:root:[ModelDataSet.createBatch] >>> alphabet=['A', 'C', '

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


DEBUG:root:[model.getR] >>> nan in P: Tensor("cond/Any:0", shape=(10,), dtype=bool) Tensor("cond/boolean_mask/GatherV2:0", shape=(20, 4, None), dtype=float32)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


DEBUG:root:[model.getR] >>> Q: [0.2664765  0.23352347 0.23352347 0.2664765 ]
DEBUG:root:[model.getZ] >>> nan in R
DEBUG:root:[model.getZ] >>> nan in X
DEBUG:root:[model.getZ] >>> nan in Z
DEBUG:root:[model.getR] >>> nan in P: Tensor("cond/Any:0", shape=(10,), dtype=bool) Tensor("cond/boolean_mask/GatherV2:0", shape=(20, 4, None), dtype=float32)
DEBUG:root:[model.getR] >>> Q: [0.2664765  0.23352347 0.23352347 0.2664765 ]
DEBUG:root:[model.getZ] >>> nan in R
DEBUG:root:[model.getZ] >>> nan in X
DEBUG:root:[model.getZ] >>> nan in Z
DEBUG:root:[ModelDataSet.createBatch] >>> [[[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]], [[3000, 3000]]]
DEBUG:root:[ModelDataSet.createBatch] >>> genomes[0][0][0][:min(10, len(genomes[0][0][0]))]='CCCTGATGGA'
DEBUG:root:[ModelDataSet.createBatch] >>> alphabet=['A', 'C', 'G', 'T']
2024-06-20 16:34:43.747179: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
DEBUG:root:[

AssertionError: 

In [None]:
# Show the first entry along each of the three index levels of the raw data.
# NOTE(review): presumably genome 0 / sequence 0 / first element -- confirm against ModelDataSet.
# The full value is displayed, which can produce very long output.
data.getRawData()[0][0][0]

'ccctgatggaggtggggatatggccctggcacctggtcattagggaccccattttttctcctgagactttcaaaatataagctgagaaatttgctgggtttgcatgttcacaatcttaatttaaaatcccaatttttaacatcccacgggcccgtagccatagactattgctccatttctttctctctgactatcttaattaaacccattacattcaagagatgtttattgtcctaggacagtcatagattcaaagatgattatagcctagttgcctaggtttgtttgtttgtttttgtgtttgtgtttcaacagtctttctctcttgcccaggctggagtgcagtggcacaatcatggctcactgcagccttgacttcccaggctcaagcaatccttctacctcaacctcctgagtatctgggactacaggcacacaccgccatgcctggctaattttttgtggggacaaggtctcactcactatattgcccaggccggtagcttagttcttaccttcaaaaagtttgtagcctatcggggtggagagataagccaagtatccagataaccatggcataaggcagaatattttctgtactatgagaggtacaaaggggagggagattgctcaatgggcaacaccaaggaagtgatatgaaataaatagtgttggaatccaccaacggatagaaatttttacaactatgtgtggggagagacagtgcaaacagaagaaacagaatgagctaaaacacgaagcatgttccagcaatagagtccttttgcttgaagtatagggtatgggaagaagtaagactggagagactaatgccattcttgtcgagtcctaaaagcagacttaggacttaattcaataagcaataggaagccattacatcttttgaactgcaatgtggcatagttacggacgtgctttaggaaggctgcttttagaacaagtgtaagaaaccactgagccaaagtgagaggtagggac