In [7]:
import h5py

def inspect_h5_file_for_ids(file_path):
    """
    检查 h5 文件中是否包含基因 ID 或其他标识符
    
    Args:
        file_path: h5 文件的路径
    """
    print(f"检查文件: {file_path}")
    
    try:
        with h5py.File(file_path, 'r') as h5_file:
            # 打印文件的所有顶层键
            print("\n顶层键:")
            for key in h5_file.keys():
                print(f"  {key}")
                
            # 检查是否有可能的基因 ID 数据集
            print("\n可能的基因 ID 数据集:")
            for key in h5_file.keys():
                if 'id' in key.lower() or 'gene' in key.lower():
                    print(f"  {key}: {h5_file[key].shape}, 类型 {h5_file[key].dtype}")
    
    except Exception as e:
        print(f"读取文件出错: {e}")

# 使用示例
h5_file_path = "/home/gl/projects/Borzoi/paddy/data/seqs_cov/0.h5"  # 替换为您的 h5 文件路径
inspect_h5_file_for_ids(h5_file_path)

检查文件: /home/gl/projects/Borzoi/paddy/data/seqs_cov/0.h5

顶层键:
  targets

可能的基因 ID 数据集:


In [8]:
import h5py
import numpy as np
import os
import glob
from natsort import natsorted
from pprint import pprint

def inspect_h5_file(file_path):
    """
    检查 h5 文件的结构和内容，查找是否包含基因 ID
    
    Args:
        file_path: h5 文件的路径
    """
    print(f"检查文件: {file_path}")
    
    try:
        with h5py.File(file_path, 'r') as h5_file:
            # 打印文件结构
            print("\n文件结构:")
            
            def print_structure(name, obj):
                indent = '  ' * name.count('/')
                if isinstance(obj, h5py.Dataset):
                    shape_str = str(obj.shape)
                    dtype_str = str(obj.dtype)
                    print(f"{indent}{name}: Dataset {shape_str}, 类型 {dtype_str}")
                    
                    # 检查是否可能包含 ID（样本查看前5个元素）
                    if len(obj.shape) == 1 and obj.size > 0:
                        print(f"{indent}  样本数据 (前5个):")
                        sample = obj[:5]
                        
                        # 如果是字节类型，尝试解码
                        if isinstance(sample[0], bytes):
                            try:
                                decoded = [s.decode('utf-8') for s in sample]
                                print(f"{indent}    {decoded}")
                            except:
                                print(f"{indent}    {sample} (无法解码)")
                        else:
                            print(f"{indent}    {sample}")
                else:
                    print(f"{indent}{name}: Group")
            
            h5_file.visititems(print_structure)
            
            # 获取并显示所有属性
            print("\n文件属性:")
            for attr_name, attr_value in h5_file.attrs.items():
                print(f"  {attr_name}: {attr_value}")
            
            # 获取并显示所有顶层数据集和组
            print("\n顶层项目:")
            for key in h5_file.keys():
                item = h5_file[key]
                if isinstance(item, h5py.Dataset):
                    print(f"  {key}: Dataset {item.shape}, 类型 {item.dtype}")
                    
                    # 对数据集，尝试显示前几个值
                    if len(item.shape) == 1 and item.size > 0:
                        print(f"    样本数据 (前5个): {item[:5]}")
                    
                    # 显示数据集的属性
                    if item.attrs:
                        print(f"    属性:")
                        for attr_name, attr_value in item.attrs.items():
                            print(f"      {attr_name}: {attr_value}")
                        
                else:  # h5py.Group
                    print(f"  {key}: Group，包含 {len(item.keys())} 个项目")
                    
                    # 列出组中的前几个项目
                    subkeys = list(item.keys())[:5]
                    if subkeys:
                        print(f"    子项目 (前5个): {subkeys}")
    
    except Exception as e:
        print(f"读取文件出错: {e}")

def inspect_multiple_h5_files(directory, num_files=3):
    """
    检查目录中的多个 h5 文件
    
    Args:
        directory: 包含 h5 文件的目录
        num_files: 要检查的文件数量
    """
    # 获取所有 h5 文件
    h5_files = natsorted(glob.glob(os.path.join(directory, "*.h5")))
    
    if not h5_files:
        print(f"在 {directory} 中未找到 h5 文件")
        return
    
    print(f"在 {directory} 中找到 {len(h5_files)} 个 h5 文件")
    
    # 检查前 num_files 个文件
    for i, file_path in enumerate(h5_files[:num_files]):
        print(f"\n{'='*50}")
        print(f"检查文件 {i+1}/{min(num_files, len(h5_files))}: {os.path.basename(file_path)}")
        print(f"{'='*50}")
        inspect_h5_file(file_path)
    
    # 如果有更多文件，显示它们的名称
    if len(h5_files) > num_files:
        print(f"\n还有 {len(h5_files) - num_files} 个文件未检查:")
        for file_path in h5_files[num_files:num_files+5]:
            print(f"  {os.path.basename(file_path)}")
        if len(h5_files) > num_files + 5:
            print(f"  ... 以及 {len(h5_files) - num_files - 5} 个其他文件")

# 运行此代码来检查指定目录中的 h5 文件
h5_dir = "/home/gl/projects/Borzoi/paddy/data/seqs_cov"  # 请将此路径替换为您的 h5 文件目录
inspect_multiple_h5_files(h5_dir, num_files=2)  # 检查前两个文件

# 如果您想检查特定的单个文件
# specific_file = "/home/gl/projects/Borzoi/paddy/data/seqs_cov/your_specific_file.h5"  # 替换为您要检查的文件
# inspect_h5_file(specific_file)

在 /home/gl/projects/Borzoi/paddy/data/seqs_cov 中找到 106 个 h5 文件

检查文件 1/2: 0.h5
检查文件: /home/gl/projects/Borzoi/paddy/data/seqs_cov/0.h5

文件结构:
targets: Dataset (41969, 1024), 类型 float16

文件属性:

顶层项目:
  targets: Dataset (41969, 1024), 类型 float16

检查文件 2/2: 1.h5
检查文件: /home/gl/projects/Borzoi/paddy/data/seqs_cov/1.h5

文件结构:
targets: Dataset (41969, 1024), 类型 float16

文件属性:

顶层项目:
  targets: Dataset (41969, 1024), 类型 float16

还有 104 个文件未检查:
  2.h5
  3.h5
  4.h5
  5.h5
  6.h5
  ... 以及 99 个其他文件


In [3]:
import tensorflow as tf

def parse_tfrecord(example_proto):
    """
    Parse a single example from a TFRecord file.
    Adjust the feature description according to your TFRecord structure.
    """
    feature_description = {
        'sequence_length': tf.io.FixedLenFeature([], tf.int64),
        'num_tracks': tf.io.FixedLenFeature([], tf.int64),
        'tracks': tf.io.FixedLenFeature([], tf.string)
    }
    return tf.io.parse_single_example(example_proto, feature_description)

def inspect_tfrecord(file_path):
    """
    Inspect the structure of a TFRecord file.
    """
    dataset = tf.data.TFRecordDataset(file_path)
    
    for example in dataset.take(1):  # Take the first example for inspection
        parsed_example = parse_tfrecord(example)
        print("Parsed Example:")
        for key, value in parsed_example.items():
            print(f"{key}: {value.numpy()}")  # Convert tensors to numpy for easier reading

if __name__ == "__main__":
    tfrecord_file_path = "/home/gl/projects/Borzoi/borzoi/examples/data/hg38/tfrecords/fold1-13.tfr"
    inspect_tfrecord(tfrecord_file_path)

2025-05-17 17:09:17.441959: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-17 17:09:17.459577: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-17 17:09:17.459597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-17 17:09:17.460118: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-17 17:09:17.463508: I tensorflow/core/platform/cpu_feature_guar

DataLossError: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} corrupted record at 0 (Is this even a TFRecord file?) [Op:IteratorGetNext] name: 

In [10]:
import tensorflow as tf
from sklearn import datasets
import numpy as np

x_train = datasets.load_iris().data
y_train = datasets.load_iris().target

np.random.seed(116)
np.random.shuffle(x_train)
np.random.shuffle(y_train)
tf.random.set_seed(116)

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2())
])

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['sparse_categorical_accuracy'])

model.fit(x_train, y_train, batch_size=32, epochs=500, validation_split=0.2, validation_freq=20)

model.summary()


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [12]:
import tensorflow as tf

print("TensorFlow 版本:", tf.__version__)
print("可用的GPU列表：", tf.config.list_physical_devices('GPU'))

TensorFlow 版本: 2.15.1
可用的GPU列表： [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [13]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['sparse_categorical_accuracy'])

model.fit(x_train, y_train, batch_size=32, epochs=5, validation_data=(x_test, y_test), validation_freq=1)

model.summary()

import tensorflow as tf


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
 2285568/11490434 [====>.........................] - ETA: 2:55

KeyboardInterrupt: 

In [None]:
def generateds(path, txt):
    f = open(txt, 'r')
    contents = f.readlines()
    f.close()
    x, y_ = [], []
    for content in contents:
        value = content.split()

In [4]:
import os
# print current directory
print(os.getcwd())

/home/gl/projects/Borzoi/paddy


In [6]:
data_stats_file = f"./statistics.json"
with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

FileNotFoundError: [Errno 2] No such file or directory: './statistics.json'

In [14]:
# create a random int 2 * 2 * 3 matrix
# help me write a function to create a random int 2 * 2 * 3 matrix
import numpy as np
import tensorflow as tf
def create_random_matrix():
    seed = 1234 
    np.random.seed(seed)
    tf.random.set_seed(seed)
    return np.random.randint(0, 10, (2, 2, 3))

matrix = create_random_matrix()
print(matrix)
# use tf.reduce_sum to sum the matrix
matrix_sum = tf.reduce_sum(matrix, axis=[0,1])
matrix_sum





[[[3 6 5]
  [4 8 9]]

 [[1 7 9]
  [6 8 0]]]


<tf.Tensor: shape=(3,), dtype=int64, numpy=array([14, 29, 23])>