In [19]:
import pandas as pd
import numpy as np
import re

def load_data(file_path):
    """增强型数据加载函数"""
    # 读取原始数据
    with open(file_path, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]
    
    # 解析列名
    header = re.split(r'\t+|\s{2,}', lines[0])  # 兼容制表符和双空格分隔
    
    # 构建数据框
    data = []
    for line in lines[1:]:
        parts = re.split(r'\t+|\s{2,}', line)  # 统一分隔符处理
        row = [parts[0]]  # GeneID
        for val in parts[1:]:
            try:
                # 处理混合分隔符数值
                clean_val = re.sub(r'[^0-9\.\-eE]', ' ', val)
                nums = [float(v) for v in clean_val.split() if v]
                row.append(np.mean(nums) if nums else np.nan)
            except:
                row.append(np.nan)
        data.append(row)
    
    return pd.DataFrame(data, columns=header)

def detect_wholebody_columns(df):
    """动态识别Whole_body列（包含Egg/Pupa特殊处理）"""
    pattern = r'''
    (?:^|_)          # 列名起始或前置分隔符
    (
        Whole[_\-]?body |   # 标准whole_body格式
        WB          |       # 缩写
        Egg         |       # 卵阶段
        Pupa                # 蛹阶段
    )
    (?=_|$|__)       # 确保是独立部分
    '''
    matches = []
    for col in df.columns:
        # 标准化列名结构
        normalized = re.sub(r'_{2,}', '__', col)
        
        # 验证三段式结构
        if len(normalized.split('__')) != 3:
            continue
            
        # 提取组织类型
        tissue = normalized.split('__')[-1]
        
        # 特殊处理Egg/Pupa
        if re.fullmatch(pattern, tissue, re.VERBOSE | re.IGNORECASE):
            matches.append(col)
    
    print("最终匹配的Whole_body列：")
    print('\n'.join(matches) if matches else "无匹配列")
    return matches

def calculate_tau(series):
    """鲁棒性Tau计算"""
    valid_values = series.dropna()
    if len(valid_values) < 2:
        return np.nan
    
    # 转换线性空间
    linear = 2 ** valid_values
    max_val = linear.max()
    
    # 计算tau
    return (1 - linear/max_val).sum() / (len(linear)-1)

def process_stages(df, whole_body_cols):
    """阶段处理引擎"""
    stage_data = []
    for col in whole_body_cols:
        parts = col.split('__')
        stage = f"{parts[0]}__{parts[1]}"
        stage_data.append((col, stage))
    
    stage_df = pd.DataFrame(stage_data, columns=['Column', 'Stage'])
    
    results = []
    for stage, group in stage_df.groupby('Stage'):
        cols = group['Column'].tolist()
        if len(cols) < 2:
            continue
            
        # 计算各基因Tau
        stage_df = df[['GeneID'] + cols].copy()
        stage_df['Tau'] = stage_df[cols].apply(calculate_tau, axis=1)
        stage_df['Developmental_Stage'] = stage
        results.append(stage_df[['GeneID', 'Developmental_Stage', 'Tau']])
    
    return pd.concat(results).dropna().round(4)

if __name__ == "__main__":
    df = load_data("GeneExpression_GroupedData.tsv")
    whole_body_cols = detect_wholebody_columns(df)
    
    if not whole_body_cols:
        print("未检测到有效Whole_body列")
    else:
        result = process_stages(df, whole_body_cols)
        print("\n计算结果示例：")
        print(result.head())
        result.to_csv("tau_results.csv", index=False)
        print("结果已保存到 tau_results.csv")


最终匹配的Whole_body列：
Egg__0_days__Egg
Egg__1_day__Egg
Egg__2_days__Egg
Egg__3_days__Egg
Larva__5_instar__Whole_body
Larva__1_instar__Whole_body
Larva__0_day_of_3_instar__Whole_body
Egg__24_hours__Egg
Egg__72_hours__Egg
Egg__120_hours__Egg
Egg__168_hours__Egg
Egg__216_hours__Egg
Egg__192_hours__Egg
Pupa__pre__Pupa
Pupa__4_days__Pupa
Pupa__1_days__Pupa
Pupa__7-8_days__Pupa
Pupa__1_day__Pupa
Egg__36_hours__Egg
Larva__2_instar__Whole_body
Larva__12_day_of_4_instar__Whole_body
Larva__11_day_of_4_instar__Whole_body
Larva__10_day_of_4_instar__Whole_body
Larva__3_day_of_4_instar__Whole_body
Larva__2_day_of_4_instar__Whole_body
Larva__1_day_of_4_instar__Whole_body
Egg__20-22_hours__Egg
Pupa__144_hours__Pupa
Pupa__84_hours__Pupa
Pupa__12_hours__Pupa
Larva__2_day_of_5_instar__Whole_body
Egg__8_days__Egg
Larva__1_day_of_1_instar__Whole_body
Pupa__2_day_of_pre__Pupa
Pupa__newly__Pupa
Pupa__stage_5__Pupa


ValueError: No objects to concatenate

In [17]:
# 执行分析
result = analyze("GeneExpression_GroupedData.tsv")
print(result.to_markdown(index=False)) if not result.empty else print("无有效数据")

发现 0 个Whole_body样本


KeyError: "None of [Index(['GeneID'], dtype='object')] are in the [columns]"

In [13]:
# Tau计算核心函数
def calculate_tau(series):
    values = series.astype(float)
    values = np.where(values < -5, 0, 2**values)  # 处理低表达噪声
    max_val = values.max()
    if max_val == 0 or len(values) < 2:
        return np.nan
    return sum(1 - values/max_val) / (len(values)-1)

In [14]:
# 主流程
def main(data):
    df = preprocess_data(data)
    
    # 提取Whole_body组织数据
    whole_body_cols = [col for col in df.columns if '__Whole_body' in col]
    
    # 解析发育阶段
    stage_info = pd.DataFrame([
        (col, col.split('__')[1]) 
        for col in whole_body_cols
    ], columns=['Column', 'Developmental_Stage'])
    
    # 按发育阶段分组计算
    results = []
    for stage, group in stage_info.groupby('Developmental_Stage'):
        cols = group['Column'].tolist()
        if len(cols) < 2:  # Tau计算需要至少两个样本
            continue
            
        stage_df = df[['GeneID'] + cols].copy()
        stage_df['Tau'] = stage_df[cols].apply(calculate_tau, axis=1)
        stage_df['Developmental_Stage'] = stage
        results.append(stage_df[['GeneID', 'Developmental_Stage', 'Tau']])
    
    return pd.concat(results).reset_index(drop=True)

In [15]:
result = main("GeneExpression_GroupedData.tsv")
print(result.to_markdown(index=False))

ValueError: No objects to concatenate