In [133]:
import pandas as pd

In [134]:
import re

In [135]:
# Read the tab-separated orthogroup table. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    'Orthogroups.tsv',
    sep='\t',
    low_memory=False,
)

In [136]:
# Load the species names of interest. The file is read without a header row,
# so pandas labels its first column with the integer 0.
species = pd.read_csv('important_species_list.txt', header=None)
species_list = species.loc[:, 0].tolist()

In [137]:
# Show the species names that were read from the list file.
print(species_list)

['Bombyx_mori', 'Apis_mellifera']


In [138]:
# Build one alternation regex from the species names, e.g.
# 'Bombyx_mori|Apis_mellifera'. Each name is passed through re.escape so that
# any regex metacharacter it contains (such as '.') is matched literally.
# The pattern is unanchored and carries no flags of its own: partial and
# case-insensitive matching depend on how it is applied (re.search with
# re.IGNORECASE when the columns are selected).
species_pattern = "|".join(re.escape(name) for name in species_list)


In [139]:
# Collect every column whose name contains one of the species names
# (case-insensitive, substring match); the first column is always kept.
species_regex = re.compile(species_pattern, re.IGNORECASE)
matching_cols = [name for name in df.columns if species_regex.search(name)]
cols_to_keep = [df.columns[0], *matching_cols]

In [140]:
# Build the extracted DataFrame from the selected columns only.
df_filtered = df.loc[:, cols_to_keep]

In [141]:
# Discard the "Bombyx_mori.protein" column and let "Bombyx_mori_new.protein"
# take over its name. The frame is rebuilt from df[cols_to_keep] with
# non-mutating calls so that re-running this cell always yields the same
# result; an in-place drop/rename on df_filtered itself would drop the freshly
# renamed column on a second run. errors="raise" makes a missing
# "Bombyx_mori_new.protein" column fail loudly instead of silently leaving the
# table without any Bombyx_mori column.
df_filtered = (
    df[cols_to_keep]
    .drop(columns="Bombyx_mori.protein")
    .rename(
        columns={"Bombyx_mori_new.protein": "Bombyx_mori.protein"},
        errors="raise",
    )
)

In [142]:
# Remove every literal ".protein" from the column names so that only the
# species name remains.
stripped_names = df_filtered.columns.str.replace(r'\.protein', '', regex=True)
df_filtered.columns = stripped_names


In [143]:
# Preview the filtered table (pandas truncates the printed rows).
print(df_filtered)

       Orthogroup Apis_mellifera  \
0       OG0000000            NaN   
1       OG0000001            NaN   
2       OG0000002            NaN   
3       OG0000003            NaN   
4       OG0000004            NaN   
...           ...            ...   
280541  OG0280541            NaN   
280542  OG0280542            NaN   
280543  OG0280543            NaN   
280544  OG0280544            NaN   
280545  OG0280545            NaN   

                                              Bombyx_mori  
0       XM_004930167.5, XM_004934113.5, XM_004934284.2...  
1       XM_004933773.5, XM_021348530.3, XM_038011536.2...  
2       XM_012691466.4, XM_021348769.3, XM_021353027.2...  
3       XM_004926346.2, XM_004926774.5, XM_004931210.2...  
4       XM_004921625.2, XM_004925445.2, XM_012693465.1...  
...                                                   ...  
280541                     NM_001146241.1, XM_038017920.2  
280542                     XM_038011851.2, XM_062668998.1  
280543                     

In [144]:
# All column labels of df_filtered. NOTE(review): this includes the first
# (orthogroup ID) column as well as the species columns.
gene_columns = df_filtered.columns


In [145]:
# Show which columns will be passed to the gene-ID counting step.
print(gene_columns)

Index(['Orthogroup', 'Apis_mellifera', 'Bombyx_mori'], dtype='object')


In [146]:
def count_gene_ids(cell):
    """Count the gene IDs held in one table cell.

    A missing cell (as reported by ``pd.isnull``) counts as 0. Any other
    value is converted to ``str`` and split on ','; every piece that is
    non-empty after stripping surrounding whitespace is counted.
    """
    if pd.isnull(cell):
        return 0
    # str() guards against non-string cell values before splitting.
    pieces = (piece.strip() for piece in str(cell).split(','))
    return sum(1 for piece in pieces if piece)

In [147]:
# Apply count_gene_ids to every cell of the selected columns, giving a table
# of per-cell gene-ID counts with the same shape and index.
gene_table = df_filtered[gene_columns]
counts_df = gene_table.map(count_gene_ids)


In [148]:
# Preview the per-cell gene-ID counts.
print(counts_df)

        Orthogroup  Apis_mellifera  Bombyx_mori
0                1               0          290
1                1               0           81
2                1               0          101
3                1               0           96
4                1               0           80
...            ...             ...          ...
280541           1               0            2
280542           1               0            2
280543           1               0            2
280544           1               0            2
280545           1               0            2

[280546 rows x 3 columns]


In [149]:
# Minimum share of species columns that must hold exactly one gene ID for an
# orthogroup (row) to be kept.
MIN_SINGLE_COPY_FRACTION = 0.75

# Leave the orthogroup ID column out of the ratio: its cell always counts as a
# single entry, so including it would inflate the single-copy share.
# errors="ignore" keeps this working if counts_df no longer has that column.
species_counts = counts_df.drop(columns=[df_filtered.columns[0]], errors="ignore")

# Vectorized per-row share of species columns with exactly one gene ID; the
# result is a boolean Series aligned with the rows of counts_df.
single_copy_share = (species_counts == 1).sum(axis=1) / species_counts.shape[1]
valid_rows = single_copy_share >= MIN_SINGLE_COPY_FRACTION


In [150]:
# Preview the boolean row mask (True marks a row that passes the filter).
print(valid_rows)

0         False
1         False
2         False
3         False
4         False
          ...  
280541    False
280542    False
280543    False
280544    False
280545    False
Length: 280546, dtype: bool


In [151]:
# Keep only the rows of the filtered table that the boolean mask marks True.
df_valid = df_filtered.loc[valid_rows]


In [152]:
# Preview the rows that passed the filter.
print(df_valid)

     Orthogroup Apis_mellifera     Bombyx_mori
131   OG0000131   Amei004732.1  XM_038013974.2
289   OG0000289   Amei001984.1  NM_001043730.1
438   OG0000438   Amei007362.1  XM_062670092.1
468   OG0000468   Amei003734.1  NM_001043886.1
533   OG0000533   Amei000506.1  XM_038016145.2
...         ...            ...             ...
8133  OG0008133   Amei008956.1  XM_004922782.3
8231  OG0008231   Amei007448.1  XM_062670663.1
8335  OG0008335   Amei006714.1  XM_004929248.5
8338  OG0008338   Amei000943.1  XM_038017346.2
8356  OG0008356   Amei008809.1  XM_004924256.3

[2569 rows x 3 columns]


In [153]:
# Write the selected rows to a file (tab-separated, without the index column).
df_valid.to_csv(
    'apis_and_bombyx.txt',
    sep='\t',
    index=False,
)