In [1]:
import pandas as pd

In [2]:
import re

In [27]:
df = pd.read_csv('Orthogroups.tsv', sep='\t', low_memory=False)

In [28]:
species = pd.read_csv('Lepidoptera_species_list.txt', header=None)
species_list = species[0].tolist()

In [29]:
print(species_list)

['Actias_luna', 'Adoxophyes_honmai', 'Agrotis_ipsilon', 'Antheraea_pernyi', 'Antheraea_yamamai', 'Arctia_plantaginis', 'Aricia_agestis', 'Autographa_gamma', 'Bicyclus_anynana', 'Bombyx_mandarina', 'Bombyx_mori', 'Busseola_fusca', 'Carposina_sasakii', 'Chilo_suppressalis', 'Chrysodeixis_includens', 'Conogethes_punctiferalis', 'Conopomorpha_cramerella', 'Cydia_pomonella', 'Danaus_plexippus', 'Dendrolimus_punctatus', 'Diatraea_saccharalis', 'Dione_vanillae', 'Drepana_arcuata', 'Ectropis_grisescens', 'Galleria_mellonella', 'Grapholita_molesta', 'Heliconius_cydno', 'Heliconius_erato', 'Heliconius_himera', 'Heliconius_melpomene', 'Heliconius_numata', 'Helicoverpa_armigera', 'Helicoverpa_zea', 'Heliothis_subflexa', 'Heliothis_virescens', 'Heortia_vitessoides', 'Hyphantria_cunea', 'Junonia_coenia', 'Lerema_accius', 'Mamestra_configurata', 'Manduca_sexta', 'Melitaea_cinxia', 'Noctua_pronuba', 'Nymphalis_urticae', 'Nymphula_nitidulata', 'Operophtera_brumata', 'Ostrinia_furnacalis', 'Ostrinia_nub

In [30]:
# 生成正则匹配模式（忽略大小写，并允许部分匹配）
species_pattern = "|".join(species_list)  # 组合成正则表达式 'Actias_luna|Adoxophyes_honmai|...'


In [31]:
# 找到所有匹配的列（第一列始终保留）
cols_to_keep = [df.columns[0]] + [col for col in df.columns if re.search(species_pattern, col, re.IGNORECASE)]

In [32]:
# 生成提取后的 DataFrame
df_filtered = df[cols_to_keep]

In [33]:
df_filtered = df_filtered.copy()
df_filtered.drop("Bombyx_mori.protein", axis=1, inplace=True)
df_filtered.rename(columns={"Bombyx_mori_new.protein": "Bombyx_mori.protein"}, inplace=True)

In [34]:
df_filtered.columns = df_filtered.columns.str.replace(r'\.protein', '', regex=True)


In [35]:
print(df_filtered)

       Orthogroup                                        Actias_luna  \
0       OG0000000  Alun000947.1, Alun003090.1, Alun004358.1, Alun...   
1       OG0000001  Alun001508.1, Alun003656.1, Alun006632.1, Alun...   
2       OG0000002  Alun000569.1, Alun001833.1, Alun002126.1, Alun...   
3       OG0000003  Alun000859.1, Alun000860.1, Alun022521.1, Alun...   
4       OG0000004  Alun000219.1, Alun001236.1, Alun002701.1, Alun...   
...           ...                                                ...   
280541  OG0280541                                                NaN   
280542  OG0280542                                                NaN   
280543  OG0280543                                                NaN   
280544  OG0280544                                                NaN   
280545  OG0280545                                                NaN   

                                        Adoxophyes_honmai  \
0       Ahon000149.1, Ahon000331.1, Ahon000385.1, Ahon...   
1       Ahon0

In [36]:
gene_columns = df_filtered.columns


In [37]:
print(gene_columns)

Index(['Orthogroup', 'Actias_luna', 'Adoxophyes_honmai', 'Agrotis_ipsilon',
       'Antheraea_pernyi', 'Antheraea_yamamai', 'Arctia_plantaginis',
       'Aricia_agestis', 'Autographa_gamma', 'Bicyclus_anynana',
       'Bombyx_mandarina', 'Busseola_fusca', 'Carposina_sasakii',
       'Chilo_suppressalis', 'Chrysodeixis_includens',
       'Conogethes_punctiferalis', 'Conopomorpha_cramerella',
       'Cydia_pomonella', 'Danaus_plexippus', 'Dendrolimus_punctatus',
       'Diatraea_saccharalis', 'Dione_vanillae', 'Drepana_arcuata',
       'Ectropis_grisescens', 'Galleria_mellonella', 'Grapholita_molesta',
       'Heliconius_cydno', 'Heliconius_erato', 'Heliconius_himera',
       'Heliconius_melpomene', 'Heliconius_numata', 'Helicoverpa_armigera',
       'Helicoverpa_zea', 'Heliothis_subflexa', 'Heliothis_virescens',
       'Heortia_vitessoides', 'Hyphantria_cunea', 'Junonia_coenia',
       'Lerema_accius', 'Mamestra_configurata', 'Manduca_sexta',
       'Melitaea_cinxia', 'Noctua_pronuba', 

In [38]:
def count_gene_ids(cell):
    """
    统计单元格中基因ID的数量：
      - 对于空值返回 0
      - 按','拆分后，去除前后空格和空字符串，再统计数量
    """
    if pd.isnull(cell):
        return 0
    # 将 cell 转为字符串（防止非字符串类型），按','拆分
    # 使用 strip() 去除可能的空白字符，并过滤掉空串
    ids = [gene.strip() for gene in str(cell).split(',') if gene.strip()]
    return len(ids)

In [46]:
counts_df = df_filtered[gene_columns].map(count_gene_ids)


In [None]:
------

In [47]:
bombyx_column = "Bombyx_mori"
if bombyx_column in gene_columns:
    bombyx_mask = (counts_df[bombyx_column] == 1)
    df_filtered = df_filtered[bombyx_mask]
    counts_df = counts_df[bombyx_mask]


In [53]:
# 然后应用75%单基因的规则
single_gene_mask = counts_df == 1
percent_single = single_gene_mask.sum(axis=1) / len(gene_columns)
valid_rows = percent_single >= 0.90

In [54]:
# 最终过滤
df_final = df_filtered[valid_rows]

In [55]:
print(df_final)

      Orthogroup                 Actias_luna Adoxophyes_honmai  \
969    OG0000969  Alun004699.1, Alun088989.1      Ahon007598.1   
1150   OG0001150                Alun069997.1      Ahon009842.1   
2044   OG0002044  Alun012125.1, Alun089373.1      Ahon015653.1   
2380   OG0002380  Alun019226.1, Alun091529.1      Ahon014989.1   
2909   OG0002909  Alun079320.1, Alun088638.1      Ahon011402.1   
...          ...                         ...               ...   
11743  OG0011743                Alun043495.1      Ahon017398.1   
11745  OG0011745                Alun059588.1      Ahon019719.1   
11832  OG0011832                Alun009706.1      Ahon005736.1   
11944  OG0011944                         NaN      Ahon005731.1   
12123  OG0012123                Alun020316.1      Ahon005059.1   

                  Agrotis_ipsilon Antheraea_pernyi Antheraea_yamamai  \
969                  Aips005234.1     Aper008754.1      Ayam000134.1   
1150                 Aips023247.1     Aper015890.1      Ayam000

In [57]:
# 输出统计信息
print(f"原始行数: {len(df)}")
print(f"过滤后的行数: {len(df_filtered)}")
print(f"最终结果(家蚕列有值且为单基因，其他至少75%为单基因): {len(df_final)}")
print(f"所选物种列: {gene_columns.tolist()}")

原始行数: 280546
过滤后的行数: 3822
最终结果(家蚕列有值且为单基因，其他至少75%为单基因): 132
所选物种列: ['Orthogroup', 'Actias_luna', 'Adoxophyes_honmai', 'Agrotis_ipsilon', 'Antheraea_pernyi', 'Antheraea_yamamai', 'Arctia_plantaginis', 'Aricia_agestis', 'Autographa_gamma', 'Bicyclus_anynana', 'Bombyx_mandarina', 'Busseola_fusca', 'Carposina_sasakii', 'Chilo_suppressalis', 'Chrysodeixis_includens', 'Conogethes_punctiferalis', 'Conopomorpha_cramerella', 'Cydia_pomonella', 'Danaus_plexippus', 'Dendrolimus_punctatus', 'Diatraea_saccharalis', 'Dione_vanillae', 'Drepana_arcuata', 'Ectropis_grisescens', 'Galleria_mellonella', 'Grapholita_molesta', 'Heliconius_cydno', 'Heliconius_erato', 'Heliconius_himera', 'Heliconius_melpomene', 'Heliconius_numata', 'Helicoverpa_armigera', 'Helicoverpa_zea', 'Heliothis_subflexa', 'Heliothis_virescens', 'Heortia_vitessoides', 'Hyphantria_cunea', 'Junonia_coenia', 'Lerema_accius', 'Mamestra_configurata', 'Manduca_sexta', 'Melitaea_cinxia', 'Noctua_pronuba', 'Nymphalis_urticae', 'Nymphula_nitidu

In [58]:
df_final.to_csv('Le_bombyx_09.txt', sep='\t', index=False)


In [None]:
-----

In [40]:
----------------

SyntaxError: invalid syntax (2069451464.py, line 1)

In [41]:
single_gene_mask = df_filtered[gene_columns].applymap(lambda x: count_gene_ids(x) <= 1)
all_single_genes = single_gene_mask.all(axis=1)
df_single = df_filtered[all_single_genes]

  single_gene_mask = df_filtered[gene_columns].applymap(lambda x: count_gene_ids(x) <= 1)


In [42]:
bombyx_column = "Bombyx_mori"
df_final = df_single[~df_single[bombyx_column].isna() & (df_single[bombyx_column] != "")]

In [43]:
print(df_final)

       Orthogroup   Actias_luna Adoxophyes_honmai Agrotis_ipsilon  \
3034    OG0003034  Alun081947.1      Ahon015919.1    Aips001593.1   
3265    OG0003265           NaN      Ahon013721.1    Aips005842.1   
5582    OG0005582  Alun072887.1      Ahon013839.1    Aips012359.1   
5941    OG0005941  Alun002908.1      Ahon003168.1    Aips002363.1   
5969    OG0005969  Alun066504.1      Ahon005195.1    Aips000997.1   
...           ...           ...               ...             ...   
200452  OG0200452           NaN               NaN             NaN   
200754  OG0200754           NaN               NaN             NaN   
200819  OG0200819           NaN               NaN             NaN   
201077  OG0201077           NaN               NaN             NaN   
201087  OG0201087           NaN               NaN             NaN   

       Antheraea_pernyi Antheraea_yamamai           Arctia_plantaginis  \
3034       Aper014643.1      Ayam005807.1  unassigned_transcript_13695   
3265                NaN

In [56]:
# 保存结果
df_final.to_csv('Lepidoptera_bombyx_single.txt', sep='\t', index=False)


In [44]:
# 输出统计信息
print(f"原始行数: {len(df)}")
print(f"过滤后的行数: {len(df_filtered)}")
print(f"只有单个基因ID的行数: {len(df_single)}")
print(f"最终结果(家蚕列有值且都是单个基因): {len(df_final)}")

原始行数: 280546
过滤后的行数: 280546
只有单个基因ID的行数: 234238
最终结果(家蚕列有值且都是单个基因): 204


In [None]:
-----------------

In [16]:
print(counts_df)

        Orthogroup  Acromyrmex_echinatior  Actias_luna  Acyrthosiphon_pisum  \
0                1                     42           84                  333   
1                1                      6           28                   26   
2                1                     10           72                   81   
3                1                      3           18                   74   
4                1                     21           80                   30   
...            ...                    ...          ...                  ...   
280541           1                      0            0                    0   
280542           1                      0            0                    0   
280543           1                      0            0                    0   
280544           1                      0            0                    0   
280545           1                      0            0                    0   

        Adoxophyes_honmai  Aedes_aegypti  Aedes_alb

In [17]:
valid_rows = counts_df.apply(lambda row: (row == 1).sum() / len(row) >= 0.75, axis=1)


In [18]:
print(valid_rows)

0         False
1         False
2         False
3         False
4         False
          ...  
280541    False
280542    False
280543    False
280544    False
280545    False
Length: 280546, dtype: bool


In [19]:
df_valid = df_filtered[valid_rows]


In [20]:
print(df_valid)

     Orthogroup       Acromyrmex_echinatior  \
2533  OG0002533                Aech001627.1   
2783  OG0002783                Aech007536.1   
2874  OG0002874                Aech001846.1   
2877  OG0002877                Aech006221.1   
2896  OG0002896                Aech010570.1   
...         ...                         ...   
6806  OG0006806                Aech001611.1   
6809  OG0006809                Aech006244.1   
6838  OG0006838                Aech008955.1   
6843  OG0006843  Aech008862.1, Aech008907.1   
6881  OG0006881                         NaN   

                                            Actias_luna  \
2533  Alun006695.1, Alun013565.1, Alun021867.1, Alun...   
2783  Alun000131.1, Alun000132.1, Alun004792.1, Alun...   
2874           Alun023387.1, Alun038229.1, Alun090011.1   
2877  Alun000054.1, Alun000309.1, Alun004937.1, Alun...   
2896                                       Alun078766.1   
...                                                 ...   
6806                  

In [153]:
# 输出到文件（同样假设使用制表符分隔，且不保留索引）
df_valid.to_csv('apis_and_bombyx.txt', sep='\t', index=False)