### Sorting and observing metadata

In [16]:
%config Completer.use_jedi = False
import pandas as pd

In [17]:
# Load the tab-separated species / clade / colour-code metadata table.
metadata_path = './data/343taxa_species-name_clade-name_color-code.txt'
species_df = pd.read_csv(metadata_path, sep='\t')

# Show which columns are available before picking the ones needed below.
print(species_df.columns)
species_df

Index(['ID', 'original_genome_id', 'old_species_names', 'species_names_fig2',
       'hex', 'id_table S1', 'Species name', 'Major clade', 'clade_order',
       'Family', 'Genus', 'CUG usage'],
      dtype='object')


Unnamed: 0,ID,original_genome_id,old_species_names,species_names_fig2,hex,id_table S1,Species name,Major clade,clade_order,Family,Genus,CUG usage
0,1,saprochaete_clavata,Saprochaete_clavata,Saprochaete_clavata,#FF2800,22,Saprochaete clavata,Dipodascaceae/Trichomonascaceae,3,Dipodascaceae,Saprochaete,Leu
1,2,yHMPu5000034974_deakozyma_indianensis_160519,yHMPu5000034974_Deakozyma_indianensis,Deakozyma_indianensis,#FF2800,39,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae,3,Saccharomycetales incertae sedis,Deakozyma,Leu
2,3,clavispora_lusitaniae,Clavispora_lusitaniae,Clavispora_lusitaniae,#FFd200,135,Clavispora lusitaniae,CUG-Ser1,8,Metschnikowiaceae,Clavispora,Ser
3,4,yHMPu5000034999_cephaloascus_fragrans_160519,yHMPu5000034999_Cephaloascus_fragrans,Cephaloascus_fragrans,#FFd200,204,Cephaloascus fragrans,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
4,5,yHMPu5000034998_cephaloascus_albidus_160519,yHMPu5000034998_Cephaloascus_albidus,Cephaloascus_albidus,#FFd200,205,Cephaloascus albidus,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
...,...,...,...,...,...,...,...,...,...,...,...,...
338,339,Saitoella_complicata,Saitoella_complicata,Saitoella_complicata,#050f07,339,Saitoella complicata,outgroup,0,outgroup,outgroup,Leu
339,340,sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,#050f07,340,Sclerotinia sclerotiorum,outgroup,0,outgroup,outgroup,Leu
340,341,stagonospora_nodorum,Stagonospora_nodorum,Stagonospora_nodorum,#050f07,341,Stagonospora nodorum,outgroup,0,outgroup,outgroup,Leu
341,342,xylona_heveae,Xylona_heveae,Xylona_heveae,#050f07,342,Xylona heveae,outgroup,0,outgroup,outgroup,Leu


In [18]:
# Keep only the identifier, species name and clade columns.
columns_to_keep = ['ID', 'Species name', 'Major clade']
filtered_df = species_df[columns_to_keep]
display(filtered_df.head())

Unnamed: 0,ID,Species name,Major clade
0,1,Saprochaete clavata,Dipodascaceae/Trichomonascaceae
1,2,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae
2,3,Clavispora lusitaniae,CUG-Ser1
3,4,Cephaloascus fragrans,CUG-Ser1
4,5,Cephaloascus albidus,CUG-Ser1


In [19]:
# Restrict the table to the major clades listed here; rows belonging to any
# other clade (such as 'outgroup') are dropped.
selected_clades = [
    'Pichiaceae',
    'Dipodascaceae/Trichomonascaceae',
    'Trigonopsidaceae',
    'CUG-Ser1',
    'CUG-Ser2',
    'Saccharomycodaceae',
    'Lipomycetaceae',
    'Phaffomycetaceae',
    'Saccharomycetaceae',
    'Sporopachydermia clade',
    'Alloascoideaceae',
    'CUG-Ala',
]
in_selected_clade = filtered_df['Major clade'].isin(selected_clades)
filtered_df = filtered_df[in_selected_clade]

# Sanity check: no 'outgroup' rows should remain, so this displays an empty frame.
filtered_df.loc[filtered_df['Major clade'] == 'outgroup']

Unnamed: 0,ID,Species name,Major clade


In [20]:
# Sort rows by clade name so that species of the same clade sit next to each other.
species_df_sorted = filtered_df.sort_values(by='Major clade')
species_df_sorted.head(n=20)

Unnamed: 0,ID,Species name,Major clade
249,250,Alloascoidea hylecoeti,Alloascoideaceae
110,111,Nakazawaea holstii,CUG-Ala
109,110,Nakazawaea peltata,CUG-Ala
126,127,Pachysolen tannophilus,CUG-Ala
32,33,Peterozyma xylosa,CUG-Ala
33,34,Peterozyma toletana,CUG-Ala
67,68,Metschnikowia hawaiiensis,CUG-Ser1
73,74,Metschnikowia arizonensis,CUG-Ser1
72,73,Metschnikowia ipomoeae,CUG-Ser1
71,72,Spathaspora gorwiae,CUG-Ser1


In [21]:
from Bio import SeqIO

# Read every record of the CLC1 FASTA file into a list so it can be counted and reused.
CLC_fasta_path = './data/CLC1.fasta'
CLC_records = list(SeqIO.parse(CLC_fasta_path, format='fasta'))

In [22]:
# Not every species has a light-chain sequence available, so the FASTA file
# holds fewer records than the metadata table has rows.
print(
    f'Number of species in original txt file DataFrame: {len(species_df)}',
    f'Number of species in filtered and sorted DataFrame: {len(species_df_sorted)}',
    f'Number of species in CLC FASTA file: {len(CLC_records)}',
    sep='\n',
)

Number of species in original txt file DataFrame: 343
Number of species in filtered and sorted DataFrame: 332
Number of species in CLC FASTA file: 311


In [23]:
# Replace the pre-sort row labels with a fresh 0..n-1 index. Rebinding the name
# (rather than using inplace=True) avoids mutating the frame object that the
# earlier cells already displayed, and keeps this cell safe to re-run.
species_df_sorted = species_df_sorted.reset_index(drop=True)
species_df_sorted.head()

Unnamed: 0,ID,Species name,Major clade
0,250,Alloascoidea hylecoeti,Alloascoideaceae
1,111,Nakazawaea holstii,CUG-Ala
2,110,Nakazawaea peltata,CUG-Ala
3,127,Pachysolen tannophilus,CUG-Ala
4,33,Peterozyma xylosa,CUG-Ala


### Grouping the CLC FASTA by clade

In [73]:
# Pair each species (in clade-sorted order) with its record from the cleaned CLC FASTA.
cleaned_CLC_records = list(SeqIO.parse('./data/CLC1-cleaned.fasta', 'fasta'))

species_order = species_df_sorted['Species name'].tolist()
clade_order = species_df_sorted['Major clade'].tolist()

# Turn "Genus species" into "genus_species" so it can be searched for inside record IDs.
formatted_species_order = [name.lower().replace(' ', '_') for name in species_order]

# Lower-case every record ID once, up front. The species names above are
# lower-cased, so comparing them against the raw ID would silently miss any
# record whose ID contains capital letters.
lowercased_ids_and_records = [(record.id.lower(), record) for record in cleaned_CLC_records]

ordered_CLCs = {}        # species_name -> (record, clade), inserted in clade-sorted order
unmatched_species = []   # formatted species names with no record in the FASTA

for species_name, clade in zip(formatted_species_order, clade_order):
    # Substring match against the record ID; the first record that matches wins.
    record = next(
        (candidate for record_id, candidate in lowercased_ids_and_records if species_name in record_id),
        None,
    )
    if record is None:
        print('Species not matched:', species_name)
        unmatched_species.append(species_name)
    else:
        ordered_CLCs[species_name] = record, clade

print('Number of species matched with CLC FASTA:', len(ordered_CLCs))
print('Number of species not found in FASTA (species to be searched on BLAST or whose names are outdated or misspelled):', len(unmatched_species))

Species not matched: spathaspora_gorwiae
Species not matched: candida_sojae
Species not matched: debaryomyces_prosopidis
Species not matched: debaryomyces_subglobosus
Species not matched: candida_corydali
Species not matched: teunomyces_gatunensis
Species not matched: candida_gorgasii
Species not matched: metschnikowia_shivogae
Species not matched: debaryomyces_nepalensis
Species not matched: danielozyma_ontarioensis
Species not matched: candida_gotoi
Species not matched: yamadazyma_philogaea
Species not matched: wickerhamia_fluorescens
Species not matched: spathaspora_arborariae
Species not matched: starmerella_apicola
Species not matched: ogataea_kodamae
Species not matched: saturnispora_mendoncae
Species not matched: torulaspora_microellipsoides
Species not matched: torulaspora_pretoriensis
Species not matched: zygosaccharomyces_bisporus
Species not matched: kazachstania_transvaalensis
Species not matched: hanseniaspora_clermontiae
Number of species matched with CLC FASTA: 310
Numbe