### Sorting and observing metadata

In [22]:
%config Completer.use_jedi = False
import pandas as pd

In [23]:
# Load the tab-separated species / clade / colour-code table, list its
# column names, then render the frame itself as the cell output.
metadata_path = './data/343taxa_species-name_clade-name_color-code.txt'
species_df = pd.read_csv(metadata_path, sep='\t')
print(species_df.columns)
species_df

Index(['ID', 'original_genome_id', 'old_species_names', 'species_names_fig2',
       'hex', 'id_table S1', 'Species name', 'Major clade', 'clade_order',
       'Family', 'Genus', 'CUG usage'],
      dtype='object')


Unnamed: 0,ID,original_genome_id,old_species_names,species_names_fig2,hex,id_table S1,Species name,Major clade,clade_order,Family,Genus,CUG usage
0,1,saprochaete_clavata,Saprochaete_clavata,Saprochaete_clavata,#FF2800,22,Saprochaete clavata,Dipodascaceae/Trichomonascaceae,3,Dipodascaceae,Saprochaete,Leu
1,2,yHMPu5000034974_deakozyma_indianensis_160519,yHMPu5000034974_Deakozyma_indianensis,Deakozyma_indianensis,#FF2800,39,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae,3,Saccharomycetales incertae sedis,Deakozyma,Leu
2,3,clavispora_lusitaniae,Clavispora_lusitaniae,Clavispora_lusitaniae,#FFd200,135,Clavispora lusitaniae,CUG-Ser1,8,Metschnikowiaceae,Clavispora,Ser
3,4,yHMPu5000034999_cephaloascus_fragrans_160519,yHMPu5000034999_Cephaloascus_fragrans,Cephaloascus_fragrans,#FFd200,204,Cephaloascus fragrans,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
4,5,yHMPu5000034998_cephaloascus_albidus_160519,yHMPu5000034998_Cephaloascus_albidus,Cephaloascus_albidus,#FFd200,205,Cephaloascus albidus,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
...,...,...,...,...,...,...,...,...,...,...,...,...
338,339,Saitoella_complicata,Saitoella_complicata,Saitoella_complicata,#050f07,339,Saitoella complicata,outgroup,0,outgroup,outgroup,Leu
339,340,sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,#050f07,340,Sclerotinia sclerotiorum,outgroup,0,outgroup,outgroup,Leu
340,341,stagonospora_nodorum,Stagonospora_nodorum,Stagonospora_nodorum,#050f07,341,Stagonospora nodorum,outgroup,0,outgroup,outgroup,Leu
341,342,xylona_heveae,Xylona_heveae,Xylona_heveae,#050f07,342,Xylona heveae,outgroup,0,outgroup,outgroup,Leu


In [24]:
# Keep only the identifier, species name and clade columns, then preview the first rows.
columns_of_interest = ['ID', 'Species name', 'Major clade']
filtered_df = species_df[columns_of_interest]
display(filtered_df.head())

Unnamed: 0,ID,Species name,Major clade
0,1,Saprochaete clavata,Dipodascaceae/Trichomonascaceae
1,2,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae
2,3,Clavispora lusitaniae,CUG-Ser1
3,4,Cephaloascus fragrans,CUG-Ser1
4,5,Cephaloascus albidus,CUG-Ser1


In [25]:
# Clades retained for the analysis; rows whose 'Major clade' is not listed here are dropped.
clades_to_keep = [
    'Pichiaceae',
    'Dipodascaceae/Trichomonascaceae',
    'Trigonopsidaceae',
    'CUG-Ser1',
    'CUG-Ser2',
    'Saccharomycodaceae',
    'Lipomycetaceae',
    'Phaffomycetaceae',
    'Saccharomycetaceae',
    'Sporopachydermia clade',
    'Alloascoideaceae',
    'CUG-Ala',
]
in_kept_clade = filtered_df['Major clade'].isin(clades_to_keep)
filtered_df = filtered_df[in_kept_clade]

# Sanity check: 'outgroup' is not in the list above, so this should render an empty frame.
filtered_df[filtered_df['Major clade'] == 'outgroup']

Unnamed: 0,ID,Species name,Major clade


In [26]:
# Order the rows by clade name so that members of the same clade are contiguous.
# NOTE(review): no `kind` is passed, so the order of rows *within* a clade is
# whatever the default sort algorithm produces (the preview below is not in
# original row order) -- confirm that downstream steps do not rely on it.
species_df_sorted = filtered_df.sort_values(by='Major clade')
species_df_sorted.head(20)

Unnamed: 0,ID,Species name,Major clade
249,250,Alloascoidea hylecoeti,Alloascoideaceae
110,111,Nakazawaea holstii,CUG-Ala
109,110,Nakazawaea peltata,CUG-Ala
126,127,Pachysolen tannophilus,CUG-Ala
32,33,Peterozyma xylosa,CUG-Ala
33,34,Peterozyma toletana,CUG-Ala
67,68,Metschnikowia hawaiiensis,CUG-Ser1
73,74,Metschnikowia arizonensis,CUG-Ser1
72,73,Metschnikowia ipomoeae,CUG-Ser1
71,72,Spathaspora gorwiae,CUG-Ser1


In [27]:
from Bio import SeqIO

# Read every record of the raw CLC FASTA file into a list.
with open('./data/CLC1.fasta') as clc_handle:
    CLC_records = list(SeqIO.parse(clc_handle, 'fasta'))

In [28]:
# Compare the sizes of the three collections: some species have no light
# chain in the FASTA file, so the counts are not expected to agree.
n_original = len(species_df)
n_filtered_sorted = len(species_df_sorted)
n_fasta_records = len(CLC_records)

print(f'Number of species in original txt file DataFrame: {n_original}')
print(f'Number of species in filtered and sorted DataFrame: {n_filtered_sorted}')
print(f'Number of species in CLC FASTA file: {n_fasta_records}')

Number of species in original txt file DataFrame: 343
Number of species in filtered and sorted DataFrame: 332
Number of species in CLC FASTA file: 311


In [29]:
# Renumber the rows 0..n-1 in their sorted order, discarding the old index labels.
species_df_sorted = species_df_sorted.reset_index(drop=True)
species_df_sorted.head()

Unnamed: 0,ID,Species name,Major clade
0,250,Alloascoidea hylecoeti,Alloascoideaceae
1,111,Nakazawaea holstii,CUG-Ala
2,110,Nakazawaea peltata,CUG-Ala
3,127,Pachysolen tannophilus,CUG-Ala
4,33,Peterozyma xylosa,CUG-Ala


### Grouping the CLC FASTA by clade

In [30]:
# Pair every species (in clade-sorted order) with its record from the cleaned
# CLC FASTA file.
#
# Produces:
#   formatted_species_order -- species names lower-cased with spaces -> underscores
#   ordered_CLCs            -- {formatted species name: (SeqRecord, clade)}, in sorted order
#   unmatched_species       -- formatted names for which no record was found
cleaned_CLC_records = list(SeqIO.parse('./data/CLC1-cleaned.fasta', 'fasta'))

species_order = species_df_sorted['Species name'].tolist()
formatted_species_order = [name.lower().replace(' ', '_') for name in species_order]
clade_order = species_df_sorted['Major clade'].tolist()

ordered_CLCs = {}
unmatched_species = []
# record.id -> first species that claimed it; used only to detect reuse.
record_owner = {}
shared_records = []

for species_name, clade in zip(formatted_species_order, clade_order):
    # A record matches when the formatted species name is a substring of its
    # id; the first matching record in file order wins.
    record = next(
        (candidate for candidate in cleaned_CLC_records if species_name in candidate.id),
        None,
    )
    if record is None:
        print('Species not matched:', species_name)
        unmatched_species.append(species_name)
        continue

    ordered_CLCs[species_name] = record, clade

    # Substring matching can hand the same record to several species (e.g.
    # when one name is contained in another id). Keep the match, as before,
    # but remember it so the ambiguity is visible instead of silent.
    first_owner = record_owner.setdefault(record.id, species_name)
    if first_owner != species_name:
        shared_records.append((record.id, first_owner, species_name))

print('Number of species matched with CLC FASTA:', len(ordered_CLCs))
print('Number of species to be searched on BLAST, whose names are outdated or misspelled, or lack a light chain in their genome:', len(unmatched_species))
# These last 4 species don't have sequenced light chains in their known genome, so I'm just going to ignore them.

if shared_records:
    print('WARNING: the same FASTA record was matched to more than one species:')
    for record_id, first_owner, later_species in shared_records:
        print(f'  {record_id}: {first_owner} and {later_species}')

Species not matched: candida_sojae
Species not matched: saturnispora_mendoncae
Species not matched: kazachstania_transvaalensis
Species not matched: hanseniaspora_clermontiae
Number of species matched with CLC FASTA: 328
Number of species to be searched on BLAST, whose names are outdated or misspelled, or lack a light chain in their genome: 4


## Confirming the order of the dictionary

In [31]:
# Drop the species that could not be paired with a FASTA record so that the
# list lines up with the keys of `ordered_CLCs`. The names come from
# `unmatched_species` (filled by the matching cell) rather than a hand-typed
# list, so this stays correct if the input files change.
unmatched_lookup = set(unmatched_species)
formatted_species_order = [
    species for species in formatted_species_order if species not in unmatched_lookup
]

print(len(formatted_species_order))

328


In [32]:
# Verify that the dictionary holds exactly the expected species and, if so,
# that its insertion order equals the order of the species list.
dict_keys = list(ordered_CLCs)

if set(formatted_species_order) != set(dict_keys):
    print('Elements differ between the list and the dictionary keys.')
else:
    print('Same elements')
    same_order = formatted_species_order == dict_keys
    print('Elements are in the same order' if same_order else 'Order differs')

Same elements
Elements are in the same order


In [33]:
# Flatten the matched records into a DataFrame with one row per species.
rows = [
    {
        'species': species,
        'sequence': str(seq_record.seq),
        'id': seq_record.id,
        'description': seq_record.description,
    }
    for species, (seq_record, _clade) in ordered_CLCs.items()
]

CLC_dict_df = pd.DataFrame(rows)
# A bare `CLC_dict_df.head()` in the middle of a cell is never rendered (only
# the last expression is), so show the preview explicitly.
display(CLC_dict_df.head())

# Row order must equal the species order established earlier.
(CLC_dict_df['species'].tolist() == formatted_species_order)

True

## Convert the ordered dictionary into a FASTA file for MSA

In [34]:
# Prepend clade name to each SeqRecord object's description to more easily split the aligned file into multiple FASTAs.
# The records are modified in place, so the prefix is only added when it is
# not already there: re-running this cell must not stack "clade | clade | ...".
for species, (seq_record, clade) in ordered_CLCs.items():
    # '/' and ' ' are replaced so the label can later be used in a file name.
    clade = clade.replace('/', '_').replace(' ', '-')
    clade_prefix = f'{clade} | '
    if not seq_record.description.startswith(clade_prefix):
        seq_record.description = f'{clade_prefix}{seq_record.description}'

In [35]:
# Write the matched records, in clade-sorted order, to a single FASTA file.
seq_records = [seq_record for (seq_record, clade) in ordered_CLCs.values()]

# NOTE(review): mode 'w' truncates the file, so re-running this cell discards
# anything added to it by hand afterwards (see the NOTE below about the human
# CLC A) -- confirm before re-executing.
with open('./data/ordered-CLCs.fasta', 'w') as handle:
    # SeqIO.write returns the number of records it actually wrote.
    n_written = SeqIO.write(seq_records, handle, 'fasta')

# NOTE: ordered-CLCs.fasta has the human CLC A manually added as well
# The message names the file exactly as written above (hyphen, not underscore).
print(f'Wrote {n_written} sequences to ordered-CLCs.fasta')

Wrote 328 sequences to ordered_CLCs.fasta


In [59]:
# Lengths of every sequence currently stored in ordered-CLCs.fasta.
# They are read back from the file instead of from the in-memory `seq_records`
# list because one of the yeast sequences was tweaked in the file later on.
CLC_lengths = list(map(len, SeqIO.parse('./data/ordered-CLCs.fasta', 'fasta')))
print(CLC_lengths)

[218, 221, 222, 221, 227, 215, 216, 213, 213, 213, 200, 218, 217, 213, 213, 213, 213, 212, 213, 212, 213, 214, 211, 211, 218, 213, 213, 213, 213, 217, 203, 221, 219, 217, 213, 217, 217, 207, 193, 192, 212, 213, 214, 210, 189, 210, 204, 213, 213, 210, 213, 220, 210, 208, 203, 218, 222, 221, 212, 203, 214, 214, 203, 214, 221, 214, 222, 229, 220, 222, 225, 225, 207, 227, 228, 209, 204, 217, 220, 225, 218, 210, 207, 214, 213, 212, 195, 216, 204, 215, 212, 217, 233, 218, 216, 217, 217, 221, 220, 204, 260, 217, 253, 232, 217, 217, 217, 239, 217, 236, 227, 226, 232, 217, 228, 225, 236, 228, 230, 225, 225, 228, 225, 227, 207, 224, 230, 217, 216, 222, 209, 230, 222, 212, 215, 222, 221, 231, 211, 220, 210, 244, 220, 213, 220, 226, 221, 220, 213, 225, 199, 209, 201, 199, 208, 198, 199, 197, 190, 204, 191, 191, 201, 194, 200, 194, 197, 196, 197, 200, 191, 197, 197, 201, 198, 209, 201, 199, 200, 204, 200, 200, 202, 194, 210, 212, 210, 218, 233, 217, 220, 217, 223, 227, 218, 214, 229, 230, 216, 269,

In [60]:
import subprocess

# Load the unaligned sequences; they are handed to MAFFT through `input=` below.
with open('./data/ordered-CLCs.fasta', 'r') as handle:
    ordered_seqs_str = handle.read()

# MAFFT command line; the trailing '-' is passed as the input argument
# (presumably "read from stdin" -- the sequences are supplied via `input=`).
cmd = ['mafft', '--localpair', '--maxiterate', '1000', '--thread', '10', '--quiet', '-']

# Capture stdout (the alignment) and stderr (diagnostics) as text.
process = subprocess.run(cmd, input=ordered_seqs_str, text=True, capture_output=True)

if process.returncode == 0:
    aligned_fasta = process.stdout
    print('Alignment completed successfully')
    with open('./data/CLC1-total-aligned.fasta', 'w') as out_f:
        out_f.write(aligned_fasta)
else:
    # On failure nothing is written; only MAFFT's stderr is reported.
    print('MAFFT error:', process.stderr)

Alignment completed successfully


## Split aligned FASTA into FASTAs by clade

In [61]:
# First take out the human alignment.
# The alignment file is rewritten in place, so blindly dropping the first
# record would delete a yeast sequence every time this cell is re-run (or if
# the human record was never added). Only drop it when the file holds exactly
# one record more than the yeast records collected in `ordered_CLCs`.
aligned_path = './data/CLC1-total-aligned.fasta'
total_aligned_records = list(SeqIO.parse(aligned_path, 'fasta'))

n_yeast_records = len(ordered_CLCs)
n_aligned_records = len(total_aligned_records)

if n_aligned_records == n_yeast_records + 1:
    # The human record is assumed to be the first one, as in the original cell.
    total_aligned_records = total_aligned_records[1:]
    with open(aligned_path, 'w') as output_handle:
        SeqIO.write(total_aligned_records, output_handle, 'fasta')
elif n_aligned_records == n_yeast_records:
    print('Human record already removed; alignment file left unchanged')
else:
    raise ValueError(
        f'Expected {n_yeast_records} or {n_yeast_records + 1} aligned records, '
        f'found {n_aligned_records}; refusing to modify {aligned_path}'
    )

In [62]:
# Split the alignment into one FASTA file per clade.
import os
from collections import defaultdict

clade_output_dir = './data/CLC1-fastas-by-clade'
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(clade_output_dir, exist_ok=True)

clade_groups = defaultdict(list)

with open('./data/CLC1-total-aligned.fasta') as handle:
    for seq_record in SeqIO.parse(handle, 'fasta'):
        # The description is expected to look like "<id> <clade> | <rest>":
        # skip the first token, then take everything before the first '|'.
        _first_token, _space, after_id = seq_record.description.partition(' ')
        clade_name, separator, _rest = after_id.partition('|')
        clade_name = clade_name.strip()
        if not separator or not clade_name:
            # No clade label in the header (e.g. a record that was never
            # prefixed); report it rather than crash or write a junk file.
            print('Skipping record without a clade label:', seq_record.id)
            continue
        clade_groups[clade_name].append(seq_record)

for clade, seqs in clade_groups.items():
    # Make the clade label safe to use inside a file name.
    clade = clade.replace('/', '_').replace(' ', '-')
    with open(f'{clade_output_dir}/{clade}_aligned_CLCs.fasta', 'w') as out_f:
        SeqIO.write(seqs, out_f, 'fasta')