## Sorting and observing metadata

In [1]:
%config Completer.use_jedi = False
import pandas as pd

In [2]:
# Load the tab-separated species metadata table, list its columns, then show the frame.
species_table_path = './data/343taxa_species-name_clade-name_color-code.txt'
species_df = pd.read_csv(species_table_path, sep='\t')
print(species_df.columns)
species_df

Index(['ID', 'original_genome_id', 'old_species_names', 'species_names_fig2',
       'hex', 'id_table S1', 'Species name', 'Major clade', 'clade_order',
       'Family', 'Genus', 'CUG usage'],
      dtype='object')


Unnamed: 0,ID,original_genome_id,old_species_names,species_names_fig2,hex,id_table S1,Species name,Major clade,clade_order,Family,Genus,CUG usage
0,1,saprochaete_clavata,Saprochaete_clavata,Saprochaete_clavata,#FF2800,22,Saprochaete clavata,Dipodascaceae/Trichomonascaceae,3,Dipodascaceae,Saprochaete,Leu
1,2,yHMPu5000034974_deakozyma_indianensis_160519,yHMPu5000034974_Deakozyma_indianensis,Deakozyma_indianensis,#FF2800,39,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae,3,Saccharomycetales incertae sedis,Deakozyma,Leu
2,3,clavispora_lusitaniae,Clavispora_lusitaniae,Clavispora_lusitaniae,#FFd200,135,Clavispora lusitaniae,CUG-Ser1,8,Metschnikowiaceae,Clavispora,Ser
3,4,yHMPu5000034999_cephaloascus_fragrans_160519,yHMPu5000034999_Cephaloascus_fragrans,Cephaloascus_fragrans,#FFd200,204,Cephaloascus fragrans,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
4,5,yHMPu5000034998_cephaloascus_albidus_160519,yHMPu5000034998_Cephaloascus_albidus,Cephaloascus_albidus,#FFd200,205,Cephaloascus albidus,CUG-Ser1,8,Cephaloascaceae,Cephaloascus,Ser
...,...,...,...,...,...,...,...,...,...,...,...,...
338,339,Saitoella_complicata,Saitoella_complicata,Saitoella_complicata,#050f07,339,Saitoella complicata,outgroup,0,outgroup,outgroup,Leu
339,340,sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,#050f07,340,Sclerotinia sclerotiorum,outgroup,0,outgroup,outgroup,Leu
340,341,stagonospora_nodorum,Stagonospora_nodorum,Stagonospora_nodorum,#050f07,341,Stagonospora nodorum,outgroup,0,outgroup,outgroup,Leu
341,342,xylona_heveae,Xylona_heveae,Xylona_heveae,#050f07,342,Xylona heveae,outgroup,0,outgroup,outgroup,Leu


In [3]:
# Keep only the identifier, species-name and clade columns.
columns_of_interest = ['ID', 'Species name', 'Major clade']
filtered_df = species_df[columns_of_interest]
display(filtered_df.head())

Unnamed: 0,ID,Species name,Major clade
0,1,Saprochaete clavata,Dipodascaceae/Trichomonascaceae
1,2,Deakozyma indianensis,Dipodascaceae/Trichomonascaceae
2,3,Clavispora lusitaniae,CUG-Ser1
3,4,Cephaloascus fragrans,CUG-Ser1
4,5,Cephaloascus albidus,CUG-Ser1


In [4]:
# Look up two species by name, ignoring case.
names_to_check = ['cyberlindnera saturnus', 'lipomyces arxii']
lowercase_names = filtered_df['Species name'].str.lower()
filtered_df[lowercase_names.isin(names_to_check)]

Unnamed: 0,ID,Species name,Major clade
203,204,Lipomyces arxii,Lipomycetaceae
240,241,Cyberlindnera saturnus,Phaffomycetaceae


In [5]:
# This shows that the txt species file's rows are not grouped by clade:
# the slice below interleaves different 'Major clade' values.
preview_start, preview_stop = 220, 240
filtered_df.iloc[preview_start:preview_stop]

Unnamed: 0,ID,Species name,Major clade
220,221,Candida vartiovaarae,Phaffomycetaceae
221,222,Candida orba,Phaffomycetaceae
222,223,Candida stellimalicola,Phaffomycetaceae
223,224,Cyberlindnera fabianii,Phaffomycetaceae
224,225,Cyberlindnera jadinii,Phaffomycetaceae
225,226,Cyberlindnera americana,Phaffomycetaceae
226,227,Komagataella populi,Pichiaceae
227,228,Komagataella pseudopastoris,Pichiaceae
228,229,Phaffomyces thermotolerans,Phaffomycetaceae
229,230,Phaffomyces antillensis,Phaffomycetaceae


In [6]:
# Get rid of outgroup species by keeping only rows whose clade is in this allow-list.
ingroup_clades = [
    'Pichiaceae',
    'Dipodascaceae/Trichomonascaceae',
    'Trigonopsidaceae',
    'CUG-Ser1',
    'CUG-Ser2',
    'Saccharomycodaceae',
    'Lipomycetaceae',
    'Phaffomycetaceae',
    'Saccharomycetaceae',
    'Sporopachydermia clade',
    'Alloascoideaceae',
    'CUG-Ala',
]
is_ingroup = filtered_df['Major clade'].isin(ingroup_clades)
filtered_df = filtered_df[is_ingroup]

# Sanity check: this selection should now be empty.
is_outgroup_row = filtered_df['Major clade'] == 'outgroup'
filtered_df[is_outgroup_row]

Unnamed: 0,ID,Species name,Major clade


In [7]:
# Group clades together: sorting on the clade column puts members of a clade next to each other.
clade_column = 'Major clade'
species_df_sorted = filtered_df.sort_values(by=clade_column)
species_df_sorted.head(n=20)

Unnamed: 0,ID,Species name,Major clade
249,250,Alloascoidea hylecoeti,Alloascoideaceae
110,111,Nakazawaea holstii,CUG-Ala
109,110,Nakazawaea peltata,CUG-Ala
126,127,Pachysolen tannophilus,CUG-Ala
32,33,Peterozyma xylosa,CUG-Ala
33,34,Peterozyma toletana,CUG-Ala
67,68,Metschnikowia hawaiiensis,CUG-Ser1
73,74,Metschnikowia arizonensis,CUG-Ser1
72,73,Metschnikowia ipomoeae,CUG-Ser1
71,72,Spathaspora gorwiae,CUG-Ser1


In [8]:
from Bio import SeqIO

# Read every record of the raw CHC1 FASTA file into a list.
with open('./data/CHC1.fasta') as raw_chc_handle:
    CHC_records = list(SeqIO.parse(raw_chc_handle, 'fasta'))

In [9]:
# There are more species sequences in the CHC1.fasta than species in the informational DataFrame
# Update: These extra sequences turned out to be duplicates and fragmented sequences
n_original_species = len(species_df)
n_sorted_species = len(species_df_sorted)
n_raw_chc_records = len(CHC_records)

print(f'Number of species in original txt file DataFrame: {n_original_species}')
print(f'Number of species in filtered and sorted DataFrame: {n_sorted_species}')
print(f'Number of species in unorganized CHC FASTA file: {n_raw_chc_records}')

Number of species in original txt file DataFrame: 343
Number of species in filtered and sorted DataFrame: 332
Number of species in unorganized CHC FASTA file: 345


### Don't run these 2 cells again when rerunning the notebook:

In [30]:
# CHC1.fasta had multiple duplicates, so I created a file consisting only of them and aligned them
from Bio import SeqIO
import subprocess
from io import StringIO
from Bio.SeqRecord import SeqRecord

# Serialise the duplicate records to an in-memory FASTA string for MAFFT's stdin.
duplicates = list(SeqIO.parse('./data/CHC1-duplicates.fasta', 'fasta'))
output = StringIO()
SeqIO.write(duplicates, output, 'fasta')
duplicates_str = output.getvalue()

# '-' makes MAFFT read the sequences from stdin; the alignment comes back on stdout.
mafft_result = subprocess.run(
    ['mafft', '--localpair', '--maxiterate', '1000', '--ep', '0', '--quiet', '-'],
    input=duplicates_str,
    capture_output=True,
    encoding='utf-8',
)

# Fail loudly: previously a failed MAFFT run was ignored, which left an empty
# alignment that the following cell would then write to disk.
if mafft_result.returncode != 0:
    raise RuntimeError(
        f'MAFFT exited with status {mafft_result.returncode}: {mafft_result.stderr}'
    )

aligned_duplicates = list(SeqIO.parse(StringIO(mafft_result.stdout), 'fasta'))

In [31]:
# Save the aligned duplicate sequences as a FASTA file.
aligned_duplicates_path = './data/aligned-CHC1-duplicates.fasta'
with open(aligned_duplicates_path, 'w') as aligned_out:
    SeqIO.write(aligned_duplicates, aligned_out, 'fasta')

### Grouping the cleaned CHC FASTA file's sequences by clade

In [10]:
cleaned_CHC_records = list(SeqIO.parse('./data/CHC1-cleaned.fasta', 'fasta'))

# Printed in order: rows in the full metadata table, rows in the filtered and
# sorted table, records in the cleaned FASTA.
# The cleaned heavy chain dataset now has as many species' sequences as the informational dataset has species.
for collection_size in (len(species_df), len(species_df_sorted), len(cleaned_CHC_records)):
    print(collection_size)

343
332
332


In [11]:
# Replace the pre-sort row labels with a fresh 0..n-1 index (old labels are discarded).
species_df_sorted = species_df_sorted.reset_index(drop=True)
species_df_sorted.head()

Unnamed: 0,ID,Species name,Major clade
0,250,Alloascoidea hylecoeti,Alloascoideaceae
1,111,Nakazawaea holstii,CUG-Ala
2,110,Nakazawaea peltata,CUG-Ala
3,127,Pachysolen tannophilus,CUG-Ala
4,33,Peterozyma xylosa,CUG-Ala


In [12]:
# Inspect the id of the second FASTA record to see how identifiers are formatted.
second_record = cleaned_CHC_records[1]
second_record.id

'yHMPu5000034754_lipomyces_arxii_160519_Seq_4699'

In [13]:
# Build `ordered_CHCs`: formatted species name -> (SeqRecord, clade), inserted in
# the clade-sorted order of `species_df_sorted`.
species_order = species_df_sorted['Species name'].tolist()

# FASTA ids contain the species name in lowercase with underscores.
formatted_species_order = [
    species_name.lower().replace(' ', '_') for species_name in species_order
]

clade_order = species_df_sorted['Major clade'].tolist()
ordered_CHCs = {}
unmatched_species = []
ambiguous_species = {}  # species name -> ids of every record whose id contains it
record_claims = {}      # record id -> species names that selected that record

for species_name, clade in zip(formatted_species_order, clade_order):
    candidates = [
        record for record in cleaned_CHC_records if species_name in record.id
    ]
    if not candidates:
        print('Species not matched:', species_name)
        unmatched_species.append(species_name)
        continue

    # The first matching record in FASTA order is used.
    chosen_record = candidates[0]
    ordered_CHCs[species_name] = chosen_record, clade
    record_claims.setdefault(chosen_record.id, []).append(species_name)

    # Substring matching can hit several records (e.g. one species name being a
    # prefix of another); remember these so they are not silently mis-assigned.
    if len(candidates) > 1:
        ambiguous_species[species_name] = [record.id for record in candidates]

for species_name, candidate_ids in ambiguous_species.items():
    print('Ambiguous match (first record used):', species_name, '->', candidate_ids)

for record_id, claiming_species in record_claims.items():
    if len(claiming_species) > 1:
        print('Record selected for several species:', record_id, '<-', claiming_species)

print('Number of species matched with CHC FASTA:', len(ordered_CHCs))
print('Number of species not found in FASTA (species to be searched on BLAST or whose names are outdated):', len(unmatched_species))

Number of species matched with CHC FASTA: 332
Number of species not found in FASTA (species to be searched on BLAST or whose names are outdated): 0


## Fixing outdated naming issues in CHC fasta file

The cells in this section may show confusing outputs or contain somewhat redundant code. They were written to track down two problems: outdated species names in the CHC FASTA file, and one sequence in the CHC FASTA file whose name seemed to be completely absent from the '343taxa' txt dataset. The code gave me the information I needed while I was fixing those problems. There was no point in creating a completely new FASTA file each time I updated a species name, so the outputs shown here are not the ones I originally saw. The current outputs may look unimportant or confusing, but they show that the issues have been fixed.

In [14]:
# Print the sizes of the species list, the cleaned FASTA record list and the
# species -> record dictionary, in that order.
for collection in (formatted_species_order, cleaned_CHC_records, ordered_CHCs):
    print(len(collection))

332
332
332


In [15]:
# Show the rows of the full metadata table that are labelled as outgroup.
is_outgroup = species_df['Major clade'] == 'outgroup'
species_df[is_outgroup]

Unnamed: 0,ID,original_genome_id,old_species_names,species_names_fig2,hex,id_table S1,Species name,Major clade,clade_order,Family,Genus,CUG usage
332,333,arthrobotrys_oligospora,Arthrobotrys_oligospora,Arthrobotrys_oligospora,#050f07,333,Arthrobotrys oligospora,outgroup,0,outgroup,outgroup,Leu
333,334,aspergillus_nidulans,Aspergillus_nidulans,Aspergillus_nidulans,#050f07,334,Aspergillus nidulans,outgroup,0,outgroup,outgroup,Leu
334,335,botrytis_cinerea,Botrytis_cinerea,Botrytis_cinerea,#050f07,335,Botrytis cinerea,outgroup,0,outgroup,outgroup,Leu
335,336,Coccidioides_immitis,Coccidioides_immitis,Coccidioides_immitis,#050f07,336,Coccidioides immitis,outgroup,0,outgroup,outgroup,Leu
336,337,fusarium_graminearum,Fusarium_graminearum,Fusarium_graminearum,#050f07,337,Fusarium graminearum,outgroup,0,outgroup,outgroup,Leu
337,338,neurospora_crassa,Neurospora_crassa,Neurospora_crassa,#050f07,338,Neurospora crassa,outgroup,0,outgroup,outgroup,Leu
338,339,Saitoella_complicata,Saitoella_complicata,Saitoella_complicata,#050f07,339,Saitoella complicata,outgroup,0,outgroup,outgroup,Leu
339,340,sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,Sclerotinia_sclerotiorum,#050f07,340,Sclerotinia sclerotiorum,outgroup,0,outgroup,outgroup,Leu
340,341,stagonospora_nodorum,Stagonospora_nodorum,Stagonospora_nodorum,#050f07,341,Stagonospora nodorum,outgroup,0,outgroup,outgroup,Leu
341,342,xylona_heveae,Xylona_heveae,Xylona_heveae,#050f07,342,Xylona heveae,outgroup,0,outgroup,outgroup,Leu


In [16]:
# Collect the outgroup species names and format them like the FASTA ids
# (lowercase, underscores instead of spaces).
outgroup_species = species_df.loc[
    species_df['Major clade'] == 'outgroup', 'Species name'
].tolist()

formatted_outgroup_species = [
    outgroup_name.lower().replace(' ', '_') for outgroup_name in outgroup_species
]

print(formatted_outgroup_species)

# Report, for each outgroup species, whether any cleaned FASTA id contains its name.
for species in formatted_outgroup_species:
    if any(species in record.id for record in cleaned_CHC_records):
        print('Outgroup species matched:', species)
    else:
        print('Outgroup species not matched:', species)

['arthrobotrys_oligospora', 'aspergillus_nidulans', 'botrytis_cinerea', 'coccidioides_immitis', 'fusarium_graminearum', 'neurospora_crassa', 'saitoella_complicata', 'sclerotinia_sclerotiorum', 'stagonospora_nodorum', 'xylona_heveae', 'schizosaccharomyces_pombe']
Outgroup species not matched: arthrobotrys_oligospora
Outgroup species not matched: aspergillus_nidulans
Outgroup species not matched: botrytis_cinerea
Outgroup species not matched: coccidioides_immitis
Outgroup species not matched: fusarium_graminearum
Outgroup species not matched: neurospora_crassa
Outgroup species not matched: saitoella_complicata
Outgroup species not matched: sclerotinia_sclerotiorum
Outgroup species not matched: stagonospora_nodorum
Outgroup species not matched: xylona_heveae
Outgroup species not matched: schizosaccharomyces_pombe


In [17]:
# Flatten the species -> (record, clade) dictionary into one row per species;
# the clade is not included in the table.
rows = [
    {
        'species': species_key,
        'sequence': str(chc_record.seq),
        'id': chc_record.id,
        'description': chc_record.description,
    }
    for species_key, (chc_record, _clade) in ordered_CHCs.items()
]

CHC_dict_df = pd.DataFrame(rows)
CHC_dict_df.head()

Unnamed: 0,species,sequence,id,description
0,alloascoidea_hylecoeti,MSNSSAPPSRCGIWEVDDFSDCFRHSYIETYFPLIILSLSVLFILI...,alloascoidea_hylecoeti_Seq_6913,alloascoidea_hylecoeti_Seq_6913 y1000_id=0_691...
1,nakazawaea_holstii,MGSEIPIDFTELAQLTQLGIQPQSLDFRSTTLESDKYVCIREQTAN...,yHMPu5000034918_nakazawaea_holstii_160519_Seq_...,yHMPu5000034918_nakazawaea_holstii_160519_Seq_...
2,nakazawaea_peltata,MSDIPIEFTELAQLTALGVAPLSLEFRSTTLESDRYVCVREQGAGG...,nakazawaea_peltata_Seq_3684,nakazawaea_peltata_Seq_3684 y1000_id=99_3683 g...
3,pachysolen_tannophilus,MASEIPIDFTELAQLTSLGIQSQSLDFKSTTLESDKYVCVRESTAN...,pachysolen_tannophilus_Seq_5663,pachysolen_tannophilus_Seq_5663 y1000_id=106_5...
4,peterozyma_xylosa,MSNDIPIDFTELAQLTSLGIQPNSLDFMSTTLESDKYVCIRETTAS...,yHMPu5000034883_peterozyma_xylosa_160519_Seq_2679,yHMPu5000034883_peterozyma_xylosa_160519_Seq_2...


In [18]:
# Check that every sequence stored in `ordered_CHCs` also occurs in the cleaned FASTA.
ordered_CHCs_list = [str(chc_record.seq) for chc_record, _clade in ordered_CHCs.values()]
CHCs_fasta_list = [str(seq_record.seq) for seq_record in cleaned_CHC_records]

# Set membership replaces the nested scan over both lists.
fasta_sequences = set(CHCs_fasta_list)

for ordered_seq in ordered_CHCs_list:
    if ordered_seq not in fasta_sequences:
        # Report the sequence that failed the lookup. The previous message printed
        # the inner loop variable, i.e. the last FASTA sequence, and named the wrong list.
        # Truncated to 100 characters, like the reverse check in the next cell.
        print('Sequence not found in CHCs_fasta_list:', ordered_seq[0:100] + '...')

In [19]:
# Reverse check: collect FASTA sequences that never made it into `ordered_CHCs_list`.
ordered_sequences = set(ordered_CHCs_list)

missing_seqs = []
for seq in CHCs_fasta_list:
    if seq in ordered_sequences:
        continue
    # Only the first 100 residues are printed to keep the output readable.
    print('Not found:', seq[0:100] + '...')
    missing_seqs.append(seq)

missing_count = len(missing_seqs)
print(f'{missing_count} missing sequences')

0 missing sequences


In [20]:
# Translate each missing sequence back into the id(s) of the FASTA record(s) carrying it.
missing_seq_names = []
for missing_seq in missing_seqs:
    matching_ids = [
        seq_record.id
        for seq_record in cleaned_CHC_records
        if missing_seq == seq_record.seq
    ]
    for record_id in matching_ids:
        print(record_id)
    missing_seq_names.extend(matching_ids)

In [21]:
# Many of the species names in the fasta file have an outdated version of their name.
# I'm using this to get rid of them (and also potentially find that one missing corresponding sequence).


def _strip_fasta_id_suffix(record_id):
    """Reduce a FASTA record id to the part comparable with 'old_species_names'.

    Removes a trailing 'Seq_<n>' pair and, for ids starting with 'yHMP', a
    trailing all-digit token (presumably a date stamp such as '160519').
    Tokens are stripped from the end rather than deleted by fixed position
    while iterating, so species names with more than two words stay intact.
    """
    parts = record_id.split('_')
    if len(parts) >= 2 and parts[-2] == 'Seq':
        parts = parts[:-2]
    if len(parts) > 1 and parts[0].startswith('yHMP') and parts[-1].isdigit():
        parts = parts[:-1]
    return '_'.join(parts)


formatted_missing_seq_names = []
for name in missing_seq_names:
    name = _strip_fasta_id_suffix(name)
    print(name)
    formatted_missing_seq_names.append(name.lower())

print(formatted_missing_seq_names)
print()

# Rows whose old (outdated) species name matches one of the unmatched FASTA names.
lowercase_old_names = species_df['old_species_names'].str.lower()
old_names_in_fasta = species_df[lowercase_old_names.isin(formatted_missing_seq_names)]
print('Number names in fasta file that aren`t in the "Species name" column of the txt file (outdated names):', len(old_names_in_fasta))
old_names_in_fasta

[]

Number names in fasta file that aren`t in the "Species name" column of the txt file (outdated names): 0


Unnamed: 0,ID,original_genome_id,old_species_names,species_names_fig2,hex,id_table S1,Species name,Major clade,clade_order,Family,Genus,CUG usage


In [22]:
# Originally filtered this one out; turns out it, too, had an outdated name. The 3 cells below show it's properly added and ordered now:
wickerhamomyces_record, _wickerhamomyces_clade = ordered_CHCs['wickerhamomyces_sp.']
print(wickerhamomyces_record.seq)

is_wickerhamomyces = species_df_sorted['Species name'] == 'Wickerhamomyces sp.'
print(species_df_sorted[is_wickerhamomyces])

MSDIPIEFTELTDLTSLGINQQSLEFRSTTLESDHYVVVREQVNGANTVAIVDLHNNNQVTRKNMSADNAILHPKQFIISLRANGTTLQIFNLETKEKLKSYNLEEPVIFWKWLNDEQLGLITATSIFTWNVFDGQPTAGPTKLTARHANLNNAQIINFVANKNFDWFAVVGITQENGRIAGKIQLYSKQRNVSQPIEGHVAGFGGITLEGASSPTQLFVCGNRTATGGQLHIIEIDHDNNNPHFQKKSVEIFFPPDATNDFPISIQISNKYGVIYLLTKYGFIHLYDLESGSNLFVNRITATPVFIASSYDDNNGILAINKTGQVLAVEISKDKLIPYILNKLANIPLALALASRGGLPGAENLFLQQFDTLLSQGDYANAAKVAASSEQLRTPQTIQKLKNVHAAPGAISPILQYFSTLLDKGKLNQFETIELAKPVLQQDRKQLFEKWLKEDKLTSSEELGDIVKPFDLNLALAIYLRAGTHAKVISALAELGQFDKIIPYSEKVGYQPNFIVLISNLLRSNPDKASEFAISLLNSPATSGQIEVEKISDIFFSQNFIQQGTSFLLDALKDDSPNQGHLQTRLLEINLLHAPQVADAILGNDMFHHYDRPTIAQLSEKAGLFQRALENYSDIKDIKRVIVHTNAIPADWLVAYFGKLNVEQSLVALRELLDKNIAQNLQIVIQVATKFSDLIGSATLIKLFEEFKSFEGLYYYLASTVNLTDDKDVVFKYIQSAAKLGQFKEIERVVKDNNVYDPEKVKNFLKDANLPDQLPLIIVCDRYNYVHDLILYLYKHQFFKFIEVYVQQVNPSKTAQVVAALLDVDCDEKVIQNLLQSVLGQVPVAELTTEVEKRNRLKLLLPFLEATLNSGSQDQAVFNTLAKIYIDSNNNPEKFLKENDSYDTKEVGHYCEKRDPYLAYIAYEKGKNDEELIRITNENSMYKYQARYLLARSDPSLWNVVLSEENIHRRQLIDQVVGVAVPEAVDAEPISLTVKAFMDN

In [23]:
# Print the 157th entry (counting from 1) of the dictionary, in insertion order.
for position, (key, value) in enumerate(ordered_CHCs.items(), start=1):
    if position == 157:
        print(key, value)
        break

wickerhamomyces_sp. (SeqRecord(seq=Seq('MSDIPIEFTELTDLTSLGINQQSLEFRSTTLESDHYVVVREQVNGANTVAIVDL...NGF'), id='yHMPu5000035286_wickerhamomyces_sp._160928_Seq_128', name='yHMPu5000035286_wickerhamomyces_sp._160928_Seq_128', description='yHMPu5000035286_wickerhamomyces_sp._160928_Seq_128 y1000_id=283_127 gene_full=genemark-NODE_26_length_125553_cov_16.6396_ID_2725-processed-gene-0.16 CDS=1-4983', dbxrefs=[]), 'Phaffomycetaceae')


In [24]:
# Positional row 156 (0-based) is the 157th row of the sorted table.
row_position = 156
species_df_sorted.iloc[row_position]

ID                              188
Species name    Wickerhamomyces sp.
Major clade        Phaffomycetaceae
Name: 156, dtype: object

## Confirming the order of the dictionary

In [26]:
# Count how many formatted species names are keys of `ordered_CHCs`.
match_count = sum(1 for species in formatted_species_order if species in ordered_CHCs)
match_found = match_count > 0

if not match_found:
    print('No matches found')

print(f'{match_count} matches found')

332 matches found


In [27]:
# Compare the species list with the dictionary keys: first as sets, then by order.
dict_keys = list(ordered_CHCs.keys())

if set(formatted_species_order) != set(dict_keys):
    print('Elements differ between the list and the dictionary keys.')
else:
    print('Same elements')
    same_order = formatted_species_order == dict_keys
    print('Elements are in the same order' if same_order else 'Order differs')

Same elements
Elements are in the same order


In [28]:
# True when the DataFrame's species column has the same order as the species list.
species_column = CHC_dict_df['species'].tolist()
species_column == formatted_species_order

True

## Convert the ordered dictionary into a FASTA file for MSA

In [29]:
# Prepend clade name to each SeqRecord object's description to more easily split the aligned file into multiple FASTAs
for seq_record, clade in ordered_CHCs.values():
    # '/' and ' ' are replaced so the clade label is a single token.
    clade_label = clade.replace('/', '_').replace(' ', '-')
    clade_prefix = f'{clade_label} | '
    # The records are mutated in place, so skip ones already prefixed; otherwise
    # re-running this cell would produce 'clade | clade | ...'.
    if not seq_record.description.startswith(clade_prefix):
        seq_record.description = f'{clade_prefix}{seq_record.description}'

In [30]:
# Write the clade-ordered records to a single FASTA file, used later as MAFFT input.
seq_records = [seq_record for (seq_record, clade) in ordered_CHCs.values()]

ordered_fasta_path = './data/ordered-CHCs.fasta'
with open(ordered_fasta_path, 'w') as handle:
    SeqIO.write(seq_records, handle, 'fasta')

# NOTE: the human CHC is not part of `seq_records`; it is added to this file by
# hand afterwards. Re-running this cell overwrites the file, so the human CHC
# has to be added again before the alignment cells.
# The message is built from the path so it cannot drift from the file actually written.
print(f'Wrote {len(seq_records)} sequences to {ordered_fasta_path}')

Wrote 332 sequences to ordered_CHCs.fasta


### Note: Make sure to add the human CHC to ordered-CHCs.fasta before the following cells

In [33]:
# These are the various CHC lengths. They vary quite a lot, so --localpair will be used with MAFFT.
sequence_lengths = [len(seq_record.seq) for seq_record in seq_records]
print(sequence_lengths)

[3287, 1672, 1674, 1667, 1668, 1668, 1665, 1658, 1665, 1523, 1673, 1673, 1673, 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1666, 1665, 1665, 1665, 1665, 1667, 1657, 2016, 1666, 1666, 1667, 1666, 1666, 1668, 1676, 1667, 1661, 1662, 1671, 1664, 1658, 2015, 1662, 1665, 1665, 1658, 1665, 1660, 1665, 1665, 1669, 1669, 1669, 1661, 1667, 1664, 1664, 1664, 1662, 1665, 2018, 1662, 1635, 1673, 1669, 1635, 1660, 1671, 1663, 1668, 1666, 1606, 2055, 1669, 1663, 1661, 1665, 1658, 1667, 1664, 1660, 1668, 1668, 2033, 1667, 1672, 1667, 1668, 1676, 1667, 1667, 1667, 1667, 1672, 1668, 1662, 1706, 1691, 1719, 1692, 1643, 1647, 1643, 1670, 1643, 1647, 1670, 1670, 1643, 1643, 1675, 1660, 1674, 1678, 1670, 1656, 1660, 1655, 1670, 1676, 1653, 1672, 1669, 1643, 1656, 1662, 1642, 1648, 1663, 1656, 1643, 1427, 2199, 1680, 1654, 1668, 1666, 1659, 1660, 1655, 1660, 1662, 1661, 1660, 1671, 1658, 1662, 2034, 1662, 1660, 1661, 1662, 1661, 1661, 1663, 2085, 1666, 1663, 1661, 1660, 1661, 1661, 207

In [34]:
import subprocess

# Read the ordered FASTA as one string; it is piped to MAFFT on stdin.
with open('./data/ordered-CHCs.fasta', 'r') as handle:
    ordered_seqs_str = handle.read()

# The trailing '-' tells MAFFT to read its input from stdin.
cmd = ['mafft', '--localpair', '--maxiterate', '1000', '--thread', '10', '--quiet', '-']

process = subprocess.run(cmd, input=ordered_seqs_str, text=True, capture_output=True)

if process.returncode == 0:
    aligned_fasta = process.stdout
    print('Alignment completed successfully')
    with open('./data/CHC1-total-aligned.fasta', 'w') as out_f:
        out_f.write(aligned_fasta)
else:
    # A failure is only reported; no file is written and no exception is raised.
    print('MAFFT error:', process.stderr)

Alignment completed successfully


## Split aligned FASTA into FASTAs by clade

In [35]:
# First take out the human alignment
# Assumes the human CHC is the first record of the aligned file (it is added to
# ordered-CHCs.fasta by hand before alignment) - TODO confirm its position.
aligned_fasta_path = './data/CHC1-total-aligned.fasta'
total_aligned_records = list(SeqIO.parse(aligned_fasta_path, 'fasta'))

# The file is overwritten in place, so guard on the record count: without this,
# every re-run of the cell would drop one more record from the alignment.
expected_yeast_count = len(ordered_CHCs)

if len(total_aligned_records) == expected_yeast_count + 1:
    total_aligned_records = total_aligned_records[1:]
    with open(aligned_fasta_path, 'w') as output_handle:
        SeqIO.write(total_aligned_records, output_handle, 'fasta')
elif len(total_aligned_records) == expected_yeast_count:
    print('Aligned file already holds only the yeast records; nothing removed')
else:
    raise ValueError(
        f'Expected {expected_yeast_count + 1} aligned records (yeasts plus human), '
        f'found {len(total_aligned_records)}; not modifying {aligned_fasta_path}'
    )

In [36]:
# Split into one file per clade (12 clades are kept by the filter earlier in the notebook)
from collections import defaultdict
from pathlib import Path

clade_groups = defaultdict(list)

with open('./data/CHC1-total-aligned.fasta') as handle:
    for seq_record in SeqIO.parse(handle, 'fasta'):
        # Expected header layout: '<id> <clade> | <original description>'.
        _record_id, _space, after_id = seq_record.description.partition(' ')
        clade_name, separator, _rest = after_id.partition('|')
        clade_name = clade_name.strip()
        if not separator or not clade_name:
            # A header without a clade prefix (e.g. a record added by hand) cannot
            # be assigned to a clade; report it instead of failing or creating a
            # file named after the whole header.
            print('Skipping record without clade prefix:', seq_record.id)
            continue
        clade_groups[clade_name].append(seq_record)

# Create the output directory if needed so the writes below cannot fail on a fresh checkout.
clade_output_dir = Path('./data/CHC1-fastas-by-clade')
clade_output_dir.mkdir(parents=True, exist_ok=True)

for clade, seqs in clade_groups.items():
    clade = clade.replace('/', '_').replace(' ', '-')
    with open(clade_output_dir / f'{clade}_aligned_CHCs.fasta', 'w') as out_f:
        SeqIO.write(seqs, out_f, 'fasta')