In [None]:
import pandas as pd
import numpy as np
import os
# Show the current working directory; the relative file paths used below resolve against it.
os.getcwd() 

In [None]:
# Load the chant table. cantus_id is read as text so that ids with leading zeros
# (e.g. '003511', matched as a string further down) are never parsed as numbers.
christmas = pd.read_csv("netvor++_0.3_2023-03-21.csv", dtype={"cantus_id": str})
christmas.head()

In [None]:
# Numeric symbols encode the structure of the staff rather than the melody, so they
# are removed from the volpiano string before translation and alignment.
#
# The codes are removed one after another as literal substrings (regex=False).
# Where one code is contained in another, the longer one must be removed first:
# removing '---6' before '-----6' would consume part of the longer code, so the
# longer code could never match and its leading dashes would be left behind.
STAFF_CODES = (
    '1---',        # g-clef
    '1--',         # some g-clefs are encoded 1-- rather than 1---
    '---3',        # single bar
    '---4',        # double bar
    '4',           # appears as an erroneous instance of the previous code
    '---5',        # not documented, but appears at the end of some chants
    '777',         # column break
    '77',          # page break
    '7',           # line break
    '6------6',    # missing pitches
    # dash runs ending in 6, longest first
    '---------6',
    '--------6',
    '-------6',
    '------6',
    '-----6',
    '---6',
)

volpiano_clean = christmas['volpiano']
for code in STAFF_CODES:
    volpiano_clean = volpiano_clean.str.replace(code, '', regex=False)
christmas['volpiano'] = volpiano_clean


In [None]:
# Work on an independent deep copy so the gapped originals keep their information.
christmas_translated = christmas.copy(deep=True)
# Translate the dash runs into single symbols: every '---' becomes '~' first,
# then every remaining '--' becomes '|'.
christmas_translated['volpiano'] = (
    christmas_translated['volpiano']
    .str.replace('---', '~')
    .str.replace('--', '|')
)
# Display the source column to check that the translation did not touch it.
christmas['volpiano'].head()

In [None]:
# First rows of the translated melodies, for comparison with the gapped ones.
christmas_translated['volpiano'].head()

In [None]:
# Goal: a new column combining the cantus id and the incipit.
# Before building it, check whether cantus_id (or corpus_id) holds only unique
# values by counting how often each cantus_id occurs.
christmas['cantus_id'].value_counts()

In [None]:
# Use the cantus id to pick comparable melodies. The ':cs' particle is removed
# because it only marks a Czech text set to essentially the same melody.
# The vectorised .str accessor leaves a missing cantus_id as NaN instead of raising.
christmas['cantus_id_clean'] = christmas['cantus_id'].str.replace(':cs', '', regex=False)
christmas['cantus_id_clean'].value_counts()

In [None]:
# Same clean-up for the translated version: drop the ':cs' particle from cantus_id.
# The vectorised .str accessor leaves a missing cantus_id as NaN instead of raising.
christmas_translated['cantus_id_clean'] = (
    christmas_translated['cantus_id'].str.replace(':cs', '', regex=False)
)
christmas_translated['cantus_id_clean'].value_counts()

In [None]:
# Build the FASTA pieces for the gapped data:
#   idx               - the row label of each chant
#   cantusidclean_idx - cleaned cantus id and row label, separated by a space
#   fasta             - that header, a newline, and the volpiano string
christmas['idx'] = christmas.index.tolist()
christmas['cantusidclean_idx'] = [
    f'{chant_id} {row_label}'
    for chant_id, row_label in zip(christmas['cantus_id_clean'], christmas['idx'])
]
christmas['fasta'] = [
    f'{header}\n{melody}'
    for header, melody in zip(christmas['cantusidclean_idx'], christmas['volpiano'])
]
christmas.head()

In [None]:
# Build the FASTA pieces for the translated data:
#   idx               - the row label of each chant
#   cantusidclean_idx - cleaned cantus id and row label, separated by a space
#   fasta             - that header, a newline, and the translated volpiano string
christmas_translated['idx'] = christmas_translated.index.tolist()
christmas_translated['cantusidclean_idx'] = [
    f'{chant_id} {row_label}'
    for chant_id, row_label in zip(
        christmas_translated['cantus_id_clean'], christmas_translated['idx']
    )
]
christmas_translated['fasta'] = [
    f'{header}\n{melody}'
    for header, melody in zip(
        christmas_translated['cantusidclean_idx'], christmas_translated['volpiano']
    )
]
christmas_translated.head()

In [None]:
# Split each dataset by cantus_id_clean, one subset per chant:
#   003511  (judaea et jerusalem)
#   004195  (orietur sicut sol salvator)
#   007040a (constantes estote videbitis)
#   605019  (judaea et jerusalem II)
#   001737  (bethelem non est minima)
#   002000  (cum esset desponsata)
chant_ids = ('003511', '004195', '007040a', '605019', '001737', '002000')

judjer1, orisic, consest, judjer2, bethnon, cumesset = (
    christmas.loc[christmas['cantus_id_clean'] == chant_id]
    for chant_id in chant_ids
)
# the same subsets, taken from the translated data
judjer1_tr, orisic_tr, consest_tr, judjer2_tr, bethnon_tr, cumesset_tr = (
    christmas_translated.loc[christmas_translated['cantus_id_clean'] == chant_id]
    for chant_id in chant_ids
)
cumesset_tr.head()

In [None]:
# Write one FASTA file per chant subset: for every row a '> <header>' line
# followed by a line holding the volpiano string.
gapped_subsets = {
    'judjer1.fasta': judjer1,
    'orisic.fasta': orisic,
    'consest.fasta': consest,
    'judjer2.fasta': judjer2,
    'bethnon.fasta': bethnon,
    'cumesset.fasta': cumesset,
}

for fasta_path, subset in gapped_subsets.items():
    with open(fasta_path, 'w') as fasta_file:
        for header, melody in zip(subset['cantusidclean_idx'], subset['volpiano']):
            fasta_file.write('> ' + header + '\n')
            fasta_file.write(melody + '\n')

In [None]:
# Write one FASTA file per translated chant subset: for every row a '> <header>'
# line followed by a line holding the translated volpiano string.
translated_subsets = {
    'judjer1_tr.fasta': judjer1_tr,
    'orisic_tr.fasta': orisic_tr,
    'consest_tr.fasta': consest_tr,
    'judjer2_tr.fasta': judjer2_tr,
    'bethnon_tr.fasta': bethnon_tr,
    'cumesset_tr.fasta': cumesset_tr,
}

for fasta_path, subset in translated_subsets.items():
    with open(fasta_path, 'w') as fasta_file:
        for header, melody in zip(subset['cantusidclean_idx'], subset['volpiano']):
            fasta_file.write('> ' + header + '\n')
            fasta_file.write(melody + '\n')

In [None]:
# Run mafft on the FASTA files generated above for the gapped sequences; each run
# reads <name>.fasta and redirects the alignment to <name>.aligned.fasta.
# NOTE(review): assumes the scoring matrix file 'textmatrix_complete' is available
# in the working directory - confirm.
#
# Every alignment is attempted, and the exit status of each run is kept so that a
# failed run (which would leave an empty or truncated output file) is reported
# instead of passing silently.
mafft_status = {}
for name in ('judjer1', 'orisic', 'consest', 'judjer2', 'bethnon', 'cumesset'):
    mafft_status[name] = os.system(
        'mafft --text --textmatrix textmatrix_complete --globalpair --maxiterate 1000 '
        f'{name}.fasta > {name}.aligned.fasta'
    )

mafft_failed = [name for name, status in mafft_status.items() if status != 0]
if mafft_failed:
    raise RuntimeError('mafft failed for: ' + ', '.join(mafft_failed))
mafft_status

In [None]:
# Run mafft on the FASTA files generated above for the translated sequences; each
# run reads <name>.fasta and redirects the alignment to <name>.aligned.fasta.
# NOTE(review): assumes the scoring matrix file 'textmatrix_complete' is available
# in the working directory - confirm.
#
# Every alignment is attempted, and the exit status of each run is kept so that a
# failed run (which would leave an empty or truncated output file) is reported
# instead of passing silently.
mafft_status_tr = {}
for name in ('judjer1_tr', 'orisic_tr', 'consest_tr', 'judjer2_tr', 'bethnon_tr', 'cumesset_tr'):
    mafft_status_tr[name] = os.system(
        'mafft --text --textmatrix textmatrix_complete --globalpair --maxiterate 1000 '
        f'{name}.fasta > {name}.aligned.fasta'
    )

mafft_failed_tr = [name for name, status in mafft_status_tr.items() if status != 0]
if mafft_failed_tr:
    raise RuntimeError('mafft failed for: ' + ', '.join(mafft_failed_tr))
mafft_status_tr

In [None]:
# Convert every aligned FASTA file to nexus with phyx (pxs2nex), then write a copy
# of each nexus file with PROTEIN replaced by STANDARD to <file>.std and remove the
# intermediate nexus file.
#
# The loops iterate over the shell glob directly, with quoted variables, instead of
# parsing the output of `ls`; the `[ -e ... ] || continue` guard skips the
# unexpanded pattern when nothing matches.
# NOTE(review): '*nexus' matches every file in the working directory whose name ends
# in 'nexus', and each match is deleted after conversion - confirm that no unrelated
# nexus files live there.
os.system(
    'for i in *.aligned.fasta; do '
    '[ -e "$i" ] || continue; '
    'pxs2nex -s "$i" -o "$i.nexus"; '
    'done; rm -f phyx.logfile'
)
os.system(
    'for i in *nexus; do '
    '[ -e "$i" ] || continue; '
    'sed "s/PROTEIN/STANDARD/g" "$i" > "$i.std"; '
    'rm "$i"; '
    'done'
)

In [None]:
# Inspect the cleaned volpiano string of the chant at position 91.
christmas['volpiano'].iloc[91]

In [None]:
# the alignment above looks better, but numeric characters seem to be useless in this context 
# as they encode bars and these even seem to be interfering with the alignment
# a summary of the numeric codes in volpiano: https://cantus.uwaterloo.ca/sites/default/files/documents/2.%20Volpiano%20Protocols.pdf
# 1--- : g-clef
# ---3 : single bar
# ---4 : double bar
# possible erroneous version of the above: 4
# ---5 present in some chants but undocumented
# 7 : line break
# 77 : page break
# 777 : column break
# 9 : Gamma-ut
# 6------6 : missing pitches