In [1]:
from music21 import *
import numpy as np
import pandas as pd
import os
import shutil
from sklearn import preprocessing
import html

Read in the Excel workbook (workList.xlsx) containing the list of works (path, krn file name, etc.)

In [2]:
#one row per candidate work; later cells read the 'include', 'file', 'folder' and 'spotifyID' columns
workList = pd.read_excel("workList.xlsx")

In [3]:
#calculate features and get metadata for each piece
#
#For every row of workList flagged with include==1 this cell:
#  * copies the original .krn file into corpus/kernfiles/ (prefixed with the first folder component),
#  * exports a MIDI rendering to docs/midi/ unless that file already exists,
#  * applies manual metadata/instrumentation fixes for specific pieces,
#  * extracts the features and appends one list per piece to featureList
#    (the order of the values must match the column list used when the dataframe is created).
featureList=[]


def _hasInstrument(flatScore, *instrumentClasses):
    """Return 1 if flatScore holds at least one element of any of the given instrument classes, else 0."""
    return int(any(len(flatScore.getElementsByClass(cls))>0 for cls in instrumentClasses))


for index, row in workList.iterrows():
    #skip works that are not flagged for inclusion
    if not row['include']==1:
        continue

    file=row['file']
    folder=row['folder']
    #location of original krn file
    fp='data/classical/' + folder + '/' + file
    print(fp)

    #copy the ones we are using into the corpus folder with composer name as a prefix
    filename = folder.split("/")[0] + '_' + file
    corpusPath="corpus/kernfiles/" + filename
    if not os.path.exists(corpusPath):
        shutil.copyfile(fp, corpusPath)

    #parse file
    s = converter.parse(fp)

    #export midi (skipped when the midi file already exists)
    midiFileName = filename.split('.')[0] + '.mid'
    midifp='docs/midi/' + midiFileName
    if not os.path.exists(midifp):
        s.write('midi',midifp)

    ####manual edits
    #insert missing composer info
    #for Susato piece, insert missing instrumentation (SATB recorders from .krn not recognized)
    #music21 does not distinguish between type of recorder (SATB) so just assign all parts as recorder.
    if folder == 'susato/danserye' and file =='reihentaenze-07.krn':
        s.metadata.composer='Susato, Tielman'
        for p in s.parts.stream():
            p.insert(instrument.Recorder())
    elif folder =='petrus' and file =='annun.krn':
        s.metadata.composer='Petrus de Cruce'

    #flatten only after the manual edits, so that the Recorder instruments inserted above
    #are part of the flattened score searched by the instrument-family checks below
    sflat=s.flat

    composer=html.unescape(s.metadata.composer)
    title=html.unescape(s.metadata.title)
    date=s.metadata.date
    spotifyID=row['spotifyID']

    #average number of independent voices sounding simultaneously
    avgNumVoices = round(features.jSymbolic.AverageNumberOfIndependentVoicesFeature(s).extract().vector[0],2)

    #key and type (e.g. C major)
    key = s.analyze('key')

    #key type
    keyType=key.type

    #number of sharps or flats in the key signature
    #(key.sharps is negative for flats and positive for sharps; abs() keeps only the count)
    sharpsOrFlats=abs(key.sharps)

    #initial tempo
    tempo = features.jSymbolic.InitialTempoFeature(s).extract().vector[0]

    #initial time signature
    timeSig = features.jSymbolic.InitialTimeSignatureFeature(s).extract().vector

    #note density (average number of notes per second, taking local tempo into account)
    noteDensity = round(features.jSymbolic.NoteDensityFeature(s).extract().vector[0],2)

    #1 if the initial meter is compound, 0 otherwise
    compoundMeter = features.jSymbolic.CompoundOrSimpleMeterFeature(s).extract().vector[0]

    #1 if there was at least one meter change, 0 otherwise
    meterChanges = features.jSymbolic.ChangesOfMeterFeature(s).extract().vector[0]

    #length of piece in minutes
    minutes = round(sflat.seconds/60,2)

    #number of parts
    nParts=len(s.parts)

    #list of parts as text
    partList=[p.partName for p in s.parts.stream()]

    #Create indicators for instrument families (1 if present in the flattened score, 0 otherwise)
    strings=_hasInstrument(sflat, instrument.StringInstrument)
    #recorders are checked explicitly in addition to the general woodwind class
    woodwinds=_hasInstrument(sflat, instrument.WoodwindInstrument, instrument.Recorder)
    brass=_hasInstrument(sflat, instrument.BrassInstrument)
    voice=_hasInstrument(sflat, instrument.Vocalist)
    keys=_hasInstrument(sflat, instrument.KeyboardInstrument)
    percussion=_hasInstrument(sflat, instrument.Percussion)

    #this piece has a hurdy-gurdy, which is not a standard music21 instrument.
    #should it be keys or strings?
    if folder == 'vaqueiras' and file =='kalenda_maya.krn':
        keys=1

    #identify which notes are accidentals, range per part, durations of notes in terms of quarter lengths,
    #and durations of notes in terms of seconds
    p_accidentals_list=[]
    partRange_list=[]
    duration_list=[]
    secs_list=[]

    for part in s.parts:
        acc=0
        notes=0

        for n in part.recurse().getElementsByClass('Note'):
            if n.pitch.accidental is not None:
                acc+=1
            notes+=1
            duration_list.append(n.duration.quarterLength)
            secs_list.append(n.seconds)
        if notes>0: #some scores have spines with no notes (e.g. victoria)
            p_accidentals_list.append(round(acc/notes,2))
            partRange_list.append(part.analyze('ambitus').semitones)

    #proportion of notes that are accidentals (max across parts)
    max_p_accidentals=max(p_accidentals_list)
    #range in semitones within each part (max across parts)
    max_partRange=max(partRange_list)

    #proportion of notes with note length equal to the shortest duration in the piece
    #example: if shortest note in piece is a sixteenth note, find the proportion of notes that are sixteenths
    durationSeries = pd.Series(duration_list)
    vals, counts = np.unique(durationSeries, return_counts=True)
    p_shortestDuration = counts[np.argmin(vals)]/sum(counts)

    #percentage of notes that last <=0.25 seconds, taking local tempo into account
    p_fastNotes=sum(1 for i in secs_list if i <= 0.25)/len(secs_list)

    featureList.append([composer, title, date, minutes,
                        key, keyType, sharpsOrFlats, max_p_accidentals, max_partRange,
                        tempo, timeSig, compoundMeter, meterChanges, noteDensity, p_shortestDuration, p_fastNotes,
                        avgNumVoices, nParts, partList, strings, keys, woodwinds, percussion, brass, voice,
                        filename, midiFileName, spotifyID])
    

data/classical/adam/fimaris.krn
data/classical/alkan/op38/02-fa_edited.krn
data/classical/bach/violin/partita2-1.krn
data/classical/bachcpe/vol01/Wq117-37_edited.krn
data/classical/beethoven/piano/sonata/sonata08-2.krn
data/classical/bononcini/perlagloria.krn
data/classical/brahms/op39/op39-02_edited.krn
data/classical/buxtehude/op1/op1-3-1.krn
data/classical/byrd/aveverum.krn
data/classical/chopin/scherzo/scherzo2.krn
data/classical/clementi/op36/sonatina-36-3-1.krn
data/classical/dufay/omnes_amici.krn
data/classical/dunstable/veni.krn
data/classical/faure/apres.krn
data/classical/flecha/bomba.krn
data/classical/frescobaldi/canzoni/canzoni14.krn
data/classical/gabrieli/mysterium_edited.krn
data/classical/gabrielia/contrafacta/coppini23.krn
data/classical/gibbons/silverswan.krn
data/classical/giovannelli/contrafacta/coppini15.krn
data/classical/grieg/op46/op46-4_edited.krn
data/classical/handel/largo.krn
data/classical/haydn/keyboard/uesonatas/sonata33-3_edited.krn
data/classical/haydn

In [4]:
#build the feature table: one row per processed piece, with the column labels listed
#in the same order as the values appended to featureList
df=pd.DataFrame(
    data=featureList,
    columns=[
        'composer', 'title', 'date', 'minutes',
        'key', 'keyType', 'sharpsOrFlats', 'max_p_accidentals', 'max_partRange',
        'tempo', 'timeSig', 'compoundMeter', 'meterChanges', 'noteDensity',
        'p_shortestDuration', 'p_fastNotes',
        'avgNumVoices', 'nParts', 'partList',
        'strings', 'keys', 'woodwinds', 'percussion', 'brass', 'voice',
        'filename', 'midiFileName', 'spotifyID',
    ],
)

In [5]:
#display the assembled feature table
df

Unnamed: 0,composer,title,date,minutes,key,keyType,sharpsOrFlats,max_p_accidentals,max_partRange,tempo,...,partList,strings,keys,woodwinds,percussion,brass,voice,filename,midiFileName,spotifyID
0,Adam de la Halle,Fi Maris de vostre Amour,,0.23,e minor,minor,1,0.28,9,80.0,...,"[Voice, Voice, Voice]",0,0,0,0,0,1,adam_fimaris.krn,adam_fimaris.mid,0XKQIVQ97pPTJg2uDEivD4
1,"Alkan, Charles-Valentin","Fa, Op. 38, No. 2",,2.94,d minor,minor,1,0.26,36,72.0,...,"[Piano, Piano, Piano]",0,1,0,0,0,0,alkan_02-fa_edited.krn,alkan_02-fa_edited.mid,3jS5RqvjbgnThgzu9iqDD8
2,"Bach, Johann Sebastian",Movement 1: Allemande,,1.32,d minor,minor,1,0.2,30,94.0,...,[Violin],1,0,0,0,0,0,bach_partita2-1.krn,bach_partita2-1.mid,2shAgoNd95hE6pAjAxxor7
3,"Bach, Carl Philipp Emanuel",La Gause,,0.51,F major,major,1,0.22,26,110.0,...,"[Piano, Piano]",0,1,0,0,0,0,bachcpe_Wq117-37_edited.krn,bachcpe_Wq117-37_edited.mid,0LSMmwJL5Nym0L7Vxt2FeH
4,"Beethoven, Ludwig van",Piano Sonata no. 8 in C minor,,3.65,A- major,major,4,0.73,45,40.0,...,"[Piano, Piano]",0,1,0,0,0,0,beethoven_sonata08-2.krn,beethoven_sonata08-2.mid,14YeWgIwIqyFnHuOkd7fyP
5,"Bononcini, Giovanni",For the love my heart doth prize,,3.9,D major,major,2,0.28,29,80.0,...,"[Voice, Piano, Piano]",0,1,0,0,0,1,bononcini_perlagloria.krn,bononcini_perlagloria.mid,2PhaOWw5VeChQCJcDBXpfD
6,"Brahms, Johannes","Waltz in E Major, Op.39 No.2",,0.46,E major,major,4,0.58,31,172.0,...,"[Piano, Piano, Piano]",0,1,0,0,0,0,brahms_op39-02_edited.krn,brahms_op39-02_edited.mid,5mwWvvJa6IubcMr4QzaFVu
7,"Buxtehude, Dietrich","Sonata in A Minor, Op. 1, No. 3",,1.17,a minor,minor,0,0.14,24,72.0,...,"[Violin, Viola, Harpsichord]",1,1,0,0,0,0,buxtehude_op1-3-1.krn,buxtehude_op1-3-1.mid,3xBWKAOczzfaLalXRzxntl
8,"Byrd, William",Ave verum corpus,,2.0,a minor,minor,0,0.1,19,120.0,...,"[Voice, Voice, Voice, Voice]",0,0,0,0,0,1,byrd_aveverum.krn,byrd_aveverum.mid,7J6b58JOnf4RGGqJmxq0bD
9,"Chopin, Frederic",Scherzo in B-flat Minor,,7.81,C# major,major,7,0.77,65,300.0,...,"[Piano, Piano]",0,1,0,0,0,0,chopin_scherzo2.krn,chopin_scherzo2.mid,6OwavBgVS1N4lAXZ2zKUfc


In [6]:
#save the full feature table (all columns, without the row index) to features.csv
df.to_csv('features.csv',index=False)

In [7]:
#columns that feed the difficulty score
diffFeatures=df.loc[:, [
    'minutes',
    'max_p_accidentals',
    'sharpsOrFlats',
    'max_partRange',
    'tempo',
    'compoundMeter',
    'meterChanges',
    'noteDensity',
    'p_fastNotes',
]]

In [8]:
#summary statistics of the unscaled difficulty components
diffFeatures.describe()

Unnamed: 0,minutes,max_p_accidentals,sharpsOrFlats,max_partRange,tempo,compoundMeter,meterChanges,noteDensity,p_fastNotes
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,2.654483,0.337069,1.741379,28.896552,114.586207,0.051724,0.241379,6.068966,0.3676
std,2.463444,0.239773,1.691676,15.259958,44.979541,0.223404,0.431657,2.812541,0.32228
min,0.2,0.0,0.0,9.0,40.0,0.0,0.0,1.04,0.0
25%,0.985,0.175,1.0,17.0,80.0,0.0,0.0,3.9,0.119485
50%,2.115,0.265,1.0,24.5,120.0,0.0,0.0,5.685,0.239632
75%,3.4175,0.4925,2.0,35.75,132.0,0.0,0.0,7.5125,0.656129
max,12.4,1.0,7.0,75.0,300.0,1.0,1.0,12.3,0.988889


In [9]:
#scale each difficulty component to [0,1]
#column labels and row index are taken from diffFeatures itself, so the scaled table always
#matches the selected components and stays row-aligned with df
min_max_scaler=preprocessing.MinMaxScaler()
diffFeaturesScaled=pd.DataFrame(min_max_scaler.fit_transform(diffFeatures),
                                columns=diffFeatures.columns,
                                index=diffFeatures.index)

In [10]:
#summary statistics of the scaled components (each column should now span 0 to 1)
diffFeaturesScaled.describe()

Unnamed: 0,minutes,max_p_accidentals,sharpsOrFlats,max_partRange,tempo,compoundMeter,meterChanges,noteDensity,p_fastNotes
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,0.201187,0.337069,0.248768,0.301463,0.28687,0.051724,0.241379,0.446622,0.37173
std,0.201922,0.239773,0.241668,0.231211,0.172998,0.223404,0.431657,0.249782,0.325901
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.064344,0.175,0.142857,0.121212,0.153846,0.0,0.0,0.253996,0.120828
50%,0.156967,0.265,0.142857,0.234848,0.307692,0.0,0.0,0.412522,0.242325
75%,0.26373,0.4925,0.285714,0.405303,0.353846,0.0,0.0,0.574822,0.663501
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
#calculate continuous measure of difficulty as the row-wise sum of the 9 scaled variables
#(every column of diffFeaturesScaled contributes with equal weight)
diffSum=diffFeaturesScaled.sum(axis=1)

In [12]:
#summary statistics of the continuous difficulty measure
diffSum.describe()

count    58.000000
mean      2.486813
std       1.234957
min       0.610302
25%       1.671199
50%       2.474144
75%       2.958371
max       6.411344
dtype: float64

In [13]:
#quintile cut points (20th, 40th, 60th, 80th and 100th percentile) of the continuous difficulty measure
p20, p40, p60, p80, p100 = (diffSum.quantile(q) for q in (.2, .4, .6, .8, 1))
print(p20, p40, p60, p80, p100)

1.3740896993895266 2.1202099993952452 2.6538704018905217 3.214926145600467 6.411344163138507


In [14]:
#categorize into difficulty levels bins 1-5
#include_lowest=True closes the first bin on the left, so a score of exactly 0 is
#assigned to level 1 instead of becoming NaN
difficulty=pd.cut(diffSum,bins=[0,p20,p40,p60,p80,p100],labels=[1,2,3,4,5],include_lowest=True)

In [15]:
#check distribution of bins (should be approximately uniform, since the bin edges are quintiles)
difficulty.value_counts()

5    12
3    12
1    12
4    11
2    11
dtype: int64

In [16]:
#select subset of variables that will go into the JSON file
#.copy() makes df_subset an independent frame, so adding columns to it later
#does not raise SettingWithCopyWarning
df_subset=df[['composer', 'title', 'date', 'minutes',
                            'keyType', 
                            'tempo', 'timeSig',  'noteDensity', 
                            'nParts','strings', 'keys', 'woodwinds', 'percussion', 'brass', 'voice', 
                            'filename', 'midiFileName','spotifyID']].copy()

In [17]:
#append difficulty measure
#assign() returns a new frame with the extra column (aligned on the row index) instead of
#writing into a slice of df, which avoids SettingWithCopyWarning and keeps the cell re-runnable
df_subset=df_subset.assign(difficulty=difficulty)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [18]:
#display the subset that will be exported, including the new difficulty column
df_subset

Unnamed: 0,composer,title,date,minutes,keyType,tempo,timeSig,noteDensity,nParts,strings,keys,woodwinds,percussion,brass,voice,filename,midiFileName,spotifyID,difficulty
0,Adam de la Halle,Fi Maris de vostre Amour,,0.23,minor,80.0,"[3, 4]",3.76,3,0,0,0,0,0,1,adam_fimaris.krn,adam_fimaris.mid,0XKQIVQ97pPTJg2uDEivD4,1
1,"Alkan, Charles-Valentin","Fa, Op. 38, No. 2",,2.94,minor,72.0,"[3, 8]",11.78,3,0,1,0,0,0,0,alkan_02-fa_edited.krn,alkan_02-fa_edited.mid,3jS5RqvjbgnThgzu9iqDD8,4
2,"Bach, Johann Sebastian",Movement 1: Allemande,,1.32,minor,94.0,"[4, 4]",6.82,1,1,0,0,0,0,0,bach_partita2-1.krn,bach_partita2-1.mid,2shAgoNd95hE6pAjAxxor7,3
3,"Bach, Carl Philipp Emanuel",La Gause,,0.51,major,110.0,"[2, 4]",5.99,2,0,1,0,0,0,0,bachcpe_Wq117-37_edited.krn,bachcpe_Wq117-37_edited.mid,0LSMmwJL5Nym0L7Vxt2FeH,2
4,"Beethoven, Ludwig van",Piano Sonata no. 8 in C minor,,3.65,major,40.0,"[2, 4]",5.47,2,0,1,0,0,0,0,beethoven_sonata08-2.krn,beethoven_sonata08-2.mid,14YeWgIwIqyFnHuOkd7fyP,4
5,"Bononcini, Giovanni",For the love my heart doth prize,,3.9,major,80.0,"[3, 4]",4.7,3,0,1,0,0,0,1,bononcini_perlagloria.krn,bononcini_perlagloria.mid,2PhaOWw5VeChQCJcDBXpfD,2
6,"Brahms, Johannes","Waltz in E Major, Op.39 No.2",,0.46,major,172.0,"[3, 4]",6.65,3,0,1,0,0,0,0,brahms_op39-02_edited.krn,brahms_op39-02_edited.mid,5mwWvvJa6IubcMr4QzaFVu,4
7,"Buxtehude, Dietrich","Sonata in A Minor, Op. 1, No. 3",,1.17,minor,72.0,"[4, 4]",3.79,3,1,1,0,0,0,0,buxtehude_op1-3-1.krn,buxtehude_op1-3-1.mid,3xBWKAOczzfaLalXRzxntl,1
8,"Byrd, William",Ave verum corpus,,2.0,minor,120.0,"[4, 4]",5.04,4,0,0,0,0,0,1,byrd_aveverum.krn,byrd_aveverum.mid,7J6b58JOnf4RGGqJmxq0bD,1
9,"Chopin, Frederic",Scherzo in B-flat Minor,,7.81,major,300.0,"[3, 4]",10.09,2,0,1,0,0,0,0,chopin_scherzo2.krn,chopin_scherzo2.mid,6OwavBgVS1N4lAXZ2zKUfc,5


In [25]:
#export the subset as a JSON array with one object per piece (orient='records')
#NOTE(review): this cell's execution count is out of sequence with its neighbours; re-run the
#notebook top to bottom to confirm the exported file reflects the current df_subset
df_subset.to_json('docs/features.json',orient='records')

In [19]:
#confirm that the difficulty column attached to df_subset has the same distribution as the binned series
df_subset['difficulty'].value_counts()

5    12
3    12
1    12
4    11
2    11
Name: difficulty, dtype: int64