## Accessing Saraga database

This notebook demonstrates the use of the Dunya API for downloading the Saraga dataset files, which include manual annotation files accompanying recordings released under a Creative Commons licence (in mp3 format).

Saraga dataset is also available for direct download on Zenodo: https://doi.org/10.5281/zenodo.1256126

Saraga dataset is composed of two collections:
- [Hindustani collection](https://musicbrainz.org/collection/6adc54c6-6605-4e57-8230-b85f1de5be2b)
- [Carnatic collection](https://musicbrainz.org/collection/a163c8f2-b75f-4655-86be-1504ea2944c2) 

The notebook creates two subfolders and saves all data in these folders. Each annotation is saved in a separate text file. 

To be able to download sounds from Dunya, you need a user account and an API authentication key (token). Please create a user: http://dunya.compmusic.upf.edu/social/register/ 
To get your API token, log in to Dunya and go to your profile page, where the token is shown. 

For example visualisations of the annotations of this data, please refer to the 'visualizeAnnotations' notebook  

Authors: Sankalp Gulati, Baris Bozkurt

In [1]:
import codecs
import json, os, sys
import numpy as np
import compmusic
from compmusic import dunya as dn
from compmusic.dunya import hindustani as hi
from compmusic.dunya import carnatic as ca
from compmusic.dunya import docserver as ds
from compmusic import musicbrainz
import codecs
import pickle
import csv
import time
import datetime

In [2]:
#Setting Dunya token and types of the annotations
# The token is read from the DUNYA_TOKEN environment variable when it is set,
# so a personal token does not have to be stored inside the notebook.
# Otherwise replace the placeholder string below with your own token.
token = os.environ.get('DUNYA_TOKEN', '...yourTokenGoesHere...')
dn.set_token(token)#setting the token
#Features list
# One dictionary per document type requested from Dunya. 'type', 'subtype' and
# 'version' are the values passed to ds.file_for_document when downloading, and
# 'type' is also used in the name of the saved annotation file.
# 'extension' is kept for reference; the download code in this notebook does not read it.
features_dunya_all = [{'type': 'pitch', 'subtype': 'pitch', 'extension': '.pitch', 'version': 'noguessunv'},
                         {'type': 'ctonic', 'subtype': 'tonic', 'extension': '.tonic', 'version': '0.3'},
                         {'type': 'sama-manual', 'subtype': None, 'extension': '.sama', 'version': None},
                         {'type': 'sections-manual', 'subtype': None, 'extension': '.sections', 'version': None},
                         {'type': 'tempo-manual', 'subtype': None, 'extension': '.tempo', 'version': None},
                         {'type': 'pitch-vocal', 'subtype': None, 'extension': '.mpitch', 'version': None},
                         {'type': 'mphrases-manual', 'subtype': None, 'extension': '.mphrases', 'version': None},
                         {'type': 'sections-manual-p', 'subtype': None, 'extension': '.sections_p', 'version': None},
                         {'type': 'bpm-manual', 'subtype': None, 'extension': '.bpm', 'version': None}
                         ]

### Functions for accessing files, computing statistics and writing/saving files

In [3]:
def getStatsDunyaCorpus(collection):
    """
    Compute and save the statistics of one Dunya collection/corpus/set.

    The recordings of the collection are listed through the Dunya API, their
    MusicBrainz ids are handed to 'getDatasetStats' (which writes a pickle
    file) and 'generatePretyReport' then writes a text version of that file.

    Args:
        collection (dict): dictionary with the keys 'music_tradition'
            ('hindustani' or 'carnatic') and 'collectionId' (the value passed
            to the tradition module's set_collections)

    Outputs:
        Writes 'stats_<music_tradition>_cc.pkl' and
        'stats_<music_tradition>_cc.txt' relative to the working directory

    Raises:
        ValueError: if collection['music_tradition'] is not a supported tradition
    """
    music_tradition = collection['music_tradition']

    if music_tradition == 'hindustani':
        tradition = hi
    elif music_tradition == 'carnatic':
        tradition = ca
    else:
        # Only printing a message here would let the code continue and fail
        # later on the undefined 'tradition'; stop with an explicit error.
        raise ValueError("Please specify a valid music tradition ('hindustani' or 'carnatic'), got %r" % (music_tradition,))

    tradition.set_collections(collection['collectionId'])
    recs = tradition.get_recordings()
    mbids = [r['mbid'] for r in recs]

    output_file = 'stats_' + music_tradition + '_cc.pkl'
    output_file_pretty = 'stats_' + music_tradition + '_cc.txt'

    getDatasetStats(mbids, output_file, music_tradition)
    generatePretyReport(output_file, output_file_pretty)
    
def getDatasetStats(mbids, output_file, music_tradition = ''):
    """
    This function obtains a set of statistics/numbers on the dataset.
    Basically number of unique artists, duration, releases, recordings, raga, tala etc

    For every recording the metadata is fetched from Dunya. Integer-valued
    fields (the 'length') are collected as they are; for list-valued fields one
    identifying field (mbid, uuid or name) of every list item is collected.

    If fetching a recording fails, a message is printed and the remaining
    recordings are skipped; the statistics then cover only the recordings
    fetched up to that point.

    Args:
        mbids (list): list of mbids
        output_file (str): file path for saving statistics
        music_tradition (str): name of the tradition ('hindustani' or 'carnatic')

    Outputs:
        Saves statistics to a pickle file: a dictionary with one entry per
        entity. 'length' maps to {'total_length', 'total_recs'}; every other
        entity maps to {'total_unique', 'unique_elems', 'total_rels', 'total_recs'}

    Raises:
        ValueError: if music_tradition is not 'hindustani' or 'carnatic'
    """
    # 'entities' lists the keys of the recording metadata to summarise (in
    # report order); 'object_to_fetch' gives, for list-valued keys, the field
    # collected from every item of the list.
    if music_tradition == 'hindustani':
        tradition = hi
        # in hindustani album level items are referred by 'release'
        entities = ['release', 'works', 'raags', 'taals', 'forms', 'layas', 'artists', 'album_artists', 'length']
        object_to_fetch = {'release': 'mbid', 'works': 'mbid', 'raags': 'uuid', 'taals': 'uuid', 'forms': 'name', 'layas': 'uuid', 'album_artists': 'mbid', 'artists': 'mbid'}
    elif music_tradition == 'carnatic':
        tradition = ca
        # in carnatic album level items are referred by 'concert'
        entities = ['concert', 'work', 'raaga', 'taala', 'form', 'artists', 'album_artists', 'length']
        object_to_fetch = {'concert': 'mbid', 'work': 'mbid', 'raaga': 'uuid', 'taala': 'uuid', 'form': 'name', 'album_artists': 'mbid', 'artists': 'mbid'}
    else:
        # Fail before anything is written instead of saving an empty statistics file.
        raise ValueError("Please specify a valid music tradition ('hindustani' or 'carnatic'), got %r" % (music_tradition,))

    stats = {e: [] for e in entities}

    for mbid in mbids:
        try:
            rec_info = tradition.get_recording(mbid)
        except Exception:
            # 'Exception' (not a bare except) so that Ctrl-C still interrupts the run.
            print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
            print("Failed to fetch info for file %s"%mbid)
            print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
            break
        for e in entities:
            if e not in rec_info:
                continue
            value = rec_info[e]
            #special case for parsing artist field: the artist dictionary is nested under 'artist'
            if e == 'artists':
                value = [a['artist'] for a in value]
            if isinstance(value, int):
                stats[e].append(value)
            elif isinstance(value, list):
                temp = [item[object_to_fetch[e]] for item in value]
                if len(temp)>0:
                    stats[e].append(temp)

    output_stats = {}
    for e in entities:
        if e == 'length':
            output_stats[e] = {'total_length': np.sum(stats[e]), 'total_recs': len(stats[e])}
        else:
            # Flatten the per-recording lists once (linear time) and reuse the result.
            flat = [item for per_recording in stats[e] for item in per_recording]
            unique_elems = np.unique(flat)
            output_stats[e] = {'total_unique': len(unique_elems), 'unique_elems': unique_elems.tolist(), 'total_rels': len(flat), 'total_recs': len(stats[e])}

    # The context manager guarantees the pickle is flushed and the handle closed.
    with open(output_file, 'wb') as fid:
        pickle.dump(output_stats, fid)
    
def generatePretyReport(stats_file, out_file):
    '''Creating text version of the statistics file

    For every entity of the statistics dictionary a header line is written,
    followed by one "<name><TAB><value>" line per statistic and a blank line.
    The list of unique elements is reported by its size, and the total length
    is divided by 1000*3600 (i.e. reported in hours, assuming the stored
    lengths are in milliseconds). All other values are written as integers.

    Args:
        stats_file (str): file path for pickle file containing statistics
        out_file (str): file path for saving statistics in text format
    Outputs:
        Saves statistics to a text file
    '''

    # NOTE: pickle.load can execute arbitrary code; only open statistics files
    # written by 'getDatasetStats' or coming from another trusted source.
    with open(stats_file, 'rb') as stats_fid:
        data = pickle.load(stats_fid)

    with open(out_file, 'w') as fid:
        for key1 in data.keys():
            fid.write('------------ %s ------------\n'%str(key1))
            for key2 in data[key1].keys():
                value = data[key1][key2]
                if key1 == 'length' and key2 == 'total_length':
                    # Only the duration is converted; the recording count stored
                    # next to it must stay a plain count.
                    fid.write('%s\t%f\n'%(str(key2), float(value)/(1000.0*3600.0)))
                elif key2 == 'unique_elems':
                    fid.write('%s\t%d\n'%(str(key2), len(value)))
                else:
                    fid.write('%s\t%d\n'%(str(key2), value))
            fid.write('\n')

def saveSections(content, output_file):
    """
    This function saves the content(section annotations) into a file in a structured manner
    Annotations are already stored nicely but due to differences in the delimiters of Hindustani and Carnatic
    we needed this function

    The delimiter of the incoming text is detected automatically, every
    non-empty line is split on it and the fields are written tab-separated,
    one row per line.

    Args:
        content (str): data read from dunya api
        output_file (str): file path for output file
    Outputs:
        Saves the section annotations to a tab-separated text file
    Raises:
        csv.Error: if the delimiter of 'content' cannot be determined
    """

    #detecting delimiter automatically
    delimiter = csv.Sniffer().sniff(content).delimiter
    rows = [line.split(delimiter) for line in content.splitlines() if line != '']

    # csv.writer expects an open file object (not a path), and the rows are
    # written with writerows. newline='' leaves line endings to the csv module.
    with open(output_file, 'w', newline='', encoding='utf-8') as fid:
        writer = csv.writer(fid, delimiter='\t', lineterminator='\n')
        writer.writerows(rows)
    
def downloadHindustaniMp3(recordingid):
    '''Function to download mp3 file from the Hindustani collection and also save its metadata
    Alternative to:
    https://github.com/MTG/pycompmusic/blob/e2ce93ee0c00c1cf242f93c202d5d8a8675cbb74/compmusic/dunya/hindustani.py#L298
    which fails for one specific file due to file title containing the character '/'

    The file is named "<release artists> - <title>.mp3" (with '/' replaced by
    '-') and is written to the 'hindustani/' folder, which is created if it
    does not exist yet. The recording metadata is stored next to it in a file
    with the same name and the extension '.json'.

    Args:
        recordingid (str): musicbrainz id of a recording
    Outputs:
        Saves audio(mp3), metadata(json) and returns the name of the file (str)

    '''
    recording = dn.hindustani.get_recording(recordingid)
    # the first release listed for the recording provides the artist names
    release = dn.hindustani.get_release(recording["release"][0]["mbid"])
    title = recording["title"]
    artists = " and ".join([a["name"] for a in release["release_artists"]])
    contents = dn.docserver.get_mp3(recordingid)
    name = "%s - %s.mp3" % (artists, title)
    name = name.replace('/','-')#resolving an exception due to inclusion of '/' in titles

    out_dir = 'hindustani/'
    os.makedirs(out_dir, exist_ok=True)  # do not rely on the caller having created it
    path = os.path.join(out_dir, name)
    with open(path, "wb") as mp3_file:
        mp3_file.write(contents)

    #also save metadata in a json file
    # Swap only the final extension; str.replace would also rewrite a '.mp3'
    # occurring inside the title.
    json_file = os.path.splitext(path)[0] + '.json'
    with open(json_file, 'w') as outfile:
        json.dump(recording, outfile)

    return name

def downloadAllFiles4Collection(collection,features,numFiles=2):
    '''Download files (audio, metadata and annotations) of a collection

    At most 'numFiles' recordings are processed, taken from the start of the
    recording list returned by Dunya. For each of them the mp3 and a json file
    with the recording metadata are saved in the folder '<music_tradition>/',
    followed by one '<mp3 name>.<feature type>.txt' file per available feature.
    Pitch data is saved as tab-separated numbers, every other feature as text.

    Args:
        collection (dict): dictionary containing the name ('music_tradition')
            and id ('collectionId') of the collection
        features (list of dicts): feature types; the keys 'type', 'subtype'
            and 'version' of each dictionary are used
        numFiles (int): maximum number of files to download
    Outputs:
        Saves mp3 and annotation files of the collection
    Returns:
        dict: {music_tradition: {feature type: number of recordings for which
        fetching that feature failed}}
    Raises:
        ValueError: if collection['music_tradition'] is not a supported tradition
    '''
    music_tradition = collection['music_tradition']

    # Validate first, so an unknown tradition neither creates a folder nor
    # fails later on an undefined variable.
    if music_tradition == 'hindustani':
        tradition = hi
    elif music_tradition == 'carnatic':
        tradition = ca
    else:
        raise ValueError("Please specify a valid music tradition ('hindustani' or 'carnatic'), got %r" % (music_tradition,))

    #Setting path and creating the directory if not exists
    dataDir = music_tradition+'/'
    os.makedirs(dataDir, exist_ok=True)

    tradition.set_collections(collection['collectionId'])
    recs = tradition.get_recordings()
    numFiles = min(numFiles,len(recs))
    print('Number of files in collection ',music_tradition,' :',len(recs))
    print('...will download ',numFiles,' files')

    #Creating data structure for keeping list of missing files
    missingCounts = {feature['type']: 0 for feature in features}
    missingData = {music_tradition: missingCounts}

    #Downloading data
    for fileInd in range(numFiles):
        mbid = recs[fileInd]['mbid']
        print(fileInd,'/',numFiles,' : ',mbid)

        if music_tradition == 'hindustani':
            #dn.hindustani.download_mp3 fails for a file whose title contains '/'
            mp3FileURI = downloadHindustaniMp3(mbid)
            baseName = mp3FileURI[:-len('.mp3')] if mp3FileURI.endswith('.mp3') else mp3FileURI
        else:
            # presumably download_mp3 returns the name of the saved file - confirm against pycompmusic
            mp3FileURI = dn.carnatic.download_mp3(mbid,dataDir)
            baseName = mp3FileURI[:-len('.mp3')] if mp3FileURI.endswith('.mp3') else mp3FileURI
            #also save metadata in a json file (in the same folder as the audio)
            with open(os.path.join(dataDir, baseName+'.json'), 'w') as outfile:
                json.dump(dn.carnatic.get_recording(mbid), outfile)

        print(mp3FileURI)

        for feature in features:
            try:
                content = ds.file_for_document(mbid, feature['type'], feature['subtype'], version=feature['version'])
            except Exception:
                # Best effort: a feature that cannot be fetched is counted as missing.
                missingCounts[feature['type']] += 1
                continue

            out_file = dataDir+baseName+'.'+feature['type']+'.txt'
            if feature['type'] == 'pitch':
                # pitch is delivered as JSON and stored as a numeric table
                pitch = np.array(json.loads(content.decode()))
                np.savetxt(out_file, pitch, fmt='%.7f', delimiter='\t')
            else:
                # The bytes are decoded as UTF-8, so write them back as UTF-8
                # regardless of the platform's default encoding.
                with open(out_file, 'w', encoding='utf-8') as fid2:
                    fid2.write(content.decode())

    print('Collection download finished.')
    print('----------------------------------------------------------')
    return missingData

### Setting collections to be downloaded
Collections are specified with a name and a MusicBrainz ID. [All CompMusic collections are listed here](https://musicbrainz.org/user/compmusic/collections)

The ID is the last part of the MusicBrainz link of a collection. For example, the link of the Carnatic collection is
https://musicbrainz.org/collection/a163c8f2-b75f-4655-86be-1504ea2944c2 and its ID is a163c8f2-b75f-4655-86be-1504ea2944c2

In [4]:
# The two Saraga collections: each one is described by the name of its music
# tradition and a list holding the MusicBrainz id of the collection.
collection1 = {
    'music_tradition': 'hindustani',
    'collectionId': ['6adc54c6-6605-4e57-8230-b85f1de5be2b'],
}
collection2 = {
    'music_tradition': 'carnatic',
    'collectionId': ['a163c8f2-b75f-4655-86be-1504ea2944c2'],
}
# processing order: Hindustani first, then Carnatic
collections = [collection1, collection2]


In [5]:
#Calling functions to produce statistics and download data
# For every collection: save its statistics, download NUM_FILES recordings with
# their annotations, and keep the per-feature counts of missing annotations.
missingDatas=[]
NUM_FILES=5#set to 200 if you like to download all data (CAUTION: 8Gb)
features = features_dunya_all  # same feature list for every collection
for collection in collections:
    ts = time.time()
    print('New collection process, time:',datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'))
    #Getting statistics of data
    print('Collecting statistics of the collection: ',collection['music_tradition'])
    getStatsDunyaCorpus(collection)
    print('Statistics collected and saved in files')

    #Downloading the corpus
    print('Downloading files ... ')
    missingData=downloadAllFiles4Collection(collection,features,NUM_FILES)
    missingDatas.append(missingData)

# The context manager makes sure the pickle is flushed and the file closed.
with open('missingData.pkl', 'wb') as missing_fid:
    pickle.dump(missingDatas, missing_fid)
print('Missing data list stored in missingData.pkl')

ts = time.time()
print('Finished! ',datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'))

New collection process, time: 2018-05-02 14:49:40
Collecting statistics of the collection:  hindustani
Statistics collected and saved in files
Downloading files ... 
Number of files in collection  hindustani  : 108
...will download  108  files
0 / 108  :  450a6fcc-3c0a-483d-a31b-dde91413dcdd
Ajoy Chakrabarty - Shrutinandan Concept - an Introduction by Pandit Ajoy Chakrabarty.mp3
1 / 108  :  b71c2774-2532-4692-8761-5452e2a83118
Ajoy Chakrabarty - Bairagi.mp3
2 / 108  :  3124479b-5118-4cf3-823f-8fefad45e586
Ajoy Chakrabarty - Bilaskhani Todi.mp3
3 / 108  :  6a2c841d-5a0e-4886-a5c0-f856fccbb938
Ajoy Chakrabarty - Nat Bhairon.mp3
4 / 108  :  51656b20-295c-40f9-8dab-005b9b90fa98
Ajoy Chakrabarty - Aahir Bhairon.mp3
5 / 108  :  204008ee-89fe-44a4-a0c7-7bd7d64cf8a4
Ajoy Chakrabarty - Todi.mp3
6 / 108  :  b3849502-db24-4042-9d61-299de0211423
Ajoy Chakrabarty - Lyrics of all the Bandishes.mp3
7 / 108  :  2e5c159d-3dea-4a47-b871-76536866d643
Ajoy Chakrabarty - Taal and Sur of Thekas.mp3
8 / 108 

Kaustuv Kanti Ganguli - Raag Bilaskhani Todi.mp3
90 / 108  :  047abeeb-88f5-4395-8542-f7448668a281
Kaustuv Kanti Ganguli - Raag Lalit.mp3
91 / 108  :  001932a8-85a2-4f3f-9497-5e322c85abcd
Kaustuv Kanti Ganguli - Raag Miyan Malhar.mp3
92 / 108  :  dd875fec-377f-4565-98ca-cc9cf6397995
Kaustuv Kanti Ganguli - Raag Marwa.mp3
93 / 108  :  82f80a73-a455-408a-a1bd-241b4b4f2388
Kaustuv Kanti Ganguli - Raag Yaman.mp3
94 / 108  :  89224046-f086-4530-b2a3-70f9ae697faf
Kaustuv Kanti Ganguli - Raag Madhukauns.mp3
95 / 108  :  a01ac873-dff4-420f-9fd8-490dd87c58e9
Ajoy Chakrabarty - Raag Abhogi.mp3
96 / 108  :  9a2ce15a-4b28-4d10-abc7-9e05310c2e3b
Ajoy Chakrabarty - Raag Rageshri.mp3
97 / 108  :  c556b631-111f-42e8-aa98-e02f0777be9e
Ajoy Chakrabarty - Raag Khamaj.mp3
98 / 108  :  43957847-9bdd-4f81-a540-3db26a0ad652
Sameehan Kashalkar - Raag Poorva.mp3
99 / 108  :  aea2a29d-9ca0-4cd5-b3e9-9dcacd95165c
Sameehan Kashalkar - Raag Gaud Malhar.mp3
100 / 108  :  c5692011-a1e4-469b-a978-15a516c7b1c4
Ulhas K

68 / 197  :  a0f50392-60df-4833-ae06-f7c81929fc8f
Rithvik Raja - Madhava Mamava.mp3
69 / 197  :  e3c44bc6-3c57-4d23-b710-66cd73c36b85
Rithvik Raja - Sapasyat Kausalya.mp3
70 / 197  :  ca4ee2d1-401b-45d9-aa0c-bd8912b7292b
Rithvik Raja - Thillana Senchurutti.mp3
71 / 197  :  3caa7147-315d-46f3-96da-b2df7f94a496
Rithvik Raja - Ramachandraya Mangalam.mp3
72 / 197  :  2e87ed14-4a88-48b9-8c07-9856a5bab19d
Vasundara Rajagopal - Sri Jalandara.mp3
73 / 197  :  6e99cfc6-b123-4311-87d4-41c7e31c5009
Vasundara Rajagopal - Chandrashekharam Sada Bhajeham.mp3
74 / 197  :  6feaf61d-1ac3-4f35-a73f-7f4ea6647056
Vasundara Rajagopal - Ninnuvina Marigalada.mp3
75 / 197  :  1c3cec4e-1c46-42a2-b83d-db8d6f39ed97
Vasundara Rajagopal - Pranamamyaham Sri Prananatham.mp3
76 / 197  :  c956d1f5-190f-484d-9693-99139b3cd032
Vasundara Rajagopal - Amba Nilambari.mp3
77 / 197  :  0bc86ae5-dcfc-42ec-916e-bd9a1288f31d
Vasundara Rajagopal - Munnu Ravana.mp3
78 / 197  :  4c0158a2-679e-4958-a079-f4b95283d049
Vasundara Rajagop

158 / 197  :  32806112-2104-4df2-972e-a527be20e82a
Sanjay Subrahmanyan - Kamakshi.mp3
159 / 197  :  e7776320-8766-4568-9b08-05369bf2ad64
Sanjay Subrahmanyan - Entara Nitana.mp3
160 / 197  :  84bc41f0-7301-4206-86c4-88f2aa59bc53
Sanjay Subrahmanyan - Soundararajam.mp3
161 / 197  :  8b7dddb1-0dec-434d-b461-153b8af03be9
Sanjay Subrahmanyan - Ozhukkam Uyirinum.mp3
162 / 197  :  ca0ab807-3c90-4fde-8612-260729350a69
Sanjay Subrahmanyan - Raagam Thaanam Pallavi.mp3
163 / 197  :  cc475479-1358-41de-a76a-5f792b89a02d
Sanjay Subrahmanyan - Maname Ramanai Paada.mp3
164 / 197  :  46b47787-9c78-4e7a-b608-66fdac079866
Sanjay Subrahmanyan - Teerthakarayinile.mp3
165 / 197  :  1e6e54c5-1a73-4498-a296-0e6dd70ccc2f
Sanjay Subrahmanyan - Payum Oli Nee Yenakku.mp3
166 / 197  :  501f50aa-891c-4ab5-9dca-1fac271a33ae
Sanjay Subrahmanyan - Thillana Dhanashri.mp3
167 / 197  :  2bb36f85-b493-4b9b-aeb0-5848a9163cce
Sanjay Subrahmanyan - Pavamana Suthudu.mp3
168 / 197  :  373d0719-1251-40be-983d-5a12dec2d522
Ashw