# "Filtered" in the .csv file names means that the file only contains mappings FOR SONGS THAT USERS HAVE ACTUALLY LISTENED TO. Note also that, with the 'w' flag of Python's built-in open(), a file that already exists at the specified path is truncated before being written to. Finally, I will try not to store data in data structures before dumping them into .csv files or, if I do, I will make sure that the data structures are only temporary (internal to a for loop scope, and deallocated afterwards)

In [None]:
import json # extract data from the monthly json lines files
import time # measuring time for .csv files for loop iterations
# import numpy
'''
Data serialization/de-serialization
Once I compute the extraction of some data of interest
out of a larger data-set, I wish not to repeat that operation
over and over each time I run this notebook.
I tested it and de-serialization is pretty fast
(4000000 3 keys dictionaries de-serialized in 4 seconds)
'''
import pickle # for each recording in Listenbrainz 2022 month
import csv # for reading -and creating a smaller "version" of- listenbrainz_msid_mapping.csv

In [None]:
# Mount Google Drive at /content/gdrive; every data path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Please input here the absolute path of the ListenBrainz 2022 data January file. The script will compute the rest of the months data file paths


In [None]:
# Absolute path of the January 2022 listens file (JSON lines); the other monthly paths are derived from it.
month_1_listenBrainz_data_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/listenbrainz-2022/1.listens'
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/listenbrainz-2022/1.listens'

# Collect the paths of the twelve monthly 2022 ListenBrainz files, January first.
listenBrainz_data_from_2022_files_paths = [month_1_listenBrainz_data_file_path]
# Dropping the last 9 characters ('1.listens') leaves the parent folder, trailing slash included.
listenBrainzData_parentFolder = listenBrainz_data_from_2022_files_paths[0][:-9]
# The remaining months sit next to January and are named '<month number>.listens'.
for month_n in range(1, 12):
  listenBrainz_data_from_2022_files_paths.append(f'{listenBrainzData_parentFolder}{month_n + 1}.listens')

{"user_id":17240,"user_name":"Winterbay","timestamp":1640995200,"track_metadata":{"artist_name":"Tiken Jah Fakoly","release_name":"Coup De Gueule","additional_info":{"artist_msid":"f03fa31b-a428-47e2-8dcb-bceec9ca1221","release_msid":"238bd863-a293-4718-a66b-093ea54bf8f3","listening_from":"lastfm","recording_msid":"f216e2fb-784d-470a-81b2-6e27cd532204","lastfm_artist_mbid":"edef3cfa-4e5e-4d64-8bd8-20f9dc1d8cad","lastfm_release_mbid":"9dc7fe6a-3fa4-4461-8975-ecb7218b39a3"},"track_name":"Alou Maye"},"recording_msid":"f216e2fb-784d-470a-81b2-6e27cd532204"}
{"user_id":16930,"user_name":"kazoo","timestamp":1640995200,"track_metadata":{"artist_name":"Fraktus","release_name":"Millennium Edition","additional_info":{"artist_msid":"afe0b08d-f47d-4adf-bd78-3e95ca276f7c","tracknumber":11,"release_msid":"9cd2285c-4860-4e83-b8bb-5fef7328b224","recording_msid":"7a490913-f4d5-40a8-b34a-02166fbc511e"},"track_name":"Computerliebe"},"recording_msid":"7a490913-f4d5-40a8-b34a-02166fbc511e"}
{"user_id":1513

Here I test the speed of 3 different approaches for converting a JSON line into a dictionary (keeping only the keys we need):

- Dictionary comprehension
- dict() constructor
- filter method

(https://www.geeksforgeeks.org/python-extract-specific-keys-from-dictionary/)

In [None]:
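# Run this cell only for test purposes (the benchmarks below are disabled inside a string literal, and `batch_size` must be defined before they can run)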
##### "MASKED" DICTIONARY COMPREHENSION

'''
user_info_and_messyBrainz_ids_dictList = []
user_info_and_messyBrainz_ids_dict = {}
line_counter = 0
start = time.monotonic()
with open(listenBrainz_data_from_2022_files_paths[0]) as json_lines_file:
  for json_line in json_lines_file:
    if line_counter >= batch_size:
      break
    user_info_and_messyBrainz_ids_dict = json.loads(json_line)
    user_info_and_messyBrainz_ids_dictList.append({key: user_info_and_messyBrainz_ids_dict[key] for key in user_info_and_messyBrainz_ids_dict.keys() & {'user_id', 'recording_msid'}})
    line_counter += 1
end = time.monotonic()

print(f'{len(user_info_and_messyBrainz_ids_dictList)} json lines processed in {end - start} seconds.')
# 100000 json lines processed in 1.4730764449996059 seconds.


##### DICTIONARY CONSTRUCTOR

user_info_and_messyBrainz_ids_dictList = []
user_info_and_messyBrainz_ids_dict = {}
line_counter = 0
start = time.monotonic()
with open(listenBrainz_data_from_2022_files_paths[0]) as json_lines_file:
  for json_line in json_lines_file:
    if line_counter >= batch_size:
      break
    user_info_and_messyBrainz_ids_dict = json.loads(json_line)
    user_info_and_messyBrainz_ids_dictList.append(dict((k, user_info_and_messyBrainz_ids_dict[k]) for k in ['user_id', 'recording_msid'] if k in user_info_and_messyBrainz_ids_dict))
    line_counter += 1
end = time.monotonic()

print(f'{len(user_info_and_messyBrainz_ids_dictList)} json lines processed in {end - start} seconds.')
# 100000 json lines processed in 2.5338072109998393 seconds.


##### FILTER METHOD

user_info_and_messyBrainz_ids_dictList = []
user_info_and_messyBrainz_ids_dict = {}
line_counter = 0
start = time.monotonic()
with open(listenBrainz_data_from_2022_files_paths[0]) as json_lines_file:
  for json_line in json_lines_file:
    if line_counter >= batch_size:
      break
    user_info_and_messyBrainz_ids_dictList.append(dict(filter(lambda item: item[0] in ['user_id', 'recording_msid'], json.loads(json_line).items())))
    line_counter += 1
end = time.monotonic()

print(f'{len(user_info_and_messyBrainz_ids_dictList)} json lines processed in {end - start} seconds.')
# 100000 json lines processed in 1.43607589200019 seconds.
'''

I realized that testing with a small batch size tends to give noisy results (they vary a lot between runs, yet the three approaches remain very close to each other). As the batch size increases, results are still non-deterministic but tend to stabilize (that is, the difference between the speeds becomes more noticeable).

Dictionary comprehension seems to be the fastest on average over 4 different tests (the same 3 tests repeated 4 times).

# user_id <-> MessyBrainzID for whole 2022 
Perform the actual retrieval of user id and MessyBrainz ids (with no batch and for the whole 2022 year)
Although this produces one bigger .csv file for the whole of 2022 rather than one file per month, I think it will be more useful later, when a "total" count of listens has to be computed

In [None]:
# Stream every 2022 listen exactly once: each (user_id, recording_msid) pair goes straight
# into one .csv file, and every distinct recording_msid is remembered in a set.
unique_MessyBrainzIDs = set()
total_time_elapsed = 0
new_UserID_to_MessyBrainzID_mappingFilePath = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv'
# "w" truncates a previous export; newline='' is what the csv module expects for files it writes to.
with open(new_UserID_to_MessyBrainzID_mappingFilePath, "w", newline='') as new_UserID_to_MessyBrainzID_mappingFile:
  fieldnames = ['user_id', 'recording_msid']
  csvWriter = csv.DictWriter(new_UserID_to_MessyBrainzID_mappingFile, fieldnames=fieldnames)
  csvWriter.writeheader()
  for month_n in range(12):
    start = int(time.monotonic())
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(listenBrainz_data_from_2022_files_paths[month_n], encoding='utf-8') as json_lines_file:
      for json_line in json_lines_file:
        user_info_and_messyBrainz_ids_dict = json.loads(json_line)
        # Keep only the two top-level keys of interest; the rest of the listen is dropped.
        csvWriter.writerow({key: user_info_and_messyBrainz_ids_dict[key] for key in fieldnames if key in user_info_and_messyBrainz_ids_dict})
        unique_MessyBrainzIDs.add(user_info_and_messyBrainz_ids_dict['recording_msid'])
    end = int(time.monotonic())
    total_time_elapsed += (end - start)
    print(f'Added user_id <-> recording_msid mappings to .csv file {new_UserID_to_MessyBrainzID_mappingFilePath} in {end - start} seconds for month n. {month_n+1}.')
print(f'Operation completed. It took {total_time_elapsed} seconds.')
print(f'There are {len(unique_MessyBrainzIDs)} unique MessyBrainzIDs to which user have been listening to in 2022.')

# pickle dump (the with-block guarantees the file is flushed and closed)
unique_MessyBrainzIDs_filtered_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_MessyBrainzIDs_filtered.p'
with open(unique_MessyBrainzIDs_filtered_file_path, "wb") as pickle_file:
  pickle.dump(unique_MessyBrainzIDs, pickle_file)
# FREE MEMORY UP (left disabled: the test cell that follows still reads the set)
# del unique_MessyBrainzIDs

# This code saves unique_MessyBrainzIDs as .csv file rather than .p (pickle) file
'''
new_Unique_MessyBrainzIDs_filteredFilePath = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Unique_MessyBrainzIDs_filtered.csv'
with open(new_Unique_MessyBrainzIDs_filteredFilePath, "w") as unique_MessyBrainzIDs_filteredFile:
  fieldnames = ['recording_msid']
  csvWriter = csv.DictWriter(unique_MessyBrainzIDs_filteredFile, fieldnames=fieldnames)
  csvWriter.writeheader()
  # csvWriter = csv.writer(unique_MessyBrainzIDs_filteredFile)
  list_unique_MessyBrainzIDs = list(unique_MessyBrainzIDs)
  start = int(time.monotonic())
  for ID in list_unique_MessyBrainzIDs:
    csvWriter.writerow({'recording_msid': ID})
  end = int(time.monotonic())
print(f'Created a .csv file with unique MessiBrainzIDs for which people listened to in 2022 at {new_Unique_MessyBrainzIDs_filteredFilePath}.')
'''

Added user_id <-> recording_msid mappings to .csv file /content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv in 88 seconds for month n. 1.
Added user_id <-> recording_msid mappings to .csv file /content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv in 86 seconds for month n. 2.
Added user_id <-> recording_msid mappings to .csv file /content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv in 78 seconds for month n. 3.
Added user_id <-> recording_msid mappings to .csv file /content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv in 87 seconds for 

'\nnew_Unique_MessyBrainzIDs_filteredFilePath = \'/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Unique_MessyBrainzIDs_filtered.csv\'\nwith open(new_Unique_MessyBrainzIDs_filteredFilePath, "w") as unique_MessyBrainzIDs_filteredFile:\n  fieldnames = [\'recording_msid\']\n  csvWriter = csv.DictWriter(unique_MessyBrainzIDs_filteredFile, fieldnames=fieldnames)\n  csvWriter.writeheader()\n  # csvWriter = csv.writer(unique_MessyBrainzIDs_filteredFile)\n  list_unique_MessyBrainzIDs = list(unique_MessyBrainzIDs)\n  start = int(time.monotonic())\n  for ID in list_unique_MessyBrainzIDs:\n    csvWriter.writerow({\'recording_msid\': ID})\n  end = int(time.monotonic())\nprint(f\'Created a .csv file with unique MessiBrainzIDs for which people listened to in 2022 at {new_Unique_MessyBrainzIDs_filteredFilePath}.\')\n'

In [None]:
# TEST unique_MessyBrainzIDs set
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv'
# Sanity check: the recording_msid of the first data row shown by `head` must be in the set (expected output: True).
print('f216e2fb-784d-470a-81b2-6e27cd532204' in unique_MessyBrainzIDs)

user_id,recording_msid
17240,f216e2fb-784d-470a-81b2-6e27cd532204
16930,7a490913-f4d5-40a8-b34a-02166fbc511e
15136,bc94c2a8-f3e3-4539-b2e9-a775a431cdd2
1626,30f3f6b6-3d76-48e9-b802-dcd5200958f3
15276,f0890cbf-2aa1-44dd-b3d9-ba385f2f4855
1139,51bcd76a-c452-425e-a804-cf59b78cd71e
249,1160f670-c889-4f09-9e34-e4e22d5717d8
1139,6a4fb4ca-52eb-4962-a9a6-0f3b22fb265c
11257,f603f6ef-e476-4dda-8ad5-54835b1c074e
True


In [None]:
# TEST
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/listenbrainz_msid_mapping.csv'

recording_msid,recording_mbid,match_type
00000737-3a59-4499-b30a-31fe2464555d,1fe669c9-5a2b-4dcb-9e95-77480d1e732e,exact_match
000013b3-dbb4-43a0-8fd4-ca92ff5ed033,c5bfd98d-ccde-4cf3-8abb-63fad1b6065a,exact_match
00002714-6f74-409d-9fa4-441c8dfb195f,c6acc112-3df7-4716-b5b6-953b5e93743f,exact_match
00003a81-2a6c-4d6c-ad43-990c0806458b,007770e2-90c5-49f2-b894-690db7ebea40,exact_match
00005660-7eb0-4592-a74b-14f3de9cc4cb,67bcde07-bfb1-4b30-88ba-6b995ec04123,exact_match
00006a3b-babd-4bb0-89b3-e2835aba6425,7d52ec8a-3d84-4e3d-8463-657d749ba4d4,exact_match
0000825d-b547-43a7-b294-948e8e472766,99687951-9b5d-4918-b08e-39921d736d68,high_quality
00008dc7-e09c-451e-85a0-7ce1db8fbe19,e44e1b73-9750-4712-a1c3-f57ae2a84f5a,med_quality
00009fc5-4f28-4020-b02c-966d6c5e4202,1dd0bc8f-f8c3-4658-b397-ec9ec87e618f,exact_match


# Filter listenbrainz_msid_mapping.csv (MessyBrainzID <-> MusicBrainzID) in a new .csv file

Extract from listenbrainz_msid_mapping.csv ONLY the mappings for which there has been a listening, and create a new .csv file out of those data.
The third column (match_type) of listenbrainz_msid_mapping.csv can always be discarded.

Also create a set of unique MusicBrainz recording IDs; it will be useful later when mapping each MusicBrainz recording to an Artist credit ID

In [None]:
# Reload the set of listened recording_msids, then copy from listenbrainz_msid_mapping.csv only
# the rows whose recording_msid is in that set, collecting the distinct recording_mbids on the way.
unique_MessyBrainzIDs_pickle_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_MessyBrainzIDs_filtered.p'
# The pickle was produced by this notebook; never unpickle files from an untrusted source.
with open(unique_MessyBrainzIDs_pickle_file_path, "rb") as pickle_file:
  unique_MessyBrainzIDs = pickle.load(pickle_file)
unique_MusicBrainzIDs = set()
num_matches = 0
# newline='' is what the csv module expects for the files it reads from and writes to.
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/listenbrainz_msid_mapping_filtered.csv', "w", newline='') as listenbrainz_msid_mapping_filtered_file:
  fieldnames = ['recording_msid', 'recording_mbid']
  csvWriter = csv.DictWriter(listenbrainz_msid_mapping_filtered_file, fieldnames=fieldnames)
  csvWriter.writeheader()
  with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/listenbrainz_msid_mapping.csv', newline='') as listenbrainz_msid_mapping_file:
    csvReader = csv.DictReader(listenbrainz_msid_mapping_file)
    start = int(time.monotonic())
    for csvLine in csvReader:
      recording_msid = csvLine['recording_msid']
      if recording_msid in unique_MessyBrainzIDs:
        # This recording_msid has actually been listened to: keep its mapping (match_type is dropped).
        recording_mbid = csvLine['recording_mbid']
        csvWriter.writerow({'recording_msid': recording_msid, 'recording_mbid': recording_mbid})
        unique_MusicBrainzIDs.add(recording_mbid)
        num_matches += 1
    end = int(time.monotonic()) # end of csvReader iteration
    print(f'Finished creating new filtered .csv file with only the MessyBrainzIDs <-> ListenBrainzIDs mappings for which there has been a listening')
    print(f'It took {end - start} seconds, and {num_matches} matches (listenings) were found.')
    print(f'A set of {len(unique_MusicBrainzIDs)} unique MusicBrainz recording IDs has been created, this is the real number of songs that have been listened to in 2022 by all users.')

# pickle dump (the with-block guarantees the file is flushed and closed)
unique_MusicBrainzIDs_filtered_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_MusicBrainzIDs_filtered.p'
with open(unique_MusicBrainzIDs_filtered_file_path, "wb") as pickle_file:
  pickle.dump(unique_MusicBrainzIDs, pickle_file)
# FREE MEMORY UP
del unique_MessyBrainzIDs

Finished creating new filtered .csv file with only the MessyBrainzIDs <-> ListenBrainzIDs mappings for which there has been a listening
It took 331 seconds, and 11291747 matches (listenings) were found.
A set of 3313731 unique MusicBrainz recording IDs has been created, this is the real number of songs that have been listened to in 2022 by all users.


In [None]:
# TEST unique_MusicBrainzIDs set
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/listenbrainz_msid_mapping_filtered.csv'
# Sanity check: the recording_mbid of the first data row shown by `head` must be in the set (expected output: True).
print('c5bfd98d-ccde-4cf3-8abb-63fad1b6065a' in unique_MusicBrainzIDs)

recording_msid,recording_mbid
000013b3-dbb4-43a0-8fd4-ca92ff5ed033,c5bfd98d-ccde-4cf3-8abb-63fad1b6065a
0000c7ce-5855-4259-823f-fd2e1e4615ce,22e1f70a-df0a-4de3-aa65-2694b7308b2b
00019a12-08fa-46dc-bf78-b1dbfb5b9ae2,a3d61638-2eb4-4833-8f07-47578247a480
00021542-d0f2-4331-bb9e-aba42f350d8e,86c0470c-f497-43b5-b790-7fe6e46e5e73
0002dd17-b9ec-497d-89be-837c25b6dc82,96f4933a-d82d-4219-ab38-b038ee9ce538
0005a74c-2c36-4951-a771-bc2b8ca903ba,9b94fe85-0b72-43e3-94d1-bcb9ea25212d
000631fd-9e97-4c5a-9d71-c9bd579c6c3b,9fd7ad7f-5f03-40eb-a4c7-126c4e3914ae
000658a9-ec8f-4ead-baeb-4580ed29799b,9fb06ce0-6220-49a3-be50-c609ff7ac143
000682e9-e235-4168-9bbf-cfd015c17116,83125987-7b99-4934-89b7-234eec16eb19
True


Create a dictionary out of MessyBrainzID <-> MusicBrainzID mappings.
MessyBrainzID is key and MusicBrainzID is value
This will make the creation of the user_id <-> MusicBrainzID mappings .csv file much faster.

In [None]:
# TEST file before opening it
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/listenbrainz_msid_mapping_filtered.csv'

recording_msid,recording_mbid
000013b3-dbb4-43a0-8fd4-ca92ff5ed033,c5bfd98d-ccde-4cf3-8abb-63fad1b6065a
0000c7ce-5855-4259-823f-fd2e1e4615ce,22e1f70a-df0a-4de3-aa65-2694b7308b2b
00019a12-08fa-46dc-bf78-b1dbfb5b9ae2,a3d61638-2eb4-4833-8f07-47578247a480
00021542-d0f2-4331-bb9e-aba42f350d8e,86c0470c-f497-43b5-b790-7fe6e46e5e73
0002dd17-b9ec-497d-89be-837c25b6dc82,96f4933a-d82d-4219-ab38-b038ee9ce538
0005a74c-2c36-4951-a771-bc2b8ca903ba,9b94fe85-0b72-43e3-94d1-bcb9ea25212d
000631fd-9e97-4c5a-9d71-c9bd579c6c3b,9fd7ad7f-5f03-40eb-a4c7-126c4e3914ae
000658a9-ec8f-4ead-baeb-4580ed29799b,9fb06ce0-6220-49a3-be50-c609ff7ac143
000682e9-e235-4168-9bbf-cfd015c17116,83125987-7b99-4934-89b7-234eec16eb19


In [None]:
# Load the filtered recording_msid -> recording_mbid mappings into a dictionary for fast lookups.
messyBrainzID_to_MusicBrainzID_mappingsDict = dict()
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/listenbrainz_msid_mapping_filtered.csv', newline='') as listenbrainz_msid_mapping_filtered_file:
  # DictReader consumes the header line itself; a plain csv.reader would store the header
  # as a bogus 'recording_msid' -> 'recording_mbid' entry in the dictionary.
  csvReader = csv.DictReader(listenbrainz_msid_mapping_filtered_file)
  for csvLine in csvReader:
    messyBrainzID_to_MusicBrainzID_mappingsDict[csvLine['recording_msid']] = csvLine['recording_mbid']
  print(f'Finished creating a dictionary of MessyBrainzID (keys) <-> MusicBrainzID (values) mappings')

# Test the dictionary is consistent with the data
print(messyBrainzID_to_MusicBrainzID_mappingsDict['000013b3-dbb4-43a0-8fd4-ca92ff5ed033'])

# pickle dump (the with-block guarantees the file is flushed and closed)
messyBrainzID_to_MusicBrainzID_mappingsDict_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Dictionary_of_MessyBrainzID_to_MusicBrainzID_mappings_filtered.p'
with open(messyBrainzID_to_MusicBrainzID_mappingsDict_file_path, "wb") as pickle_file:
  pickle.dump(messyBrainzID_to_MusicBrainzID_mappingsDict, pickle_file)
# FREE MEMORY UP
# del messyBrainzID_to_MusicBrainzID_mappingsDict

Finished creating a dictionary of MessyBrainzID (keys) <-> MusicBrainzID (values) mappings
c5bfd98d-ccde-4cf3-8abb-63fad1b6065a


# Create a user_id <-> MusicBrainzID mapping .csv file

In [None]:
# TEST file before opening it
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv'

user_id,recording_msid
17240,f216e2fb-784d-470a-81b2-6e27cd532204
16930,7a490913-f4d5-40a8-b34a-02166fbc511e
15136,bc94c2a8-f3e3-4539-b2e9-a775a431cdd2
1626,30f3f6b6-3d76-48e9-b802-dcd5200958f3
15276,f0890cbf-2aa1-44dd-b3d9-ba385f2f4855
1139,51bcd76a-c452-425e-a804-cf59b78cd71e
249,1160f670-c889-4f09-9e34-e4e22d5717d8
1139,6a4fb4ca-52eb-4962-a9a6-0f3b22fb265c
11257,f603f6ef-e476-4dda-8ad5-54835b1c074e


In [None]:
# Translate every (user_id, recording_msid) listen into (user_id, recording_mbid) using the
# pickled recording_msid -> recording_mbid dictionary.
messyBrainzID_to_MusicBrainzID_mappingsDict_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Dictionary_of_MessyBrainzID_to_MusicBrainzID_mappings_filtered.p'
# The pickle was produced by this notebook; never unpickle files from an untrusted source.
with open(messyBrainzID_to_MusicBrainzID_mappingsDict_file_path, "rb") as pickle_file:
  messyBrainzID_to_MusicBrainzID_mappingsDict = pickle.load(pickle_file)

num_unmapped = 0
# newline='' is what the csv module expects for the files it reads from and writes to.
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MusicBrainzID_mappings_filtered.csv', "w", newline='') as userID_to_MusicBrainzID_mappings_filtered_file:
  fieldnames = ['user_id', 'recording_mbid']
  csvWriter = csv.DictWriter(userID_to_MusicBrainzID_mappings_filtered_file, fieldnames=fieldnames)
  csvWriter.writeheader()
  with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MessyBrainzID_mappings_filtered.csv', newline='') as userID_to_MessyBrainzID_mappings_filtered_file:
    # DictReader consumes the header line, so no manual row counter is needed to skip it.
    csvReader = csv.DictReader(userID_to_MessyBrainzID_mappings_filtered_file)
    start = int(time.monotonic())
    for csvLine in csvReader:
      # use this .csv file's row's recording_msid to retrieve its correspondent recording_mbid from the dictionary
      recording_mbid = messyBrainzID_to_MusicBrainzID_mappingsDict.get(csvLine['recording_msid'])
      if recording_mbid is None:
        # No MusicBrainz mapping for this recording: skip the listen instead of
        # aborting a multi-minute run with a KeyError.
        num_unmapped += 1
        continue
      csvWriter.writerow({'user_id': csvLine['user_id'], 'recording_mbid': recording_mbid})
    end = int(time.monotonic()) # end of csvReader iteration
    print(f'Finished creating .csv file with user_id <-> MusicBrainzID (recording_mbid) mappings, it took {end - start} seconds.')
    print(f'{num_unmapped} listenings had no MusicBrainzID mapping and were skipped.')

# FREE MEMORY UP
del messyBrainzID_to_MusicBrainzID_mappingsDict

Finished creating .csv file with user_id <-> MusicBrainzID (recording_mbid) mappings, it took 233 seconds.


# Filter canonical_musicbrainz_data.csv (MusicBrainzID <-> ArtistID) in a new .csv file

Extract from canonical_musicbrainz_data.csv ONLY the mappings for which there has been a listening (using the set of unique MusicBrainzIDs from a .pickle file), and create a new .csv file out of those data.
The other columns of canonical_musicbrainz_data.csv can always be discarded.

Create also a dictionary data structure (which will be stored in a .pickle file) mapping MusicBrainzIDs <-> ArtistIDs. This will make the creation of user_id <-> Artist_IDs mappings faster.

In [None]:
# TEST check file before reading it
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/metabrainz-metadata-dump-20230117-172210/metabrainz/canonical_musicbrainz_data.csv'

id,artist_credit_id,artist_mbids,artist_credit_name,release_mbid,release_name,recording_mbid,recording_name,combined_lookup,score,year
28939355,1415161,{5e3071a8-8c56-4ab2-91f6-c76d35388dbd},Michie One,430bd180-0f13-4144-9ab6-ad50067303ee,Power of One,5f5f649f-1938-4a3e-a879-a95693a99a71,Heavenly Flow,michieoneheavenlyflow,371181,2006
28939356,1415161,{5e3071a8-8c56-4ab2-91f6-c76d35388dbd},Michie One,430bd180-0f13-4144-9ab6-ad50067303ee,Power of One,6ee381af-e9b4-46c4-a4f0-541dce2f03ea,Party,michieoneparty,371181,2006
28939357,1415161,{5e3071a8-8c56-4ab2-91f6-c76d35388dbd},Michie One,430bd180-0f13-4144-9ab6-ad50067303ee,Power of One,8b371ea0-dee1-4fbf-bacd-3aa87a4aef13,Free Like Jah,michieonefreelikejah,371181,2006
28939358,1415161,{5e3071a8-8c56-4ab2-91f6-c76d35388dbd},Michie One,430bd180-0f13-4144-9ab6-ad50067303ee,Power of One,9d904f0f-314b-4089-b151-d81e0857c431,People,michieonepeople,371181,2006
28939359,1415161,{5e3071a8-8c56-4ab2-91f6-c76d35388dbd},Michie One,430bd180-0f13-4144-

In [None]:
# Reload the set of listened recording_mbids, then copy from canonical_musicbrainz_data.csv only the
# (recording_mbid, artist_mbids) pairs of those recordings, into both a .csv file and a dictionary.
unique_MusicBrainzIDs_pickle_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_MusicBrainzIDs_filtered.p'
# The pickle was produced by this notebook; never unpickle files from an untrusted source.
with open(unique_MusicBrainzIDs_pickle_file_path, "rb") as pickle_file:
  unique_MusicBrainzIDs = pickle.load(pickle_file)
musicBrainzID_to_ArtistID_mappingsDict = dict()
num_matches = 0
# newline='' is what the csv module expects for the files it reads from and writes to.
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/canonical_musicbrainz_data_filtered.csv', "w", newline='') as canonical_musicbrainz_data_filtered_file:
  fieldnames = ['recording_mbid', 'artist_mbids']
  csvWriter = csv.DictWriter(canonical_musicbrainz_data_filtered_file, fieldnames=fieldnames)
  csvWriter.writeheader()
  # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
  with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/metabrainz-metadata-dump-20230117-172210/metabrainz/canonical_musicbrainz_data.csv', newline='', encoding='utf-8') as canonical_musicbrainz_data_file:
    csvReader = csv.DictReader(canonical_musicbrainz_data_file)
    start = int(time.monotonic())
    for csvLine in csvReader:
      recording_mbid = csvLine['recording_mbid']
      if recording_mbid in unique_MusicBrainzIDs:
        # This recording has actually been listened to: keep its artist_mbids value verbatim.
        artist_mbids = csvLine['artist_mbids']
        csvWriter.writerow({'recording_mbid': recording_mbid, 'artist_mbids': artist_mbids})
        # If the file holds several rows for one recording, the last one wins in the dictionary.
        musicBrainzID_to_ArtistID_mappingsDict[recording_mbid] = artist_mbids
        num_matches += 1
    end = int(time.monotonic()) # end of csvReader iteration
    print(f'Finished creating new filtered .csv file with only the MusicBrainzIDs <-> ArtistIDs mappings for which there has been a listening')
    print(f'It took {end - start} seconds, and {num_matches} matches (listenings) were found.')
    print(f'A dictionary of MusicBrainzIDs <-> ArtistID pickle file will be created.')

# pickle dump (the with-block guarantees the file is flushed and closed)
musicBrainzID_to_ArtistID_mappingsDict_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Dictionary_Of_MusicBrainzID_to_ArtistID_mappings_filtered.p'
with open(musicBrainzID_to_ArtistID_mappingsDict_file_path, "wb") as pickle_file:
  pickle.dump(musicBrainzID_to_ArtistID_mappingsDict, pickle_file)
# FREE MEMORY UP
del unique_MusicBrainzIDs

NameError: ignored

# Create a user_id <-> ArtistID mapping .csv file

Get the UserID_to_MusicBrainzID_mappings_filtered.csv file and create another one like UserID_to_ArtistID_mappings_filtered.csv by using the musicBrainzID_to_ArtistID_mappingsDict dictionary.

Along the way, also create a set of unique ArtistIDs (the ones that have actually been listened to, which are the ones we care about)

In [None]:
# TEST file content before opening it
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MusicBrainzID_mappings_filtered.csv'

user_id,recording_mbid
user_id,recording_mbid
17240,89346a2d-a595-4afe-a4c8-722fc6f93c61
16930,755014b7-f235-4172-991d-f2eaf8450e9d
15136,ae4f8491-0828-4857-936a-ccc1e87f6573
1626,8a7e3912-9df2-49ba-9ed0-1a4fb5dd9ba4
15276,6c48fb39-691d-4959-b1e9-a830df2f1090
1139,1fc2055b-2493-44a8-9ee2-c5cf17b00b17
249,edda2b10-c034-42ed-be7f-d9ccf4254fe1
1139,4670a763-cf40-4921-bd7d-3d92616d76dd


In [None]:
# Translate every (user_id, recording_mbid) listen into (user_id, artist_mbids) using the pickled
# recording_mbid -> artist_mbids dictionary, collecting the distinct artist_mbids values on the way.
musicBrainzID_to_ArtistID_mappingsDict_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/Dictionary_Of_MusicBrainzID_to_ArtistID_mappings_filtered.p'
# The pickle was produced by this notebook; never unpickle files from an untrusted source.
with open(musicBrainzID_to_ArtistID_mappingsDict_file_path, "rb") as pickle_file:
  musicBrainzID_to_ArtistID_mappingsDict = pickle.load(pickle_file)
unique_ArtistIDs = set()
num_rows = 0     # listen rows read from the input .csv file
num_matches = 0  # listen rows whose recording has an artist_mbids entry

# newline='' is what the csv module expects for the files it reads from and writes to.
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_ArtistID_mappings_filtered.csv', "w", newline='') as userID_to_ArtistID_mappings_filtered_file:
  fieldnames = ['user_id', 'artist_mbids']
  csvWriter = csv.DictWriter(userID_to_ArtistID_mappings_filtered_file, fieldnames=fieldnames)
  csvWriter.writeheader()
  with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_MusicBrainzID_mappings_filtered.csv', newline='') as userID_to_MusicBrainzID_mappings_filtered_file:
    csvReader = csv.DictReader(userID_to_MusicBrainzID_mappings_filtered_file)
    start = int(time.monotonic())
    for csvLine in csvReader:
      num_rows += 1
      # The lookup can miss: the dictionary only holds recordings found in
      # canonical_musicbrainz_data.csv, and a stray repeated header row in the input
      # has no entry either. Such rows are skipped.
      artist_mbids = musicBrainzID_to_ArtistID_mappingsDict.get(csvLine['recording_mbid'])
      if artist_mbids is None:
        continue
      csvWriter.writerow({'user_id': csvLine['user_id'], 'artist_mbids': artist_mbids})
      unique_ArtistIDs.add(artist_mbids)
      num_matches += 1
    end = int(time.monotonic()) # end of csvReader iteration
    # Matches are counted per listen row, so they are compared with the number of rows read;
    # the dictionary size counts distinct recordings and is reported separately.
    print(f'Finished creating .csv file with user_id <-> ArtistID mappings, it took {end - start} seconds for {num_matches} matches out of {num_rows} listenings ({len(musicBrainzID_to_ArtistID_mappingsDict)} distinct recordings have an ArtistID).')
    print(f'A set of {len(unique_ArtistIDs)} unique ArtisIDs has been created and will be stored in a .p file.')


# pickle dump (the with-block guarantees the file is flushed and closed)
set_of_unique_ArtistIDs_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_ArtistIDs_filtered.p'
with open(set_of_unique_ArtistIDs_file_path, "wb") as pickle_file:
  pickle.dump(unique_ArtistIDs, pickle_file)

# FREE MEMORY UP
del musicBrainzID_to_ArtistID_mappingsDict

Finished creating .csv file with user_id <-> ArtistID mappings, it took 304 seconds for 37462867 matches out of 3227855 supposed matches.
A set of 456934 unique ArtisIDs has been created and will be stored in a .p file.


# !!!!!!!!!!!!!!!!!!!!!!!
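The cell above contains a weird bug: it reports many more matches than "supposed matches". The two numbers count different things (matches are counted once per listening row of the .csv file, while the "supposed matches" are the distinct recordings stored in the dictionary), so the first number can legitimately be much larger than the second.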
# !!!!!!!!!!!!!!!!!!!!!!!

# Create a dictionary out of musicbrainz_artist_mbid_name.csv, taking only the rows of interest (we can do this using the set of unique ArtistIDs unique_ArtistIDs just created)

In [None]:
# Build an ArtistID -> artist name dictionary restricted to the artists that have been listened to.
set_of_unique_ArtistIDs_file_path = '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/SetOfUnique_ArtistIDs_filtered.p'
# The pickle was produced by this notebook; never unpickle files from an untrusted source.
with open(set_of_unique_ArtistIDs_file_path, "rb") as pickle_file:
  unique_ArtistIDs = pickle.load(pickle_file)

# The set holds raw artist_mbids values such as '{5e3071a8-...}', whereas the 'mbid' column of
# musicbrainz_artist_mbid_name.csv holds bare ids, so the braces must go before comparing.
# NOTE(review): credits with several artists are presumably comma-separated inside the
# braces - confirm against canonical_musicbrainz_data.csv.
listened_artist_mbids = set()
for artist_mbids in unique_ArtistIDs:
  for artist_mbid in artist_mbids.strip('{}').split(','):
    artist_mbid = artist_mbid.strip()
    if artist_mbid:
      listened_artist_mbids.add(artist_mbid)

artistID_to_ArtistName_mapping_dict = dict()

# Artist names are not ASCII-only, so decode explicitly as UTF-8; newline='' is what the csv module expects.
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/musicbrainz_artist_mbid_name.csv', newline='', encoding='utf-8') as musicbrainz_artist_mbid_name_file:
      csvReader = csv.DictReader(musicbrainz_artist_mbid_name_file)
      start = int(time.monotonic())
      for csvLine in csvReader:
        if csvLine['mbid'] in listened_artist_mbids:
          artistID_to_ArtistName_mapping_dict[csvLine['mbid']] = csvLine['name']
      end = int(time.monotonic()) # end of csvReader iteration
      # One summary line instead of one print per artist (the per-row print also used a
      # list as a dictionary key, which raises TypeError).
      print(f'Finished creating a dictionary of ArtistID (keys) <-> ArtistName (values) mappings: {len(artistID_to_ArtistName_mapping_dict)} names found for {len(listened_artist_mbids)} ArtistIDs, it took {end - start} seconds.')

# Create a user_id <-> ArtistName mapping .csv file

Get the UserID_to_ArtistID_mappings_filtered.csv file and create another one like UserID_to_ArtistName_mappings_filtered.csv by using musicbrainz_artist_mbid_name.csv.


In [None]:
# TEST
!head '/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/musicbrainz_artist_mbid_name.csv'

mbid,name
fadeb38c-833f-40bc-9d8c-a6383b38b1be,Доктор Сатана
49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso
c112a400-af49-4665-8bba-741531d962a1,Zachary
ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,The Silhouettes
7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt
60aca66f-e91a-4cb5-9308-b6e293cd833e,Fonograff
3e1bd546-d2a7-49cb-b38d-d70904a1d719,Al Street
df120895-f6c6-4a66-b9cf-73350f0beb61,Love .45
c14f8d3f-ee81-416f-800f-8eff7e77a2e1,Sintellect


In [None]:
# Translate every (user_id, artist_mbids) listen into (user_id, artist_name) rows.
# This cell is self-contained: it reads the ArtistID -> name table itself rather than
# relying on dictionaries left in memory by other cells.

# Step 1: load musicbrainz_artist_mbid_name.csv into an ArtistID -> name lookup
# (one entry per row of that file). Artist names are not ASCII-only, hence the explicit UTF-8.
artist_name_by_mbid = dict()
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/ListenBrainzData/musicbrainz_artist_mbid_name.csv', newline='', encoding='utf-8') as musicbrainz_artist_mbid_name_file:
  musicbrainz_artist_mbid_name_csvReader = csv.DictReader(musicbrainz_artist_mbid_name_file)
  for csvLine in musicbrainz_artist_mbid_name_csvReader:
    artist_name_by_mbid[csvLine['mbid']] = csvLine['name']

# Step 2: stream the user_id <-> ArtistID file and write one row per credited artist.
num_rows = 0     # listen rows read
num_matches = 0  # (user_id, artist_name) rows written
with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_ArtistName_mappings_filtered.csv', "w", newline='', encoding='utf-8') as  userID_to_ArtistName_mappings_filtered_file:
  fieldnames = ['user_id', 'artist_name']
  csvWriter = csv.DictWriter(userID_to_ArtistName_mappings_filtered_file, fieldnames=fieldnames)
  csvWriter.writeheader()
  with open('/content/gdrive/MyDrive/Colab Notebooks/Audio and Music Processing Lab/Audio and Music processing lab - assignment/Task 1/UserID_to_ArtistID_mappings_filtered.csv', newline='') as userID_to_ArtistID_mappings_filtered_file:
    userID_to_ArtistID_mappings_csvReader = csv.DictReader(userID_to_ArtistID_mappings_filtered_file)
    start = int(time.monotonic())
    for csvLine in userID_to_ArtistID_mappings_csvReader:
      num_rows += 1
      # artist_mbids looks like '{<mbid>}'.
      # NOTE(review): several artists are presumably comma-separated inside the braces - confirm.
      for artist_mbid in csvLine['artist_mbids'].strip('{}').split(','):
        artist_name = artist_name_by_mbid.get(artist_mbid.strip())
        if artist_name is None:
          # ArtistID missing from musicbrainz_artist_mbid_name.csv: nothing to write.
          continue
        csvWriter.writerow({'user_id': csvLine['user_id'], 'artist_name': artist_name})
        num_matches += 1
    end = int(time.monotonic()) # end of csvReader iteration
    print(f'Finished creating .csv file with user_id <-> ArtistName mappings, it took {end - start} seconds; {num_matches} rows were written for {num_rows} listenings.')

# FREE MEMORY UP
del artist_name_by_mbid