90% of Da-TACOS cliques found in Discogs-VI

91% of SHS100K cliques found in Discogs-VI

In [1]:
import os
import sys
import csv
import json
from collections import defaultdict

import numpy as np

# The project root is taken to be two directory levels above the current working
# directory. It is appended to sys.path so the project-local import below resolves.
PROJECT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(PROJECT_DIR)

# Project-local text-cleaning and artist-collection helpers. This import must stay
# after the sys.path tweak above.
from utilities.utils import soft_clean_text, hard_clean_text, clean_parentheses, collect_writer_artists, collect_performance_artists

# Data locations derived from the project root
DATA_DIR = os.path.join(PROJECT_DIR, "data")
METADATA_DIR = os.path.join(DATA_DIR, "discogs_metadata")

In [2]:
# NOTE: My goal was to load these existing methods and variables but relative imports etc. were difficult to handle
# from discogs_vi.parse_releases_to_tracks import remove_disogs_pattern
# from discogs_vi.variables import EXCLUDE_ARTISTS
# from discogs_vi_yt.query_yt.utils_query import get_youtube_id

import re
def remove_disogs_pattern(artist):
    """Return the artist name without Discogs numbering, e.g. 'Oguz (3)' -> 'Oguz'.

    Every occurrence of a whitespace character followed by digits in
    parentheses is removed, wherever it appears in the name.
    """
    numbering_pattern = r"\s\(\d+?\)"
    cleaned_name = re.sub(numbering_pattern, "", artist)
    return cleaned_name

# Artist IDs, stored as strings, that are treated as non-informative.
# NOTE(review): judging by the names, these are Discogs placeholder artists
# ("Various", "Unknown", "No Artist", artist without a page) - confirm the values
# against discogs_vi.variables.
VARIOUS = "194"
UNKNOWN = "355"
NO_ARTIST = "118760"
ARTIST_WITHOUT_PAGE = '0'

# Kept as a set so it can be intersected with a track's writer IDs
EXCLUDE_ARTISTS = {VARIOUS, UNKNOWN, NO_ARTIST, ARTIST_WITHOUT_PAGE}

def get_youtube_id(url):
    """Extract the video ID from a YouTube watch URL.

    Args:
        url: A URL containing '/watch?v=<id>', optionally followed by further
            query parameters ('&...') or a fragment ('#...').

    Returns:
        The video ID only, without any trailing parameters or fragment.

    Raises:
        IndexError: If the URL does not contain '/watch?v='.
    """
    video_part = url.split("/watch?v=")[1]
    # '&' and '#' never occur inside a video ID, so anything from the first one
    # onwards is an extra parameter (e.g. '&t=10s') and must not leak into the ID.
    for separator in ("&", "#"):
        video_part = video_part.split(separator, 1)[0]
    return video_part

In [3]:
# Date tag of the Discogs dump; it is used to build the dump directory and file names
dump_date = "20240701"

# Selects which dataset file the matching cells read:
# True -> Discogs-VI, False -> Discogs-VI-YT
use_discogs_vi = True # If false use discogs_vi_yt

In [4]:
# Name and directory of the Discogs dump selected by dump_date
dump_name = f"discogs_{dump_date}"

dump_dir = os.path.join(METADATA_DIR, dump_name)
print(os.path.abspath(dump_dir))

# Artist records, read line by line as JSON
artists_json = os.path.join(dump_dir, f"{dump_name}_artists.xml.json.clean")

# The two dataset variants, stored as JSON lines (one clique per line)
discogs_vi_json = os.path.join(dump_dir, f"Discogs-VI-{dump_date}.jsonl")
discogs_vi_yt_json = os.path.join(dump_dir, f"Discogs-VI-YT-{dump_date}.jsonl")

# Metadata of the external benchmark datasets
datacos_benchmark_json_path = os.path.join(DATA_DIR, "other_datasets", 'da-tacos_benchmark_subset_metadata.json')
shs100k_csv_path = os.path.join(DATA_DIR, "other_datasets", 'SHS100K2', 'meta', 'list')
shs100k_train_csv_path = os.path.join(DATA_DIR, "other_datasets", 'SHS100K2', 'meta', 'SHS100K-TRAIN')
shs100k_test_csv_path = os.path.join(DATA_DIR, "other_datasets", 'SHS100K2', 'meta', 'SHS100K-TEST')
shs100k_val_csv_path = os.path.join(DATA_DIR, "other_datasets", 'SHS100K2', 'meta', 'SHS100K-VAL')

# Output location for the SHS100K2-TEST cliques reformatted for testing
shs100k_test_cliques_json_path = os.path.join(DATA_DIR, "other_datasets", 'SHS100K2-TEST-cliques.json')

# Pick the dataset file to analyze and the matching output file for the lost clique IDs
if use_discogs_vi:
    discogs_json = discogs_vi_json
    output_txt = os.path.join(dump_dir, f'Discogs-VI-{dump_date}-DaTACOS-SHS100K2_TEST-lost_cliques.txt')
else:
    discogs_json = discogs_vi_yt_json
    output_txt = os.path.join(dump_dir, f'Discogs-VI-YT-{dump_date}-DaTACOS-SHS100K2_TEST-lost_cliques.txt')

/home/oaraz/discotube-dataset/data/discogs_metadata/discogs_20240701


## Discogs-VI-YT

Here we gather detailed track-artist and writer-artist information from the Discogs metadata and store it in a clique-related dict. The file that is read is `discogs_json`, which is selected by `use_discogs_vi` above (Discogs-VI when True, Discogs-VI-YT otherwise).

In [5]:
# Load the artists data
# Build a lookup from artist ID to artist record
artists_dict = {}
with open(artists_json, encoding='utf-8') as infile:
    for jsonline in infile:
        artist = json.loads(jsonline)
        raw_name = artist['name']
        if raw_name is not None:
            # Drop the Discogs numbering before cleaning the name
            artist['name'] = soft_clean_text(remove_disogs_pattern(raw_name))
        # The ID becomes the dictionary key, so it is removed from the stored record
        artist_id = artist.pop("id")
        artists_dict[artist_id] = artist

In [6]:
# Index the cliques by their cleaned title:
#   discogs_cliques[track_title_cleaned] -> list of
#       {'work_writer': set of names, 'clique_id': ..., 'versions': [(title, performers)]}
#   discogs_clique_sizes[clique_id] -> number of versions in the clique
discogs_cliques = defaultdict(list)
discogs_clique_sizes = dict()
with open(discogs_json, encoding='utf-8') as in_f:
    for jline in in_f:
        clique = json.loads(jline)

        # Record the clique sizes so that later cells do not have to re-read the file
        discogs_clique_sizes[clique['clique_id']] = len(clique['versions'])

        # For Da-TACOS we do not use the versions but for SHS100K we have to
        versions = []

        # Collect all the writers of the work, since the writers might not be
        # exactly the same between the versions of a clique
        work_writers = set()

        # Title key of the clique, taken from the last track seen in this clique.
        # It is reset for every clique so a clique without tracks can never reuse
        # the title of the previous one.
        clique_title = None

        for version in clique['versions']:

            # Collect all the performers of the version, since the performers
            # might not be exactly the same between the tracks of a version
            performer_artists = set()

            # Reset per version so a version whose tracks are all skipped cannot
            # pick up the title of a previous version
            version_title = None

            for track in version['tracks']:
                # All the members of the clique share the same track_title_cleaned
                clique_title = track['track_title_cleaned']

                # Get all the writer artists of the track
                writer_ids = collect_writer_artists(track, artists_dict)
                # Skip the track (not the whole version) if it has any of the
                # excluded artists among its writers
                if writer_ids & EXCLUDE_ARTISTS:
                    continue
                # Get the names of the writer artists
                work_writers.update(
                    {artists_dict[artist_id]['name'] for artist_id in writer_ids if artist_id in artists_dict}
                    )

                # Get the title of the track. This can change between the tracks
                # of a version but we do not use this information
                version_title = soft_clean_text(track['track_title'])

                # Get all the performer artists of the track
                t_artist_ids = collect_performance_artists(track, artists_dict)
                # Get the names of the artists
                performer_artists.update(
                    {artists_dict[artist_id]['name'] for artist_id in t_artist_ids if artist_id in artists_dict}
                    )

            # A version in this case is the title of the track and the performer
            # artists. A version with no usable track has no performers and can
            # never match, so it is left out.
            if version_title is not None:
                versions.append((version_title, performer_artists))

        # A clique without any track has no title to be indexed under
        if clique_title is None:
            continue

        discogs_cliques[clique_title].append({
            'work_writer': work_writers,
            'clique_id': clique['clique_id'],
            'versions': versions
            })

## Da-TACOS Benchmark

Here we analyze the Da-TACOS benchmark subset and compare it with Discogs-VI.

In [7]:
# Load the datacos benchmark json file
# Read the Da-TACOS benchmark metadata. The encoding is given explicitly, like
# for the other readers in this notebook, instead of relying on the platform default.
with open(datacos_benchmark_json_path, encoding='utf-8') as f:
    benchmark_json = json.load(f)

In [8]:
# Format the metadata and collect the datacos info
# Format the metadata and collect the Da-TACOS info as
# (cleaned titles, cleaned work artist, 'noise' or 'clique', work_id) tuples
datacos_info = []
for work_id, performances_dict in benchmark_json.items():
    # A work without performances has neither titles nor a work artist. Skip it
    # so the artist of the previous work is never reused by accident.
    if not performances_dict:
        continue

    # Differentiate between the noise works (single performance) and the cliques
    _type = 'noise' if len(performances_dict) == 1 else 'clique'

    titles = set()
    for performance in performances_dict.values():
        # Add both the work title and the performance title
        titles.add(hard_clean_text(clean_parentheses(performance['work_title'])))
        titles.add(hard_clean_text(clean_parentheses(performance['perf_title'])))

    # Each work artist has a single name but the performance can have multiple
    # artists and names, so it is read once, from the last performance of the work
    work_artist = soft_clean_text(performance['work_artist'])
    datacos_info.append((titles, work_artist, _type, work_id))

In [9]:
datacos_matched_cliques_ids, datacos_matched_noise_cliques_ids = set(), set()
datacos_matched_work_ids, datacos_matched_noise_work_ids = set(), set()

# Iterate over the Da-TACOS works and find the matching Discogs-VI cliques
for datacos_work_titles, datacos_work_writer, _type, work_id in datacos_info:
    # Decide up front whether a hit counts as a noise work or a music work
    if _type == 'noise':
        hit_clique_ids = datacos_matched_noise_cliques_ids
        hit_work_ids = datacos_matched_noise_work_ids
    else:
        hit_clique_ids = datacos_matched_cliques_ids
        hit_work_ids = datacos_matched_work_ids

    # Compare each work title with the Discogs-VI cliques
    for datacos_work_title in datacos_work_titles:
        # There can be multiple cliques in Discogs with the same title
        for discogs_clique in discogs_cliques.get(datacos_work_title, []):
            # Only a clique that lists the work writer counts as a match
            if datacos_work_writer not in discogs_clique['work_writer']:
                continue
            hit_clique_ids.add(discogs_clique['clique_id'])
            hit_work_ids.add(work_id)

print(f'{len(datacos_matched_work_ids):>5,} Datacos works are matched to Discogs-VI-YT cliques.')
print(f'{len(datacos_matched_noise_work_ids):>5,} Datacos noise works are matched to Discogs-VI-YT cliques.')

print()
print(f'{len(datacos_matched_cliques_ids):>5,} Discogs-VI-YT cliques are matched with Datacos works.')
print(f'{len(datacos_matched_noise_cliques_ids):>5,} Discogs-VI-YT cliques are matched with Datacos noise works.')

# Union of the cliques matched by either kind of work
all_datacos_matched_clique_ids = datacos_matched_cliques_ids | datacos_matched_noise_cliques_ids
print(f'{len(all_datacos_matched_clique_ids):>5,} Discogs-VI-YT cliques are matched with Datacos works and noise works.')

  935 Datacos works are matched to DiscogsVI-YT cliques.
1,412 Datacos noise works are matched to DiscogsVI-YT cliques.

1,231 DiscogsVI-YT cliques are matched with Datacos works.
1,472 DiscogsVI-YT cliques are matched with Datacos noise works.
2,703 DiscogsVI-YT cliques are matched with Datacos works and noise works.


In [10]:
# Number of versions in the cliques matched to the multi-performance works
datacos_lost_versions = sum(
    discogs_clique_sizes[matched_id] for matched_id in datacos_matched_cliques_ids
)
print(f"{datacos_lost_versions:>5,} versions are lost to the real works of Da-TACOS benchmark subset.")

# Number of versions in the cliques matched to the noise works
datacos_noise_lost_versions = sum(
    discogs_clique_sizes[matched_id] for matched_id in datacos_matched_noise_cliques_ids
)
print(f"{datacos_noise_lost_versions:>5,} versions are lost to the noise works of Da-TACOS benchmark subset.")

195,743 versions are lost to the real works of Da-TACOS benchmark subset.
30,190 versions are lost to the noise works of Da-TACOS benchmark subset.


## SHS100K-TEST

Here we find the intersection between SHS100K-TEST and Discogs-VI-YT.

In [5]:
# Read the SHS100K metadata list; it is tab separated. newline='' is what the csv
# module asks for, and the encoding is explicit so that artist and title text does
# not depend on the platform default.
with open(shs100k_csv_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    shs100k_data = list(reader)

# Group the version IDs (column 1) by clique ID (column 0)
shs100K_dict = dict()
for row in shs100k_data:
    shs100K_dict.setdefault(row[0], []).append(row[1])

# Mean, maximum and median clique size over the whole SHS100K list
SHS100K_clique_sizes = [len(v) for v in shs100K_dict.values()]
print(sum(SHS100K_clique_sizes)/len(SHS100K_clique_sizes))
print(max(SHS100K_clique_sizes))
print(np.median(SHS100K_clique_sizes))

# Rows of the test split file; columns 0 and 1 are used as clique ID and version ID
with open(shs100k_test_csv_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    shs100k_test_indices = list(reader)
print(f'{len({int(i[0]) for i in shs100k_test_indices}):>6,} SHS100K2-TEST cliques are used.')
print(f'{len(shs100k_test_indices):>6,} SHS100K2-TEST versions are used.')

12.432204295330697
387
8.0
 1,692 SHS100K2-TEST cliques are used.
10,547 SHS100K2-TEST versions are used.


In [12]:
# Separate the test data.
# The metadata rows are indexed by (clique_id, version_id) once, so each test
# index is resolved with a dictionary lookup instead of a scan over the whole
# list. setdefault keeps the first row for a key, as a first-match scan would.
shs100k_rows_by_key = {}
for row_j in shs100k_data:
    if len(row_j) >= 2:
        shs100k_rows_by_key.setdefault((row_j[0], row_j[1]), row_j)

shs100k_test_data = []
for row_i in shs100k_test_indices:
    matched_row = shs100k_rows_by_key.get((row_i[0], row_i[1]))
    if matched_row is not None:
        shs100k_test_data.append(matched_row)
# Every test index must have been found in the metadata list
assert len(shs100k_test_data) == len(shs100k_test_indices)

# Create a clique data structure for the test data:
# clique_id -> list of {'version_id', 'youtube_id'}
shs_test_cliques = dict()
for clique_id, version_id, _, _, url, _ in shs100k_test_data:
    dct = {'version_id': version_id, 'youtube_id': get_youtube_id(url)}
    shs_test_cliques.setdefault(clique_id, []).append(dct)

# Format the test data: clique_id -> [set of cleaned titles, set of cleaned artists]
shs100k_test_dict = {}
for row in shs100k_test_data:
    clique_id = row[0]
    title = hard_clean_text(clean_parentheses(row[2]))
    artist = soft_clean_text(row[3])
    if clique_id in shs100k_test_dict:
        shs100k_test_dict[clique_id][0].add(title)
        shs100k_test_dict[clique_id][1].add(artist)
    else:
        shs100k_test_dict[clique_id] = [{title}, {artist}]

In [13]:
# Match the SHS100K2 test data with the Discogs-VI cliques.
# shs_matched_clique_ids collects the Discogs clique IDs that were hit, and
# discogs_matched_shs_clique_ids collects the SHS clique IDs that found a match.
shs_matched_clique_ids, discogs_matched_shs_clique_ids = set(), set()
for shs_clique_id, (shs_titles, shs_artists) in shs100k_test_dict.items():
    for shs_title in shs_titles:
        for discogs_clique in discogs_cliques.get(shs_title, []):
            # A match needs at least one SHS artist among the performers of at
            # least one Discogs version
            shares_an_artist = any(
                shs_artist in discogs_artists
                for shs_artist in shs_artists
                for _, discogs_artists in discogs_clique['versions']
            )
            if shares_an_artist:
                shs_matched_clique_ids.add(discogs_clique['clique_id'])
                discogs_matched_shs_clique_ids.add(shs_clique_id)
print(f'{len(discogs_matched_shs_clique_ids):<5,} SHS100K2-TEST cliques are matched with Discogs-VI-YT cliques')
print(f'{len(shs_matched_clique_ids):<5,} Discogs-VI-YT cliques are matched with SHS100K2-TEST cliques')

1,555 SHS100K2-TEST cliques are matched with DiscogsVI-YT cliques
1,996 DiscogsVI-YT cliques are matched with SHS100K2-TEST cliques


In [14]:
# Number of versions in the Discogs cliques matched to SHS100K2-TEST
shs100k_lost_versions = sum(
    discogs_clique_sizes[matched_id] for matched_id in shs_matched_clique_ids
)
print(f"{shs100k_lost_versions:>5,} versions in Discogs-VI-YT are lost to the SHS100K2-TEST..")

50,258 versions in DiscogsVI-YT are lost to the SHS100K2-TEST..


## Merge Da-TACOS and SHS100K

In [15]:
# Union of the Discogs cliques matched by SHS100K2-TEST and by Da-TACOS
all_lost_clique_ids = set(shs_matched_clique_ids) | set(all_datacos_matched_clique_ids)
print(f"{len(all_lost_clique_ids):>6,} cliques are lost to the SHS100K2-TEST set and Da-TACOS benchmark subset in total.")

all_lost_versions = sum(discogs_clique_sizes[clique_id] for clique_id in all_lost_clique_ids)
print(f"{all_lost_versions:>6,} versions in Discogs-VI-YT are lost to the SHS100K2-TEST and Da-TACOS benchmark sets in total.")

print(f'Writing the lost clique IDs to {output_txt}')
# One clique ID per line. The IDs are sorted so that re-running the notebook
# produces an identical file; set iteration order is arbitrary.
with open(output_txt, 'w', encoding='utf-8') as out_f:
    out_f.writelines(clique_id+'\n' for clique_id in sorted(all_lost_clique_ids))

 4,557 cliques are lost to the SHS100K2-TEST set and Da-TACOS benchmark subset in total.
264,566 versions in DiscogsVI-YT are lost to the SHS100K2-TEST and Da-TACOS benchmark sets in total.
Writing the lost clique IDs to /home/oaraz/discotube-dataset/data/discogs_metadata/discogs_20240701/DiscogsVI-20240701-DaTACOS-SHS100K2_TEST-lost_cliques.txt


In [16]:
# Sizes of the lost cliques. The sizes were already recorded in
# discogs_clique_sizes while the dataset was loaded, so the JSONL file does not
# need to be read a second time.
all_lost_versions_list = [
    discogs_clique_sizes[clique_id] for clique_id in all_lost_clique_ids
]
print()
print(np.mean(all_lost_versions_list))
print(np.median(all_lost_versions_list))
print(np.max(all_lost_versions_list))
print(np.min(all_lost_versions_list))


58.05705508009655
16.0
1837
2


## Format for testing

In [6]:
# Separate the test data.
# The metadata rows are indexed by (clique_id, version_id) once, so each test
# index is resolved with a dictionary lookup instead of a scan over the whole
# list. setdefault keeps the first row for a key, as a first-match scan would.
shs100k_rows_by_key = {}
for row_j in shs100k_data:
    if len(row_j) >= 2:
        shs100k_rows_by_key.setdefault((row_j[0], row_j[1]), row_j)

shs100k_test_data = []
for row_i in shs100k_test_indices:
    matched_row = shs100k_rows_by_key.get((row_i[0], row_i[1]))
    if matched_row is not None:
        shs100k_test_data.append(matched_row)
# Every test index must have been found in the metadata list
assert len(shs100k_test_data) == len(shs100k_test_indices)

In [7]:
# Group the test versions by clique:
# clique_id -> list of {'version_id', 'youtube_id'} in file order
shs_test_cliques = dict()
for clique_id, version_id, _, _, url, _ in shs100k_test_data:
    version_entry = {'version_id': version_id, 'youtube_id': get_youtube_id(url)}
    shs_test_cliques.setdefault(clique_id, []).append(version_entry)

In [8]:
shs_test_cliques

{'8110': [{'version_id': '0', 'youtube_id': 'pesIGuV9DDk'},
  {'version_id': '1', 'youtube_id': '6vZyiOsOAOY'},
  {'version_id': '2', 'youtube_id': 'cZPdr5MY6Gw'},
  {'version_id': '3', 'youtube_id': 'VCjeTJu-tfY'},
  {'version_id': '4', 'youtube_id': 'AcDmo4yTDHE'}],
 '8112': [{'version_id': '0', 'youtube_id': 'eq7jlJyz8-8'},
  {'version_id': '1', 'youtube_id': 'lMCKULtrRRA'},
  {'version_id': '2', 'youtube_id': 'xADupK4xL-Y'},
  {'version_id': '3', 'youtube_id': 'R8A8WgMcVqg'},
  {'version_id': '4', 'youtube_id': 'S4St3Jk1u9M'}],
 '8113': [{'version_id': '0', 'youtube_id': '40bTOCv3_ak'},
  {'version_id': '1', 'youtube_id': 'T_V0S-7Modw'},
  {'version_id': '2', 'youtube_id': '1mlY7djK-hU'},
  {'version_id': '3', 'youtube_id': 'WEnC3fyDVWw'},
  {'version_id': '4', 'youtube_id': 'ksBGApo0UvI'},
  {'version_id': '5', 'youtube_id': '-blXxtQQvWs'},
  {'version_id': '6', 'youtube_id': '2Vh3noCDx5E'},
  {'version_id': '7', 'youtube_id': 'Um8nXe-ZRCE'},
  {'version_id': '8', 'youtube_id': '1

In [11]:
# Serialize the test cliques and store them as a single JSON document
serialized_cliques = json.dumps(shs_test_cliques)
with open(shs100k_test_cliques_json_path, 'w') as out_f:
    out_f.write(serialized_cliques)

## Analyze

This part is just for inspection

### Discogs-VI-YT

In [14]:
# Two views of the artists present in Discogs-VI-YT:
#   *_single: first main artist and first featured artist of each version's first track
#   *_all:    every main and featured artist of every track
discogs_vi_yt_artist_ids_single = set()
discogs_vi_yt_artist_ids_all = set()

with open(discogs_vi_yt_json, encoding='utf-8') as in_f:
    for jline in in_f:
        clique = json.loads(jline)
        for version in clique['versions']:
            tracks = version['tracks']

            # "Single" view: only the first track of the version is inspected
            first_track = tracks[0]
            main_ids = first_track["track_artist_ids"]
            if len(main_ids) > 0:
                lead_id = main_ids[0]
            else:
                # Fall back to the release artists when the track lists none
                lead_id = first_track['release_artist_ids'][0]
            discogs_vi_yt_artist_ids_single.add(int(lead_id))
            first_feat_ids = first_track.get("track_feat_ids", [])
            if len(first_feat_ids) > 0:
                discogs_vi_yt_artist_ids_single.add(int(first_feat_ids[0]))

            # "All" view: every track contributes all of its artists
            for track in tracks:
                main_ids = track["track_artist_ids"]
                if len(main_ids) == 0:
                    main_ids = track['release_artist_ids']
                discogs_vi_yt_artist_ids_all |= {int(_id) for _id in main_ids}
                discogs_vi_yt_artist_ids_all |= {int(_id) for _id in track.get("track_feat_ids", [])}

print(len(discogs_vi_yt_artist_ids_single))
print(len(discogs_vi_yt_artist_ids_all))

67345
74928


In [15]:
# Two views of the artists present in Discogs-VI:
#   *_single: first main artist and first featured artist of each version's first track
#   *_all:    every main and featured artist of every track
discogs_vi_artist_ids_single = set()
discogs_vi_artist_ids_all = set()

with open(discogs_vi_json, encoding='utf-8') as in_f:
    for jline in in_f:
        clique = json.loads(jline)
        for version in clique['versions']:
            tracks = version['tracks']

            # "Single" view: only the first track of the version is inspected
            first_track = tracks[0]
            main_ids = first_track["track_artist_ids"]
            if len(main_ids) > 0:
                lead_id = main_ids[0]
            else:
                # Fall back to the release artists when the track lists none
                lead_id = first_track['release_artist_ids'][0]
            discogs_vi_artist_ids_single.add(int(lead_id))
            first_feat_ids = first_track.get("track_feat_ids", [])
            if len(first_feat_ids) > 0:
                discogs_vi_artist_ids_single.add(int(first_feat_ids[0]))

            # "All" view: every track contributes all of its artists
            for track in tracks:
                main_ids = track["track_artist_ids"]
                if len(main_ids) == 0:
                    main_ids = track['release_artist_ids']
                discogs_vi_artist_ids_all |= {int(_id) for _id in main_ids}
                discogs_vi_artist_ids_all |= {int(_id) for _id in track.get("track_feat_ids", [])}
print(len(discogs_vi_artist_ids_single))
print(len(discogs_vi_artist_ids_all))

239949
271152


### Da-TACOS

In [16]:
# Reload the Da-TACOS benchmark metadata for inspection. The encoding is given
# explicitly, like for the other readers in this notebook.
with open(datacos_benchmark_json_path, encoding='utf-8') as f:
    benchmark_json = json.load(f)

# The top-level keys of the benchmark are the work IDs
works = list(benchmark_json.keys())
print(f'Found {len(works)} works in benchmark subset')

Found 3000 works in benchmark subset


In [19]:
# Distinct performing artists over the works that have more or fewer than one
# performance; works with exactly one performance are left out
datacos_artists = {
    performance['perf_artist']
    for performances in benchmark_json.values()
    if len(performances) != 1
    for performance in performances.values()
}
print(len(datacos_artists))

6375


In [None]:
# Split the work IDs: exactly one performance -> noise, anything else -> music
noise_work_ids, music_work_ids = [], []
for work, performances in benchmark_json.items():
    target_ids = noise_work_ids if len(performances) == 1 else music_work_ids
    target_ids.append(work)
print(f'Found {len(noise_work_ids)} noise works and {len(music_work_ids)} music works')

In [None]:
# Sanity check: report every noise work that has more than one performance
for noise_work_id in noise_work_ids:
    performance_count = len(benchmark_json[noise_work_id])
    if performance_count > 1:
        print('Error: Noise work has more than 1 performance')

In [None]:
benchmark_json[noise_work_id]

In [None]:
# For every work title, list the distinct performance titles that differ from it
mismatches = defaultdict(list)
for performances_dict in benchmark_json.values():
    for performance in performances_dict.values():
        perf_title = performance['perf_title']
        work_title = performance['work_title']
        if perf_title == work_title:
            continue
        known_variants = mismatches[work_title]
        if perf_title not in known_variants:
            known_variants.append(perf_title)

In [None]:
mismatches

In [None]:
# Print the work artist of every performance in the benchmark
for performances_dict in benchmark_json.values():
    for performance in performances_dict.values():
        print(performance['work_artist'])

### SHS100K

In [20]:
# Read the csv file it is tab separated
with open(shs100k_csv_path, 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    shs100k_data = list(reader)

In [23]:
shs100k_data

[['0',
  '0',
  'Summertime',
  'Abbie Mitchell',
  'https://www.youtube.com/watch?v=TMHWEcSKEPI',
  'True'],
 ['0',
  '1',
  'Summertime',
  'Helen Jepson',
  'https://www.youtube.com/watch?v=E3lw8gaHs04',
  'True'],
 ['0',
  '2',
  'Summertime',
  'Billie Holiday and Her Orchestra',
  'https://www.youtube.com/watch?v=aPpJPTJc1lU',
  'True'],
 ['0',
  '3',
  'Summertime',
  'Jerry Kruger and Her Orchestra',
  'https://www.youtube.com/watch?v=J9JctRqcDso',
  'True'],
 ['0',
  '4',
  'Overture and Summertime',
  'Anne Brown',
  'https://www.youtube.com/watch?v=R44waInkjgI',
  'True'],
 ['0',
  '5',
  'Summertime',
  'Saunders King Rhythm',
  'https://www.youtube.com/watch?v=NPRSkMIqgQA',
  'True'],
 ['0',
  '6',
  'Summertime',
  'The Ravens',
  'https://www.youtube.com/watch?v=piVtpRkhJTA',
  'True'],
 ['0',
  '7',
  'Summertime',
  'Ethel Waters',
  'https://www.youtube.com/watch?v=YU0Ip_AefMQ',
  'True'],
 ['0',
  '8',
  'Summertime',
  'Jane Powell',
  'https://www.youtube.com/watch

In [26]:
# Collect the distinct artist names of the SHS100K list and print how many there are
shs100k_artists = set()
for row in shs100k_data:
    # Column 3 of a row is used as the artist name. The variable is left at
    # module level on purpose: the following cell displays its last value.
    artist_name = row[3]
    shs100k_artists.add(artist_name)
print(len(shs100k_artists))

34170


In [25]:
artist_name

'The Queers'

In [36]:
from collections import Counter

In [42]:
def _load_tsv_rows(tsv_path):
    """Read a tab-separated file and return its rows as a list of lists."""
    with open(tsv_path, 'r') as tsv_file:
        return list(csv.reader(tsv_file, delimiter='\t'))


# Rows of the three SHS100K split files
shs100k_train_indices = _load_tsv_rows(shs100k_train_csv_path)
shs100k_test_indices = _load_tsv_rows(shs100k_test_csv_path)
shs100k_val_indices = _load_tsv_rows(shs100k_val_csv_path)

In [47]:
# Number of validation versions, then number of distinct validation cliques
print(len(shs100k_val_indices))
val_clique_ids = {int(index_row[0]) for index_row in shs100k_val_indices}
print(f'{len(val_clique_ids):>6,} SHS100K2-VAL cliques are used.')

10884
 1,842 SHS100K2-VAL cliques are used.


In [43]:
def _count_clique_sizes(split_indices):
    """Return an array holding the number of rows per clique ID (column 0)."""
    index_matrix = np.array([[int(i) for i in row] for row in split_indices])
    clique_id_column = index_matrix[:, 0]
    return np.array(list(Counter(clique_id_column.tolist()).values()))


def _print_size_stats(clique_sizes):
    """Print the maximum, median and mean of the given clique sizes."""
    print(f'Max: {clique_sizes.max()}')
    print(f'Median {np.median(clique_sizes)}')
    print(f'Mean: {clique_sizes.mean()}')


# Train split
shs100k_train_clique_sizes = _count_clique_sizes(shs100k_train_indices)
_print_size_stats(shs100k_train_clique_sizes)
print()

# Validation split
shs100k_val_clique_sizes = _count_clique_sizes(shs100k_val_indices)
_print_size_stats(shs100k_val_clique_sizes)
print()

# Test split
shs100k_test_clique_sizes = _count_clique_sizes(shs100k_test_indices)
_print_size_stats(shs100k_test_clique_sizes)

Max: 359
Median 12.0
Mean: 16.35818933132983

Max: 17
Median 6.0
Mean: 5.908794788273616

Max: 162
Median 5.0
Mean: 6.233451536643026


In [None]:
# Separate the train data.
# The metadata rows are indexed by (clique_id, version_id) once, so each train
# index is resolved with a dictionary lookup instead of a scan over the whole
# list. setdefault keeps the first row for a key, as a first-match scan would.
shs100k_rows_by_key = {}
for row_j in shs100k_data:
    if len(row_j) >= 2:
        shs100k_rows_by_key.setdefault((row_j[0], row_j[1]), row_j)

shs100k_train_data = []
for row_i in shs100k_train_indices:
    matched_row = shs100k_rows_by_key.get((row_i[0], row_i[1]))
    if matched_row is not None:
        shs100k_train_data.append(matched_row)
print(len(shs100k_train_data))
# Every train index must have been found in the metadata list
assert len(shs100k_train_data) == len(shs100k_train_indices)

# Format the train data: clique_id -> [set of cleaned titles, set of cleaned artists]
shs100k_train_dict = {}
for row in shs100k_train_data:
    clique_id = row[0]
    title = hard_clean_text(clean_parentheses(row[2]))
    artist = soft_clean_text(row[3])
    if clique_id in shs100k_train_dict:
        shs100k_train_dict[clique_id][0].add(title)
        shs100k_train_dict[clique_id][1].add(artist)
    else:
        shs100k_train_dict[clique_id] = [{title}, {artist}]

In [None]:
# Load the test split that is stored next to the dataset file
dataset_path = os.path.join(dump_dir, 'dataset.json')
test_split_path = dataset_path+'.test'
with open(test_split_path) as in_f:
    test_cliques = json.load(in_f)

In [None]:
# Match the SHS100K2 train data with the Discogs cliques.
# shs_matched_clique_ids collects the Discogs clique IDs that were hit, and
# discogs_matched_shs_clique_ids collects the SHS clique IDs that found a match.
shs_matched_clique_ids = set()
discogs_matched_shs_clique_ids = set()
for shs_clique_id, (shs_titles, shs_artists) in shs100k_train_dict.items():
    for shs_title in shs_titles:
        for discogs_clique in discogs_cliques.get(shs_title, []):
            # Compare each SHS artist with the version artists of Discogs
            for shs_artist in shs_artists:
                for discogs_title, discogs_artists in discogs_clique['versions']:
                    if shs_artist in discogs_artists:
                        shs_matched_clique_ids.add(discogs_clique['clique_id'])
                        discogs_matched_shs_clique_ids.add(shs_clique_id)
# Each message reports the set that actually holds the IDs it names: the SHS IDs
# are in discogs_matched_shs_clique_ids and the Discogs IDs in shs_matched_clique_ids
print(f'{len(discogs_matched_shs_clique_ids)} SHS100K2 cliques are matched with Discogs cliques')
print(f'{len(shs_matched_clique_ids)} Discogs cliques are matched with SHS100K2 cliques')

In [None]:
len(shs_matched_clique_ids.intersection(set(test_cliques.keys()))) / len(test_cliques) * 100

In [None]:
# Sizes of the test cliques whose IDs are also among the matched Discogs clique IDs
val_intersect_versions = [
    len(test_cliques[clique_id])
    for clique_id in shs_matched_clique_ids.intersection(set(test_cliques.keys()))
]

In [None]:
sum(val_intersect_versions)