## Exploratory analysis for differences between performers and genres

Small little notebook that creates a few different plots showing numbers of genre/performer tags and basic features (sliding pitch class entropy, notes-per-second) between genres

In [1]:
import os
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm

from jazz_style_conditioned_generation import utils, plotting
from jazz_style_conditioned_generation.data.conditions import validate_condition_values, INCLUDE
from jazz_style_conditioned_generation.data.scores import load_score, preprocess_score, get_notes_from_score

  from .autonotebook import tqdm as notebook_tqdm


### Genre weight counts

In [2]:
js = utils.get_data_files_with_ext("data/raw", "**/*_tivo.json")
genre_count = Counter()
albums = set()

for p in js:
    read = utils.read_json_cached(p)
    dataset = p.split(os.path.sep)[-3]
    album = read["pianist"] + read["tivo_album_name"]
    if album not in albums:
        if dataset != "jja" and dataset != "bushgrafts":
            for gen in read["genres"]:
                genre_count[gen["weight"]] += 1
        albums.add(album)

In [3]:
bp = plotting.BarPlotWeightDistribution(dict(genre_count))
bp.create_plot()
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_genre_weights")
bp.save_fig(fpath)

### Performer/genre bar charts

In [2]:
metadata = utils.get_data_files_with_ext("data/raw", "**/*_tivo.json")
metadata_read = [utils.read_json_cached(js) for js in metadata]

In [3]:
def get_genres(metadat):
    track_genres = [(i["name"], i["weight"]) for i in metadat["genres"]]
    validated = validate_condition_values(track_genres, "genres")
    return [g for g, _ in validated]

In [5]:
pianist_res = Counter()
genre_res = Counter()
albums = set()

for track_fp in metadata:
    track = utils.read_json_cached(track_fp)
    pianist = track["pianist"]
    album = track["pianist"] + track["tivo_album_name"]
    if album in albums:
        continue
    if pianist != "Doug McKenzie":
        pianist_res[pianist] += 1
    for genre in track["genres"]:
        genre = genre["name"]
        genre_res[genre] += 1
    albums.add(album)

# Subset to get N top genres
sorted_gen_res = Counter()
for g, n in sorted(genre_res.items(), key=lambda d: d[1], reverse=True)[:25]:
    for _ in range(n):
        sorted_gen_res[g] += 1

sorted_pin_res = Counter()
for g, n in sorted(pianist_res.items(), key=lambda d: d[1], reverse=True)[:25]:
    for _ in range(n):
        sorted_pin_res[g] += 1

In [5]:
bp = plotting.BarPlotPianistGenreCount([sorted_pin_res, sorted_gen_res])
bp.create_plot()
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_performer_genre_counts")
bp.save_fig(fpath)

### Grouped genre bar chart

In [4]:
genre_res = Counter()
albums = set()

artist_metadatas = utils.get_data_files_with_ext("references/tivo_artist_metadata", "**/*.json")

for track_fp in metadata:
    track = utils.read_json_cached(track_fp)
    track_genres = get_genres(track)
    # Use genres associated with the track, if we have these
    if len(track_genres) > 0:
        for gen in track_genres:
            genre_res[gen] += 1
    # Otherwise, use genres associated with the pianist
    else:
        pianist = track["pianist"]
        art_fp = os.path.join(utils.get_project_root(), "references/tivo_artist_metadata", pianist.replace(" ", "") + ".json")
        if os.path.isfile(art_fp):
            art_js = utils.read_json_cached(art_fp)
            for gen in get_genres(art_js):
                genre_res[gen] += 1


In [5]:
bp = plotting.BarPlotGroupedGenreCounts(genre_res)
bp.create_plot()
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_grouped_genre_counts")
bp.save_fig(fpath)

### Performer/genre heatmap

In [9]:
res = []    # store results
# Loading genres from all track metadata
for track in metadata_read:
    pianist = track["pianist"]
    if pianist not in INCLUDE["pianist"]:
        continue
    for genre in get_genres(track):
        res.append(dict(performer=track["pianist"], genre=genre))

In [10]:
# Loading genres from all artist metadata
artist_metadatas = utils.get_data_files_with_ext("references/tivo_artist_metadata", "**/*.json")
for art in artist_metadatas:
    read = utils.read_json_cached(art)
    pianist = read["tivo_artist_name"]
    if pianist not in INCLUDE["pianist"]:
        continue
    for genre in get_genres(read):
        res.append(dict(performer=pianist, genre=genre))

In [6]:
# Create the plot
hm = plotting.HeatmapPerformerGenreCounts(pd.DataFrame(res))
hm.create_plot()
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/heatmap_performer_genre_counts")
hm.save_fig(fpath)

  mask[mask == 0] = ""


### Extract sliding pitch-class entropy + notes-per-second per genre

In [6]:
def pc_entropy(notes_) -> float:
    # Shamelessly ripped from muspy ;)
    counter = np.zeros(12)
    for note in notes_:
        counter[note.pitch % 12] += 1
    denominator = counter.sum()
    if denominator < 1:
        entropy = np.nan
    else:
        prob = counter / denominator
        with np.errstate(divide="ignore", invalid="ignore"):
            # Use natural log (not base 2) for compatibility with PiJAMA paper
            entropy = -np.nansum(prob * np.log(prob))
    return entropy

In [7]:
WINDOW = 15    # window size used for sliding pitch class entropy, as in PiJAMA
genre_pces = defaultdict(list)
genre_nps = defaultdict(list)

for track_fp in tqdm(metadata):
    # Getting genres associated with the track + artist
    track = utils.read_json_cached(track_fp)
    pianist = track["pianist"]
    genres = get_genres(track)

    # If the track has no genres
    if len(genres) == 0:
        # Try getting the genres for the pianist
        pianist = track["pianist"]
        art_fp = os.path.join(utils.get_project_root(), "references/tivo_artist_metadata", pianist.replace(" ", "") + ".json")
        if os.path.isfile(art_fp):
            art_js = utils.read_json_cached(art_fp)
            genres = get_genres(art_js)
    # If the track still has no genres, skip over it
    if len(genres) == 0:
        continue
   
    # Load up the score
    score_path = track_fp.replace("metadata_tivo.json", "piano_midi.mid")
    score = preprocess_score(load_score(score_path, as_seconds=True))
    
    # Extract sliding pitch-class entropy
    notes = get_notes_from_score(score)
    all_pces = []
    for start in range(0, round(score.end()) - WINDOW):    # 1 second hop
        end = start + WINDOW
        notes_in_window = [n for n in notes if start < n.time < end]
        all_pces.append(pc_entropy(notes_in_window))
        
    # Extract notes-per-second
    all_nps = []
    for start in range(0, round(score.end())):    # 1 second hop
        end = start + 1.
        notes_in_window = [n for n in notes if start < n.time < end]
        all_nps.append(len(notes_in_window))
    
    # Append results for all genres
    for genre in genres:
        genre_pces[genre].append(np.nanmean(all_pces))
        genre_nps[genre].append(np.nanmean(all_nps))


  genre_pces[genre].append(np.nanmean(all_pces))
100%|██████████| 4462/4462 [06:59<00:00, 10.65it/s]


In [8]:
all_genre_res = []
for genre in genre_pces.keys():
    pces = genre_pces[genre]
    nps = genre_nps[genre]
    all_genre_res.append(dict(name=genre, pce=np.nanmean(pces), nps=np.nanmean(nps), pce_std=np.nanstd(pces), nps_std=np.nanstd(nps)))

In [9]:
print(all_genre_res)

[{'name': 'Modal Jazz', 'pce': 2.3225340140096793, 'nps': 12.555312613200838, 'pce_std': 0.09547253848746712, 'nps_std': 3.4820555475207158}, {'name': 'Cool Jazz', 'pce': 2.295009714217528, 'nps': 12.17196437334959, 'pce_std': 0.12292558112396135, 'nps_std': 3.5183871145150176}, {'name': 'Avant-Garde Jazz', 'pce': 2.1408029775780197, 'nps': 8.285429387069097, 'pce_std': 0.19858996847722968, 'nps_std': 3.4098950628065747}, {'name': 'Bop', 'pce': 2.268473589865863, 'nps': 11.007015960806614, 'pce_std': 0.09957792806220905, 'nps_std': 3.364436925740267}, {'name': 'Straight-Ahead Jazz', 'pce': 2.2318851039060057, 'nps': 9.786057899353397, 'pce_std': 0.14700442588239152, 'nps_std': 3.5164875526171175}, {'name': 'Post-Bop', 'pce': 2.2198353692209896, 'nps': 9.369270956505655, 'pce_std': 0.16835922027681757, 'nps_std': 3.6825243736789552}, {'name': 'Pop/Rock', 'pce': 2.2160521948793663, 'nps': 9.939947806282786, 'pce_std': 0.14429701850174592, 'nps_std': 4.291618050924111}, {'name': 'Hard Bop

In [21]:
all_pces = [x for xs in genre_pces.values() for x in xs]
all_nps = [x for xs in genre_nps.values() for x in xs]
res_for_corr = np.array([(pc, nps) for pc, nps in zip(all_pces, all_nps) if not np.isnan(pc) and not np.isnan(nps)])
print(np.corrcoef(res_for_corr.transpose()), res_for_corr.shape)

[[1.         0.39498525]
 [0.39498525 1.        ]] (8732, 2)


In [10]:
bp = plotting.BarPlotGenrePCENPS(pd.DataFrame(all_genre_res))
bp.create_plot()
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_genre_pce_nps")
bp.save_fig(fpath)

  ax_.set_xticklabels(ax_.get_xticklabels(), rotation=90)
  ax_.set_xticklabels(ax_.get_xticklabels(), rotation=90)
