## Exploratory analysis for differences between performers and genres

A small notebook that creates a few plots showing the number of genre/performer tags in the dataset, and compares basic features (sliding pitch-class entropy, notes-per-second) between genres.

In [1]:
import os
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm

from jazz_style_conditioned_generation import utils, plotting
from jazz_style_conditioned_generation.data.conditions import validate_condition_values, INCLUDE
from jazz_style_conditioned_generation.data.scores import load_score, preprocess_score, get_notes_from_score

  from .autonotebook import tqdm as notebook_tqdm


### Performer/genre bar charts

In [2]:
# Paths of the per-track metadata files matching "**/*_tivo.json" in data/raw
metadata = utils.get_data_files_with_ext("data/raw", "**/*_tivo.json")
# Parsed contents of every metadata file, in the same order as `metadata`
metadata_read = list(map(utils.read_json_cached, metadata))

In [3]:
def get_genres(metadat):
    """
    Return the validated genre names attached to one metadata dictionary.

    `metadat` must hold a "genres" list whose items have "name" and "weight" keys. The (name, weight)
    pairs are passed through `validate_condition_values(..., "genres")`, and only the first element
    of each validated pair is returned.
    """
    weighted = [(entry["name"], entry["weight"]) for entry in metadat["genres"]]
    return [name for name, _ in validate_condition_values(weighted, "genres")]

In [32]:
TOP_N = 25    # number of most frequent pianists/genres kept for the bar plot

pianist_res = Counter()
genre_res = Counter()

# Count how many tracks each pianist has, and how often each genre tag occurs
for track_fp in metadata:
    track = utils.read_json_cached(track_fp)
    pianist = track["pianist"]
    # NOTE(review): "Doug McKenzie" is left out of the pianist counts; the reason is not recorded here -- confirm
    if pianist != "Doug McKenzie":
        pianist_res[pianist] += 1
    # Raw (unvalidated) genre names are counted; iterate over get_genres(track) instead
    #  to count only the genres that survive validation
    genre_res.update(genre["name"] for genre in track["genres"])

# Keep only the TOP_N most common entries, ordered from most to least frequent.
#  most_common() sorts by count in descending order and keeps ties in first-seen order
sorted_gen_res = Counter(dict(genre_res.most_common(TOP_N)))
sorted_pin_res = Counter(dict(pianist_res.most_common(TOP_N)))

In [33]:
# Bar plot built from the top pianist counts and the top genre counts, saved under the project root
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_performer_genre_counts")
bp = plotting.BarPlotPianistGenreCount([sorted_pin_res, sorted_gen_res])
bp.create_plot()
bp.save_fig(fpath)

### Performer/genre heatmap

In [4]:
# One {performer, genre} record per validated genre of every track by an included pianist
res = []
for track in metadata_read:
    pianist = track["pianist"]
    # Only pianists listed in INCLUDE["pianist"] contribute to the heatmap
    if pianist in INCLUDE["pianist"]:
        res.extend(dict(performer=pianist, genre=genre) for genre in get_genres(track))

In [5]:
# Add the genres attached to the artist-level metadata of every included pianist
# NB: this extends `res` in place, so re-running this cell on its own appends the same records again
artist_metadatas = utils.get_data_files_with_ext("references/tivo_artist_metadata", "**/*.json")
for artist_fp in artist_metadatas:
    artist_meta = utils.read_json_cached(artist_fp)
    pianist = artist_meta["tivo_artist_name"]
    if pianist in INCLUDE["pianist"]:
        res.extend(dict(performer=pianist, genre=genre) for genre in get_genres(artist_meta))

In [6]:
# Heatmap built from the collected performer/genre records, saved under the project root
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/heatmap_performer_genre_counts")
performer_genre_records = pd.DataFrame(res)
hm = plotting.HeatmapPerformerGenreCounts(performer_genre_records)
hm.create_plot()
hm.save_fig(fpath)

  mask[mask == 0] = ""


### Extract sliding pitch-class entropy + notes-per-second per genre

In [4]:
def pc_entropy(notes_) -> float:
    """
    Entropy (natural log) of the pitch-class distribution of `notes_`.

    Every note must expose an integer `.pitch`, which is folded into one of 12 pitch classes
    with `% 12`. Returns NaN when `notes_` contains no notes.
    """
    # Adapted from the pitch-class entropy implementation in muspy
    pitch_classes = [note.pitch % 12 for note in notes_]
    if not pitch_classes:
        return np.nan
    # Relative frequency of each of the 12 pitch classes
    prob = np.bincount(pitch_classes, minlength=12) / len(pitch_classes)
    with np.errstate(divide="ignore", invalid="ignore"):
        # Natural log (not base 2) for compatibility with the PiJAMA paper.
        #  Unused pitch classes give 0 * log(0) = NaN, which nansum leaves out
        return -np.nansum(prob * np.log(prob))

In [5]:
WINDOW = 15    # window size (seconds) used for sliding pitch class entropy, as in PiJAMA
genre_pces = defaultdict(list)
genre_nps = defaultdict(list)

for track_fp in tqdm(metadata):
    # Getting genres associated with the track; tracks with no genres are skipped
    track = utils.read_json_cached(track_fp)
    genres = get_genres(track)
    if len(genres) == 0:
        continue

    # Load up the score, with note times expressed in seconds
    score_path = track_fp.replace("metadata_tivo.json", "piano_midi.mid")
    score = preprocess_score(load_score(score_path, as_seconds=True))
    duration = round(score.end())

    # Sort the notes by onset once, so that every window is a contiguous slice of `notes`
    #  located by binary search instead of a scan over all notes per window
    notes = sorted(get_notes_from_score(score), key=lambda n: n.time)
    onsets = np.array([n.time for n in notes], dtype=float)

    # Extract sliding pitch-class entropy: WINDOW-second windows with a 1 second hop.
    #  Windows are half-open, [start, start + WINDOW), so a note whose onset lies exactly on a
    #  window start is included (a strict `start < time` test would silently drop it)
    pce_starts = np.arange(0, duration - WINDOW)
    pce_lo = np.searchsorted(onsets, pce_starts, side="left")
    pce_hi = np.searchsorted(onsets, pce_starts + WINDOW, side="left")
    all_pces = [pc_entropy(notes[lo:hi]) for lo, hi in zip(pce_lo, pce_hi)]

    # Extract notes-per-second: number of onsets in each half-open second [k, k + 1).
    #  Consecutive differences of the insertion indices of the second boundaries give the counts,
    #  so every note with onset below `duration` is counted exactly once
    second_edges = np.arange(0, duration + 1)
    all_nps = np.diff(np.searchsorted(onsets, second_edges, side="left"))

    # Tracks shorter than WINDOW seconds have no entropy windows, so their mean is NaN;
    #  np.nanmean emits a RuntimeWarning in that case
    track_pce = np.nanmean(all_pces)
    track_nps = np.nanmean(all_nps)

    # Append results for all genres
    for genre in genres:
        genre_pces[genre].append(track_pce)
        genre_nps[genre].append(track_nps)


100%|██████████| 4462/4462 [05:50<00:00, 12.73it/s] 


In [6]:
# One summary row per genre: mean (ignoring NaNs) of the per-track entropy and notes-per-second values
all_genre_res = [
    dict(name=genre, pce=np.nanmean(genre_pces[genre]), nps=np.nanmean(genre_nps[genre]))
    for genre in genre_pces
]

In [7]:
# Bar plot of the per-genre entropy / notes-per-second summary, saved under the project root
fpath = os.path.join(utils.get_project_root(), "outputs/figures/tivo_metadata/barplot_genre_pce_nps")
genre_summary = pd.DataFrame(all_genre_res)
bp = plotting.BarPlotGenrePCENPS(genre_summary)
bp.create_plot()
bp.save_fig(fpath)

  ax_.set_xticklabels(ax_.get_xticklabels(), rotation=90)
  ax_.set_xticklabels(ax_.get_xticklabels(), rotation=90)
