In [54]:
import pandas as pd
import numpy as np
import librosa
import h5py
import math
import warnings
import json

In [55]:
# Share of loop units that should be built from more than one sample
# (0.2 -> 20% of loops use multiple samples).
loop_unit_multi_sample_pct = 0.2

# Target ratio of one-shots to loops, expressed as <ratio>:1.
one_to_loop_ratio = 1

In [56]:
# Per-sample metadata table; it must contain an 'id' column, which later
# cells use to look samples up and to index the joined frame.
meta_data = pd.read_csv('audio_metadata - filtered_copy.csv', sep=',')

# Read the looping patterns inside a context manager so the file handle is
# closed as soon as parsing finishes (a bare open() left it open for the
# lifetime of the kernel).
with open('patterns.json') as patterns_file:
    looping_patterns = json.load(patterns_file)
# meta_data.drop(columns=['group', 'category', 'sub_category'], inplace=True)

In [57]:
# Sample ids from the metadata table, cast to int and flattened to a 1-D array.
id_column = meta_data['id'].astype(int)
df_id = np.array(id_column).flatten()

# Accumulators filled by the HDF5 read loop: one waveform and one sample
# rate per id, in the same order as df_id.
wavs, srs = [], []

def load_sample(f, sample_id):
    """Read one sample's waveform and sample rate from an open audio store.

    Args:
        f: Mapping-like object (e.g. an open HDF5 file) with an
            "audio_data" group keyed by the string form of each sample id.
        sample_id: Id of the sample to read; converted with ``str`` before
            the lookup.

    Returns:
        A ``(waveform, sample_rate)`` tuple, each converted to a numpy array.

    Raises:
        KeyError: If the id or one of the two datasets is missing.
    """
    entry = f["audio_data"][str(sample_id)]
    return tuple(np.array(entry[field]) for field in ("waveform", "sample_rate"))


# Pull the waveform and sample rate for every id listed in the metadata,
# keeping the HDF5 file open only for the duration of the reads.
with h5py.File('filtered_audio_data.h5', "r") as h5_file:
    for sample_id in df_id:
        loaded = load_sample(h5_file, sample_id)
        wavs.append(loaded[0])
        srs.append(loaded[1])

# One row per sample id, aligned positionally with df_id.
samples = pd.DataFrame({'id': df_id, 'waveform': wavs, 'sample_rate': srs})

In [58]:
# Attach the metadata columns to the audio data by matching on sample id.
# After this cell 'id' is the index of `samples`, not a column.
samples_by_id = samples.set_index('id')
metadata_by_id = meta_data.set_index('id')
samples = samples_by_id.join(metadata_by_id)
#one_shots = samples.copy()
#loops = samples.copy()

In [59]:
# Display the joined frame (audio data plus metadata, indexed by sample id).
samples

Unnamed: 0_level_0,waveform,sample_rate,file_path,file_name,group,category,sub_category,Reverse,Forward,One_Shot,...,Chord,Glitch,Vibraslap,Timpani,Vocal FX,China,Cowbell,Bell,Orchestra,Metalic Tap
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5487,"[5.164626e-05, -0.000103295286, 2.5824858e-05,...",44100,808 and kick/heavy kick.wav,Heavy Kick.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5087,"[-0.0004272461, -0.0016479492, -0.007232666, -...",44100,808 samples/808/bass (12).wav,bass (12).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5084,"[-0.0009765625, -0.0026550293, -0.009033203, -...",44100,808 samples/808/bass (13).wav,bass (13).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5083,"[-0.00039672852, -0.00045776367, -0.0014953613...",44100,808 samples/808/bass (14).wav,bass (14).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5098,"[-0.0029296875, -0.010894775, -0.051208496, -0...",44100,808 samples/808/bass (16).wav,bass (16).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7061,"[0.0014566779, -0.0014469028, 0.009420276, 0.5...",44100,we unite - ultimate future bounce (sample pack...,16 Kick D#.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",44100,we unite - ultimate future bounce (sample pack...,17 Kick E.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7053,"[-0.018602252, 0.06855637, 0.044089437, 0.1702...",44100,we unite - ultimate future bounce (sample pack...,18 Kick A.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7046,"[-0.5214118, -0.9589319, -0.13596195, 0.815670...",44100,we unite - ultimate future bounce (sample pack...,19 Kick F.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Per-category bookkeeping: how many loops / one-shots exist, how many already
# carry an intent label, and how many of each are required to hit the
# configured one-shot:loop ratio.
category_counts = {}
sample_categories = set(samples['category'])

# Both intent columns must be present before they can be counted. The earlier
# form `'One_Shot_Intent' and 'Loop_id' in samples.columns` only tested
# 'Loop_id', because a non-empty string literal is always truthy.
has_intent_cols = {'One_Shot_Intent', 'Loop_id'}.issubset(samples.columns)

is_one_shot = samples['One_Shot'] == 1
is_loop = samples['One_Shot'] == 0

for cat in sample_categories:
    in_cat = samples['category'] == cat
    counts = {}
    counts["loop_count"] = int((in_cat & is_loop).sum())
    counts["one_shot_count"] = int((in_cat & is_one_shot).sum())

    if has_intent_cols:
        counts['one_shot_intent_count'] = int(
            (in_cat & is_one_shot & (samples['One_Shot_Intent'] == 1)).sum()
        )
        counts['loop_intent_count'] = int(
            (in_cat & is_loop & samples['Loop_id'].notnull()).sum()
        )
    else:
        counts['one_shot_intent_count'] = 0
        counts['loop_intent_count'] = 0

    counts["all"] = counts["loop_count"] + counts["one_shot_count"]

    # Samples are split so that one-shots and loop units are balanced while
    # each loop unit consumes (1 + loop_unit_multi_sample_pct) samples on average.
    split_denominator = one_to_loop_ratio + 1 + loop_unit_multi_sample_pct
    counts["required_one_shot_count_samples"] = math.floor(
        (one_to_loop_ratio * counts["all"]) / split_denominator
    )
    # NOTE(review): this uses the same numerator as the one-shot count, which
    # is only a ratio:1 split when one_to_loop_ratio == 1 -- confirm the
    # intended formula before changing the ratio.
    counts["required_loop_count_units"] = math.floor(
        (one_to_loop_ratio * counts["all"]) / split_denominator
    )
    counts["required_loop_count_samples"] = int(
        counts["required_loop_count_units"] * (1 + loop_unit_multi_sample_pct)
    )

    if counts["required_loop_count_samples"] + counts["required_one_shot_count_samples"] > counts["all"]:
        warnings.warn("greater required samples than available")

    category_counts[cat] = counts


category_counts

{'Clap': {'loop_count': 195,
  'one_shot_count': 1005,
  'one_shot_intent_count': 0,
  'loop_intent_count': 0,
  'all': 1200,
  'required_one_shot_count_samples': 545,
  'required_loop_count_units': 545,
  'required_loop_count_samples': 654},
 'Hi Hat': {'loop_count': 195,
  'one_shot_count': 1005,
  'one_shot_intent_count': 0,
  'loop_intent_count': 0,
  'all': 1200,
  'required_one_shot_count_samples': 545,
  'required_loop_count_units': 545,
  'required_loop_count_samples': 654},
 'Kick': {'loop_count': 195,
  'one_shot_count': 1005,
  'one_shot_intent_count': 0,
  'loop_intent_count': 0,
  'all': 1200,
  'required_one_shot_count_samples': 545,
  'required_loop_count_units': 545,
  'required_loop_count_samples': 654}}

In [61]:
# Work on a copy so the joined `samples` frame stays untouched.
samples_copy = samples.copy()

# Add each intent column only if it is missing. The earlier combined test
# (`'One_Shot_Intent' and 'Loop_id' not in ...`) only looked at 'Loop_id' and
# then reset both columns, which would wipe existing One_Shot_Intent values.
for intent_col in ('One_Shot_Intent', 'Loop_id'):
    if intent_col not in samples_copy.columns:
        samples_copy[intent_col] = np.nan

# next up  
Made a way to import files, get their category, and count them by one shot or loop.  

now i need to do the following:  
- figure out the number of samples to switch from one shot to loop in order to have a 50/50 split: done  
  - a small share (e.g. 20%) of loops should use more than 1 sample. This means loops consume more samples than one shots, which has to be taken into account to achieve a 50/50 split: done  
- identify one shot intent for each oneshot: work in progress  
- document their intent
- all of this needs to work with the possibility that loops may or may not exist. can't overwrite existing data when making loops. loop ids can't clash with existing loop ids



# line to save for later
Keep this line for when I need to start generating new ids for the loops:
loop_id_num = max(loops['loop_id'])

In [None]:
num_of_loop_units = one_to_loop_ratio / loop_unit_multi_sample_pct
num_of_loop_samples = (one_to_loop_ratio + loop_unit_multi_sample_pct) / loop_unit_multi_sample_pct

# For every category, decide which one-shots keep their one-shot intent and
# which are earmarked as loop material, based on the counts computed earlier.
# NOTE: `ones`, `loops` and `existing_loops` are rebound on every iteration and
# are not yet written back to samples_copy (see the TODO comments below).
for cat, counts in category_counts.items():
    required_ones = counts['required_one_shot_count_samples']
    required_loop_samples = counts['required_loop_count_samples']
    in_cat = samples_copy['category'] == cat

    # count for intents meet the required counts - no edits need to be made
    if (counts['one_shot_intent_count'] == required_ones) and (counts['loop_intent_count'] == required_loop_samples):
        print(f'one shot intent and loop intent counts for {cat} meet requirements')
    # 0 count for both intents - just created intent col
    elif counts['one_shot_count'] > required_ones and counts['one_shot_intent_count'] == 0 and counts['loop_intent_count'] == 0:
        tmp = samples_copy[in_cat & (samples_copy['One_Shot'] == 1)].copy()

        # The first `required_ones` one-shots keep their one-shot intent.
        ones = tmp.iloc[:required_ones].copy()
        ones['One_Shot_Intent'] = 1

        # Existing loops already count toward the loop-sample requirement, so
        # only the shortfall is taken from the remaining one-shots.
        existing_loops = samples_copy[in_cat & (samples_copy['One_Shot'] == 0)].copy()
        loops_end = required_ones + required_loop_samples - len(existing_loops)
        loops = tmp.iloc[required_ones:loops_end].copy()

        existing_loops['One_Shot_Intent'] = 0
        # 'id' is the frame's index (set when samples was joined with the
        # metadata), not a column, so read it from the index.
        existing_loops['Loop_id'] = existing_loops.index
        loops['One_Shot_Intent'] = 0
        # add way to generate loops using the looper file. should rename looper file to something like loop generator
        # should rename this file to something like loop manager
        # need to join every file back to samples_copy using id

        print(f'category: {cat} - - tmp len: {len(tmp)} - - ones len: {len(ones)} - - loops len: {len(loops)}')
    # non 0 count exists for intents. intents already existing - dataset has done some loop generation already and new files were probably added
    elif (counts['one_shot_count'] > required_ones) and (counts['one_shot_intent_count'] < required_ones) and (counts['loop_intent_count'] < required_loop_samples):
        tmp = samples_copy[in_cat].copy()
        used_ones = tmp[(tmp['One_Shot'] == 1) & (tmp['One_Shot_Intent'] == 1)]
        unused_ones = tmp[(tmp['One_Shot'] == 1) & (tmp['One_Shot_Intent'].isnull())]
        print(f'used_ones len: {len(used_ones)} -- unused_ones len: {len(unused_ones)}')
    # Fewer one-shots available than required. The key 'needed_each' was never
    # stored in category_counts; the required one-shot count is the value to
    # compare against.
    elif required_ones > counts['one_shot_count']:
        warnings.warn(f"not enough one shots to reach {one_to_loop_ratio}:1 ratio")



category: Clap - - tmp len: 1005 - - ones len: 545 - - loops len: 459
category: Hi Hat - - tmp len: 1005 - - ones len: 545 - - loops len: 459
category: Kick - - tmp len: 1005 - - ones len: 545 - - loops len: 459


In [65]:
# Preview the one-shot selection; `ones` is rebound on each loop iteration in
# the previous cell, so this shows only the last category that was processed.
ones.head()

Unnamed: 0_level_0,waveform,sample_rate,file_path,file_name,group,category,sub_category,Reverse,Forward,One_Shot,...,Vibraslap,Timpani,Vocal FX,China,Cowbell,Bell,Orchestra,Metalic Tap,One_Shot_Intent,Loop_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5487,"[5.164626e-05, -0.000103295286, 2.5824858e-05,...",44100,808 and kick/heavy kick.wav,Heavy Kick.wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,1,
5087,"[-0.0004272461, -0.0016479492, -0.007232666, -...",44100,808 samples/808/bass (12).wav,bass (12).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,1,
5084,"[-0.0009765625, -0.0026550293, -0.009033203, -...",44100,808 samples/808/bass (13).wav,bass (13).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,1,
5083,"[-0.00039672852, -0.00045776367, -0.0014953613...",44100,808 samples/808/bass (14).wav,bass (14).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,1,
5098,"[-0.0029296875, -0.010894775, -0.051208496, -0...",44100,808 samples/808/bass (16).wav,bass (16).wav,Drum,Kick,,0,1,1,...,0,0,0,0,0,0,0,0,1,
