In [1]:
from datetime import datetime
from pathlib import Path
from string import punctuation

import pandas as pd
import psycopg2 as pg

import database

In [4]:
def agg_factory_SELECT(agg):
    """Build a SELECT column template for one aggregation of an arbitrary feature table.

    The returned string still contains literal ``{feature}`` placeholders, so a
    later ``str.format(feature=...)`` call binds it to a concrete feature table.
    """
    # Doubled braces survive as a literal {feature}; only `agg` is filled in here.
    return f"{{feature}}.{agg} AS {{feature}}_{agg}"


def features_aggs_SELECT(features, aggs):
    """Build the SELECT column list covering every feature table / aggregation pair.

    Columns belonging to one feature are separated by a comma and a newline;
    the per-feature groups are separated by a comma and a blank line.
    """
    # One template for a whole feature group; its {feature} placeholders are
    # still unfilled at this point.
    group_template = ',\n'.join(map(agg_factory_SELECT, aggs))
    groups = [group_template.format(feature=name) for name in features]
    return ',\n\n'.join(groups)


def table_others_LEFT_JOIN(table, others):
    """Build the FROM clause for `table` followed by one LEFT JOIN per entry of `others`.

    Every joined table is matched to `table` on both the ``song`` and the
    ``segment`` column.
    """
    # Bind `table` now; the doubled braces leave a literal {other} placeholder
    # that is filled in once per joined table below.
    join_template = (
        f"LEFT JOIN {{other}} ON {table}.song={{other}}.song AND "
        f"{table}.segment={{other}}.segment"
    )
    join_lines = [join_template.format(other=name) for name in others]
    return "FROM {}\n{}".format(table, "\n".join(join_lines))

In [3]:
# Aggregation columns selected from every feature table.
aggs = ['mean', 'median', 'std', 'amin', 'amax']

# Feature table names: dynamic tempo, twelve pitch tables, four spectral tables.
pitches = ['pitch_{}'.format(index) for index in range(12)]
spectrals = [
    "spectral_bandwidth",
    "spectral_centroid",
    "spectral_flatness",
    "spectral_rolloff",
]
features = ["dynamic_tempo", *pitches, *spectrals]

In [6]:
# NOTE: the 'tempo' table is handled separately from the other feature tables:
# the query reads only its single 'val' column instead of the aggregation columns.
# NOTE(review): the query uses DISTINCT ON without an ORDER BY; on PostgreSQL
# (psycopg2 is imported) the row kept per (song, segment) is then unpredictable
# -- confirm that duplicate rows are interchangeable.
features_SELECT = features_aggs_SELECT(features=features, aggs=aggs)
features_LEFT_JOIN = table_others_LEFT_JOIN(table="tempo", others=features)

features_QUERY = """
SELECT DISTINCT ON (tempo.song, tempo.segment)
tempo.song AS song,
tempo.segment AS seg,
tempo.val AS tempo,

{features_SELECT}

{features_LEFT_JOIN};
""".format(features_SELECT=features_SELECT, features_LEFT_JOIN=features_LEFT_JOIN)

print(features_QUERY)


SELECT DISTINCT ON (tempo.song, tempo.segment)
tempo.song AS song,
tempo.segment AS seg,
tempo.val AS tempo,

dynamic_tempo.mean AS dynamic_tempo_mean,
dynamic_tempo.median AS dynamic_tempo_median,
dynamic_tempo.std AS dynamic_tempo_std,
dynamic_tempo.amin AS dynamic_tempo_amin,
dynamic_tempo.amax AS dynamic_tempo_amax,

pitch_0.mean AS pitch_0_mean,
pitch_0.median AS pitch_0_median,
pitch_0.std AS pitch_0_std,
pitch_0.amin AS pitch_0_amin,
pitch_0.amax AS pitch_0_amax,

pitch_1.mean AS pitch_1_mean,
pitch_1.median AS pitch_1_median,
pitch_1.std AS pitch_1_std,
pitch_1.amin AS pitch_1_amin,
pitch_1.amax AS pitch_1_amax,

pitch_2.mean AS pitch_2_mean,
pitch_2.median AS pitch_2_median,
pitch_2.std AS pitch_2_std,
pitch_2.amin AS pitch_2_amin,
pitch_2.amax AS pitch_2_amax,

pitch_3.mean AS pitch_3_mean,
pitch_3.median AS pitch_3_median,
pitch_3.std AS pitch_3_std,
pitch_3.amin AS pitch_3_amin,
pitch_3.amax AS pitch_3_amax,

pitch_4.mean AS pitch_4_mean,
pitch_4.median AS pitch_4_median,


In [7]:
# Obtain the database connection from the project's `database` module; it is
# passed to pd.read_sql below and closed once the data has been loaded.
connection = database.connect_db()

In [8]:
#: This takes ~10 min
# Print a wall-clock timestamp before and after the query so the run time is
# visible in the cell output.
print(datetime.now().strftime("%A, %d. %B %Y %I:%M%p"), '\n')
df = pd.read_sql(sql=features_QUERY, con=connection)
print(datetime.now().strftime("%A, %d. %B %Y %I:%M%p"), '\n')

Wednesday, 13. February 2019 03:26PM 

Wednesday, 13. February 2019 03:37PM 



In [9]:
# Display the (rows, columns) dimensions of the loaded frame as a quick sanity check.
df.shape

(2121, 88)

In [11]:
eminem = 'Eminem'
rhcp = 'RedHotChiliPeppers'
# Translation table that deletes every punctuation character and every space.
punct_table = str.maketrans({char: None for char in punctuation + ' '})

def find_artist(song):
    """Return 'Eminem' or 'RedHotChiliPeppers' as song artist if possible. Else return None.

    Each entry of the module-level `tracks` list is examined in order. Its
    file name is title-cased and stripped of punctuation and spaces; the first
    normalized name that contains `song` and one of the two artist names
    decides the result.

    Parameters
    ----------
    song : str
        Song name to search for as a substring of the normalized file names.

    Returns
    -------
    str or None
        The matching artist constant, or None when `song` is empty, is not a
        string, or no track matches.
    """
    # An empty string is a substring of every name and would yield a bogus
    # match; non-string values (e.g. missing data) cannot be searched at all.
    if not isinstance(song, str) or not song:
        return None

    for track in tracks:
        name = track.name.title().translate(punct_table)
        if song in name:
            if eminem in name:
                return eminem
            elif rhcp in name:
                return rhcp
    # Give up only after every track has been examined; returning from inside
    # the loop would stop after the first track.
    return None

In [12]:
# All paths are resolved relative to the current working directory.
cwd = Path.cwd()
tracks_dir = cwd / 'tracks'
data_dir = cwd / 'data'
pkl_path = data_dir / 'audio_df.pkl'

# iterdir() yields directory entries in arbitrary order; sort them so that
# anything iterating over `tracks` sees the same order on every run.
tracks = sorted(tracks_dir.iterdir())

In [13]:
# Add an 'artist' column by passing every value of the 'song' column through find_artist.
df['artist'] = df['song'].map(find_artist)

In [14]:
# The query result is already held in `df`, so the connection can be closed.
connection.close()

In [None]:
# Create the target directory first; to_pickle does not create missing parent
# directories and would otherwise fail with FileNotFoundError.
pkl_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(pkl_path)