In [1]:
from pathlib import Path
from string import punctuation

import pandas as pd
import psycopg2 as pg

import database

In [2]:
def agg_factory_SELECT(agg):
    """Return a SELECT-column template for a single aggregate column.

    The aggregate name is filled in immediately, while the ``{feature}``
    placeholders are left intact so the caller can later substitute a table
    name with ``str.format(feature=...)``.

    Example: ``agg_factory_SELECT("mean")`` gives
    ``"{feature}.mean AS {feature}_mean"``.
    """
    # Doubled braces survive the f-string as literal `{feature}` placeholders.
    return f"{{feature}}.{agg} AS {{feature}}_{agg}"

In [22]:
# Names of the feature tables that get joined onto the 'tempo' table.
pitches = ['pitch_{}'.format(index) for index in range(12)]
spectrals = [
    'spectral_bandwidth',
    'spectral_centroid',
    'spectral_flatness',
    'spectral_rolloff',
]
# NOTE: the tonnetz names are defined here but are not part of agg_features.
tonnetzes = ['x_perf5', 'x_min3', 'x_maj3', 'y_perf5', 'y_min3', 'y_maj3']
features = ['dynamic_tempo']
# Tables whose aggregate columns are selected and LEFT JOINed in the query.
agg_features = [*features, *pitches, *spectrals]

In [23]:
def features_aggs_SELECT(features, *aggs):
    """Build the SELECT column list for every (feature, aggregate) pair.

    Parameters
    ----------
    features : iterable of str
        Table names; each contributes one group of columns.
    *aggs : str
        Aggregate column names present in every feature table.

    Returns
    -------
    str
        One ``<feature>.<agg> AS <feature>_<agg>`` entry per pair. Entries
        within a feature are separated by ``",\\n"`` and feature groups by
        ``",\\n\\n"``. There is no trailing comma.
    """
    # One template covering all aggregates, with `{feature}` still unfilled.
    per_feature_template = ',\n'.join(map(agg_factory_SELECT, aggs))
    column_groups = [per_feature_template.format(feature=name) for name in features]
    return ',\n\n'.join(column_groups)

In [24]:
def table_others_LEFT_JOIN(table, *others):
    """Build a FROM clause that LEFT JOINs ``others`` onto ``table``.

    Every joined table is matched to ``table`` on both the ``song`` and the
    ``segment`` columns.

    Parameters
    ----------
    table : str
        Name of the base table placed in the FROM clause.
    *others : str
        Names of the tables to LEFT JOIN, one join per line.

    Returns
    -------
    str
        ``"FROM <table>\\n"`` followed by the newline-separated join lines
        (nothing follows the newline when ``others`` is empty).
    """
    join_lines = []
    for other in others:
        join_lines.append(
            f"LEFT JOIN {other} ON {table}.song={other}.song AND "
            f"{table}.segment={other}.segment"
        )
    joined = "\n".join(join_lines)
    return f"FROM {table}\n{joined}"

In [25]:
# NOTE: the 'tempo' table is handled separately from the aggregate feature
# tables: it contributes a single column, 'val', rather than one column per
# aggregate, and it serves as the base table of the join.
aggs = ['mean', 'median', 'std', 'amin', 'amax']
features_LEFT_JOIN = table_others_LEFT_JOIN("tempo", *agg_features)
features_SELECT = features_aggs_SELECT(agg_features, *aggs)
features_SELECT_LEFT_JOIN = "SELECT\n" + features_SELECT + "\n\n" + features_LEFT_JOIN

# NOTE(review): DISTINCT ON is used without ORDER BY. PostgreSQL documents the
# row kept for each (song, segment) group as unpredictable in that case --
# confirm that is acceptable for duplicated rows.
features_QUERY = "\n".join([
    "",
    "SELECT DISTINCT ON (tempo.song, tempo.segment)",
    "tempo.song AS song,",
    "tempo.segment AS seg,",
    "tempo.val AS tempo,",
    "",
    features_SELECT,
    "",
    features_LEFT_JOIN + ";",
    "",
])

print(features_QUERY)


SELECT DISTINCT ON (tempo.song, tempo.segment)
tempo.song AS song,
tempo.segment AS seg,
tempo.val AS tempo,

dynamic_tempo.mean AS dynamic_tempo_mean,
dynamic_tempo.median AS dynamic_tempo_median,
dynamic_tempo.std AS dynamic_tempo_std,
dynamic_tempo.amin AS dynamic_tempo_amin,
dynamic_tempo.amax AS dynamic_tempo_amax,

pitch_0.mean AS pitch_0_mean,
pitch_0.median AS pitch_0_median,
pitch_0.std AS pitch_0_std,
pitch_0.amin AS pitch_0_amin,
pitch_0.amax AS pitch_0_amax,

pitch_1.mean AS pitch_1_mean,
pitch_1.median AS pitch_1_median,
pitch_1.std AS pitch_1_std,
pitch_1.amin AS pitch_1_amin,
pitch_1.amax AS pitch_1_amax,

pitch_2.mean AS pitch_2_mean,
pitch_2.median AS pitch_2_median,
pitch_2.std AS pitch_2_std,
pitch_2.amin AS pitch_2_amin,
pitch_2.amax AS pitch_2_amax,

pitch_3.mean AS pitch_3_mean,
pitch_3.median AS pitch_3_median,
pitch_3.std AS pitch_3_std,
pitch_3.amin AS pitch_3_amin,
pitch_3.amax AS pitch_3_amax,

pitch_4.mean AS pitch_4_mean,
pitch_4.median AS pitch_4_median,


In [26]:
# Open the database connection through the project-local `database` module.
# It is passed to pd.read_sql for the feature query and closed explicitly
# further down, after the result has been pickled.
connection = database.connect_db()

In [27]:
from datetime import datetime

# Print a wall-clock timestamp before and after the query so the cell output
# records how long it took (about 12 minutes in the recorded run).
print(f"{datetime.now():%A, %d. %B %Y %I:%M%p}", '\n')
df = pd.read_sql(features_QUERY, connection)
print(f"{datetime.now():%A, %d. %B %Y %I:%M%p}", '\n')

Tuesday, 12. February 2019 02:46PM 

Tuesday, 12. February 2019 02:58PM 



In [28]:
# Dimensions (rows, columns) of the DataFrame returned by the feature query.
df.shape

(2121, 88)

In [29]:
# Artist labels, spelled the way they appear in a track file name once it has
# been title-cased and stripped of punctuation and spaces (see find_artist).
eminem = 'Eminem'
rhcp = 'RedHotChiliPeppers'

In [30]:
# Translation table that deletes every punctuation character and every space.
punct_table = str.maketrans({char: None for char in punctuation + ' '})

def find_artist(song, track_paths=None, artists=None):
    """Return the artist of ``song`` as inferred from the track file names.

    Each track's file name is title-cased and stripped of punctuation and
    spaces. The first track whose normalized name contains ``song`` decides
    the result: the first entry of ``artists`` found in that name is returned.

    Parameters
    ----------
    song : str
        Song name to look for (substring match against the normalized name).
    track_paths : iterable of path-like objects with a ``.name``, optional
        Tracks to search. Defaults to the notebook-level ``tracks`` list,
        looked up at call time.
    artists : iterable of str, optional
        Candidate artist labels in priority order. Defaults to the
        notebook-level ``(eminem, rhcp)``.

    Returns
    -------
    str or float
        The matching artist label, or NaN when the first matching track names
        none of the artists or when no track matches ``song`` at all.
    """
    if track_paths is None:
        track_paths = tracks
    if artists is None:
        artists = (eminem, rhcp)

    # float("nan") is the same value as numpy's nan but needs no numpy import
    # (this notebook never imports numpy, so `np.nan` raised NameError).
    missing = float("nan")

    for track in track_paths:
        name = track.name.title().translate(punct_table)
        if song not in name:
            continue
        for artist in artists:
            if artist in name:
                return artist
        # The first track matching the song settles the lookup, as before.
        return missing

    # No track matched: return NaN explicitly instead of an implicit None so
    # that missing artists are represented consistently.
    return missing

In [31]:
# All locations are resolved relative to the current working directory.
cwd = Path.cwd()
data_dir = cwd.joinpath('data')
tracks_dir = cwd.joinpath('tracks')
pkl_path = data_dir.joinpath('audio_df.pkl')  # destination of the pickled DataFrame

# Every entry found directly inside the tracks directory.
tracks = [entry for entry in tracks_dir.iterdir()]

In [32]:
# Label every row with its artist by looking the song name up among the track
# file names; this adds an 'artist' column to df in place.
df['artist'] = df['song'].map(find_artist)

In [33]:
# Spot-check the last rows, including the newly added 'artist' column.
df.tail()

Unnamed: 0,song,seg,tempo,dynamic_tempo_mean,dynamic_tempo_median,dynamic_tempo_std,dynamic_tempo_amin,dynamic_tempo_amax,pitch_0_mean,pitch_0_median,...,spectral_flatness_median,spectral_flatness_std,spectral_flatness_amin,spectral_flatness_amax,spectral_rolloff_mean,spectral_rolloff_median,spectral_rolloff_std,spectral_rolloff_amin,spectral_rolloff_amax,artist
2116,YouReNeverOver,5,103.359375,105.83436,103.359375,11.737513,103.359375,161.499023,0.149067,0.119116,...,0.054638,0.068446,0.005677,0.390863,7195.289824,7353.588867,1233.897301,3746.777344,9776.074219,Eminem
2117,YouReNeverOver,6,103.359375,110.019335,103.359375,18.516281,103.359375,161.499023,0.162364,0.132784,...,0.053868,0.071312,0.004768,0.345875,7106.148697,7299.755859,1275.627134,3520.678711,9722.241211,Eminem
2118,YouReNeverOver,7,103.359375,105.199658,103.359375,9.839286,103.359375,161.499023,0.158162,0.126101,...,0.05192,0.072374,0.002913,0.370637,7078.757196,7251.306152,1146.33141,3488.378906,9711.474609,Eminem
2119,YouReNeverOver,8,103.359375,103.989371,103.359375,6.019208,103.359375,161.499023,0.14706,0.133317,...,0.046049,0.065159,0.002534,0.356291,6910.874878,7116.723633,1036.181625,3574.511719,9388.476562,Eminem
2120,YouReNeverOver,9,103.359375,105.467598,103.359375,9.90448,103.359375,151.999081,0.125093,0.110724,...,0.050464,0.069711,0.003367,0.365145,7283.73929,7498.937988,938.761908,4414.306641,9259.277344,Eminem


In [34]:
# Create the output directory first: nothing in the notebook creates it, and
# a missing directory would make the write fail after the long-running query.
pkl_path.parent.mkdir(parents=True, exist_ok=True)
# Persist the labelled DataFrame so the query does not have to be re-run.
df.to_pickle(pkl_path)

In [35]:
# Release the database connection now that the query result has been saved.
connection.close()

In [13]:
# df.to_pickle("joined_df.pkl")

In [87]:
# Ratio of 2099 to the 2121 rows of df (see df.shape).
# NOTE(review): where the 2099 count comes from is not shown in this
# notebook -- presumably a count of consistent/unique rows; confirm.
2099 / 2121

0.9896275341819897

In [94]:
2121 - 2099  # only 22 entries that could possibly have inconsistent values

22