In [1]:
import pandas as pd
import librosa
import os
import numpy as np
import seaborn as sns
from tqdm import tqdm

# Raw audio (235 full-length clips, no truncation or pitch shifting)

In [2]:
# Compute zero-crossing statistics for every full-length original clip
# (no truncation into 1-second segments, no pitch shifting).
audio_path = '../data/raw_audios/original'
zcr_list = []

# Only feed WAV files to librosa: a stray non-audio entry in the folder
# (e.g. .DS_Store or a checkpoint directory) would otherwise abort the whole run.
wav_filenames = [name for name in os.listdir(audio_path) if name.lower().endswith('.wav')]

for filename in tqdm(wav_filenames):
    new_path = os.path.join(audio_path, filename)
    y, _ = librosa.load(path=new_path)
    # Boolean mask with one entry per sample, True where the signal changes sign.
    zcr_bool = librosa.zero_crossings(y=y, pad=True)
    # Frame-wise zero-crossing rate, summarised below by its mean and median.
    zcr = librosa.feature.zero_crossing_rate(y=y, center=True)
    zcr_avg = np.mean(zcr)
    zcr_median = np.median(zcr)
    # Vectorised count of crossings; the built-in sum() would walk the array
    # one sample at a time in Python.
    zcr_num = int(np.count_nonzero(zcr_bool))
    zcr_list.append([filename, zcr_num, zcr_avg, zcr_median])

100%|██████████| 235/235 [01:02<00:00,  3.78it/s]


In [3]:
# Tabulate the per-clip records collected above.
# Note: the 'youtube_id' column holds the clip's file name, which is the key used by the score table.
zcr_df = pd.DataFrame(
    data=zcr_list,
    columns=['youtube_id', 'zcr_num', 'zcr_avg', 'zcr_median'],
)
zcr_df

Unnamed: 0,youtube_id,zcr_num,zcr_avg,zcr_median
0,normalize_5s_intro_xhmtXrtLkgo.wav,6084,0.054927,0.040283
1,normalize_5s_intro_n4HTXYR-2AI.wav,2060,0.018451,0.012695
2,normalize_5s_intro_hejXc_FSYb8.wav,8523,0.076886,0.073242
3,normalize_5s_intro_IYnu4-69fTA.wav,5663,0.051005,0.047607
4,normalize_5s_intro_RoeRU5zxkak.wav,7183,0.064374,0.030273
...,...,...,...,...
230,normalize_5s_intro_JQTlG7NxJek.wav,3746,0.033716,0.017822
231,normalize_5s_intro_XkC8Uzl9pCY.wav,6339,0.057050,0.058594
232,normalize_5s_intro_1CrxzClzLvs.wav,5996,0.053641,0.050781
233,normalize_5s_intro_xePw8n4xu8o.wav,9761,0.087983,0.094482


In [2]:
# Load the dataset details and keep only the clip key and its memorability score.
score_df = pd.read_csv('../data/YTMM_details.csv').loc[:, ['youtube_id', 'memorability_score']]
score_df.head(n=3)

Unnamed: 0,youtube_id,memorability_score
0,normalize_5s_intro_zyQkFh-E4Ak.wav,0.508772
1,normalize_5s_intro_ZvrysfBDzSs.wav,0.454545
2,normalize_5s_intro_zumMQrI_tMg.wav,0.481928


In [5]:
# Attach the memorability score to each clip by joining on the shared 'youtube_id' key.
# The default join is an inner join, so clips missing from either table are dropped.
df = zcr_df.merge(score_df, on='youtube_id')
df

Unnamed: 0,youtube_id,zcr_num,zcr_avg,zcr_median,memorability_score
0,normalize_5s_intro_xhmtXrtLkgo.wav,6084,0.054927,0.040283,0.897436
1,normalize_5s_intro_n4HTXYR-2AI.wav,2060,0.018451,0.012695,0.875000
2,normalize_5s_intro_hejXc_FSYb8.wav,8523,0.076886,0.073242,0.870130
3,normalize_5s_intro_IYnu4-69fTA.wav,5663,0.051005,0.047607,0.877551
4,normalize_5s_intro_RoeRU5zxkak.wav,7183,0.064374,0.030273,0.383562
...,...,...,...,...,...
230,normalize_5s_intro_JQTlG7NxJek.wav,3746,0.033716,0.017822,0.395349
231,normalize_5s_intro_XkC8Uzl9pCY.wav,6339,0.057050,0.058594,0.769231
232,normalize_5s_intro_1CrxzClzLvs.wav,5996,0.053641,0.050781,0.658537
233,normalize_5s_intro_xePw8n4xu8o.wav,9761,0.087983,0.094482,0.559524


In [3]:
def save(df, col, clips_type, out_dir='./corr_zcr'):
    """Print and plot how one ZCR feature relates to the memorability score.

    Prints the correlation between ``df[col]`` and ``df['memorability_score']``
    (pandas' default method, i.e. Pearson), then writes a scatter plot of the
    two columns to ``<out_dir>/<clips_type>_<col>.png``.

    Args:
        df: DataFrame containing ``col`` and a ``memorability_score`` column.
        col: Name of the feature column to correlate and plot on the x axis.
        clips_type: Label for the clip set; used in the printed line and as
            the prefix of the image file name.
        out_dir: Directory that receives the image. It is created if missing.
    """
    print('{}, corr {} and memorabilty score : {}'.format(clips_type, col, df[col].corr(df['memorability_score'])))
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    figure = sns.scatterplot(x=col, y="memorability_score", data=df).get_figure()
    figure.savefig(os.path.join(out_dir, clips_type + '_' + col + '.png'))
    # The plot was drawn on the current figure; clear it so the next call
    # does not draw on top of these points.
    figure.clf()

In [7]:
# Correlate each ZCR statistic of the full-length clips with memorability.
for zcr_feature in ('zcr_num', 'zcr_avg', 'zcr_median'):
    save(df=df, col=zcr_feature, clips_type='raw')

raw, corr zcr_num and memorabilty score : 0.15977892077269815
raw, corr zcr_avg and memorabilty score : 0.16027710591562633
raw, corr zcr_median and memorabilty score : 0.16841144568769792


<Figure size 432x288 with 0 Axes>

In [8]:
# Persist the merged table as CSV (the file name has no extension).
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./zcr_df', exist_ok=True)
df.to_csv('./zcr_df/raw_zcr_df', index=False)

# Augmented audio clips (235 clips × 9 one-second segments = 2,115 files; × 11 pitch variants = 23,265 files)

In [4]:
# Compute zero-crossing statistics for every 1-second clip (truncated, with and
# without pitch shifting). Un-shifted clips are additionally collected on their own.
truncated_audio_path = '../data/1_second_clips'
truncated_zcr_list = []
truncated_withPS_zcr_list = []

# Only feed WAV files to librosa: a stray non-audio entry in the folder
# would otherwise abort this long-running loop.
clip_filenames = [name for name in os.listdir(truncated_audio_path) if name.lower().endswith('.wav')]

for filename in tqdm(clip_filenames):
    new_path = os.path.join(truncated_audio_path, filename)
    y, _ = librosa.load(path=new_path)
    # Boolean mask with one entry per sample, True where the signal changes sign.
    zcr_bool = librosa.zero_crossings(y=y, pad=True)
    # Frame-wise zero-crossing rate, summarised below by its mean and median.
    zcr = librosa.feature.zero_crossing_rate(y=y, center=True)
    zcr_avg = np.mean(zcr)
    zcr_median = np.median(zcr)
    # Vectorised count of crossings; the built-in sum() would walk the array
    # one sample at a time in Python.
    zcr_num = int(np.count_nonzero(zcr_bool))
    # File names look like '<id>_original_<segment>.wav' or
    # '<id>_<n>_semitones_<segment>.wav'. Splitting from the right keeps the check
    # valid for multi-digit segment numbers and for ids that contain underscores.
    name_parts = os.path.splitext(filename)[0].rsplit('_', 2)
    if len(name_parts) == 3 and name_parts[1] == 'original':
        truncated_zcr_list.append([filename, zcr_num, zcr_avg, zcr_median])
    truncated_withPS_zcr_list.append([filename, zcr_num, zcr_avg, zcr_median])

100%|██████████| 23265/23265 [16:41<00:00, 23.23it/s]


In [10]:
# Sanity check: number of un-shifted clips, then number of all clips.
print(len(truncated_zcr_list), len(truncated_withPS_zcr_list), sep='\n')

2115
23265


# Truncation only (original pitch)

In [11]:
# Tabulate the un-shifted 1-second clips and add a placeholder score column
# that is filled in by the lookup in the next cell.
truncated_zcr_df = pd.DataFrame(
    data=truncated_zcr_list,
    columns=['youtube_id', 'zcr_num', 'zcr_avg', 'zcr_median'],
).assign(memorability_score=0.0)
truncated_zcr_df.head(n=3)

Unnamed: 0,youtube_id,zcr_num,zcr_avg,zcr_median,memorability_score
0,SQBuVfTX1ME_original_7.wav,1056,0.046442,0.050293,0.0
1,gue_crpFdSE_original_9.wav,2040,0.089056,0.092529,0.0
2,Tfypj4UwvvA_original_5.wav,1114,0.048939,0.03833,0.0


In [12]:
# Give every 1-second clip the memorability score of the clip it was cut from.
# Clip files start with the 11-character id, while score_df is keyed by
# 'normalize_5s_intro_<id>.wav'.
score_by_clip = score_df.set_index('youtube_id')['memorability_score']
parent_keys = 'normalize_5s_intro_' + truncated_zcr_df['youtube_id'].str[:11] + '.wav'

# Fail loudly, naming the offending ids, instead of assigning an empty lookup result.
unmatched = ~parent_keys.isin(score_by_clip.index)
if unmatched.any():
    raise ValueError(
        'No memorability score found for: {}'.format(sorted(parent_keys[unmatched].unique()))
    )

# One vectorised lookup replaces a per-row scan of score_df and yields scalar
# values, rather than writing a one-element Series into each cell.
truncated_zcr_df['memorability_score'] = parent_keys.map(score_by_clip)

In [13]:
# Correlate each ZCR statistic of the un-shifted 1-second clips with memorability.
for zcr_feature in ('zcr_num', 'zcr_avg', 'zcr_median'):
    save(df=truncated_zcr_df, col=zcr_feature, clips_type='truncation')

truncation, corr zcr_num and memorabilty score : 0.1823828894097966
truncation, corr zcr_avg and memorabilty score : 0.18236734843183652
truncation, corr zcr_median and memorabilty score : 0.19152266461395703


<Figure size 432x288 with 0 Axes>

In [14]:
# Persist the table of un-shifted 1-second clips as CSV (the file name has no extension).
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./zcr_df', exist_ok=True)
truncated_zcr_df.to_csv('./zcr_df/truncated_zcr_df', index=False)

# Truncation + pitch shifting (PS)

In [5]:
# Tabulate all 1-second clips (original pitch and pitch-shifted) and add a
# placeholder score column that is filled in by the lookup in the next cell.
truncated_withPS_zcr_df = pd.DataFrame(
    data=truncated_withPS_zcr_list,
    columns=['youtube_id', 'zcr_num', 'zcr_avg', 'zcr_median'],
).assign(memorability_score=0.0)
truncated_withPS_zcr_df.head(n=3)

Unnamed: 0,youtube_id,zcr_num,zcr_avg,zcr_median,memorability_score
0,SQBuVfTX1ME_original_7.wav,1056,0.046442,0.050293,0.0
1,a8cJLohQ_Jg_1_semitones_1.wav,1146,0.050237,0.054443,0.0
2,Yh78Ll6-ODQ_-2_semitones_2.wav,2474,0.108387,0.094971,0.0


In [6]:
# Give every augmented clip the memorability score of the clip it was derived from.
# Clip files start with the 11-character id, while score_df is keyed by
# 'normalize_5s_intro_<id>.wav'.
score_by_clip = score_df.set_index('youtube_id')['memorability_score']
parent_keys = 'normalize_5s_intro_' + truncated_withPS_zcr_df['youtube_id'].str[:11] + '.wav'

# Fail loudly, naming the offending ids, instead of assigning an empty lookup result.
unmatched = ~parent_keys.isin(score_by_clip.index)
if unmatched.any():
    raise ValueError(
        'No memorability score found for: {}'.format(sorted(parent_keys[unmatched].unique()))
    )

# One vectorised lookup replaces a per-row scan of score_df and yields scalar
# values, rather than writing a one-element Series into each cell.
truncated_withPS_zcr_df['memorability_score'] = parent_keys.map(score_by_clip)

In [7]:
# Correlate each ZCR statistic of all augmented clips with memorability.
for zcr_feature in ('zcr_num', 'zcr_avg', 'zcr_median'):
    save(df=truncated_withPS_zcr_df, col=zcr_feature, clips_type='truncation&PS')

truncation&PS, corr zcr_num and memorabilty score : 0.16800313567250244
truncation&PS, corr zcr_avg and memorabilty score : 0.1679892177988872
truncation&PS, corr zcr_median and memorabilty score : 0.1769466877834053


<Figure size 432x288 with 0 Axes>

In [8]:
# Persist the table of all augmented clips as CSV (the file name has no extension).
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./zcr_df', exist_ok=True)
truncated_withPS_zcr_df.to_csv('./zcr_df/truncated_withPS_zcr_df', index=False)

In [9]:
# Display the augmented-clip table now that the memorability_score column has been filled in.
truncated_withPS_zcr_df

Unnamed: 0,youtube_id,zcr_num,zcr_avg,zcr_median,memorability_score
0,SQBuVfTX1ME_original_7.wav,1056,0.046442,0.050293,0.800000
1,a8cJLohQ_Jg_1_semitones_1.wav,1146,0.050237,0.054443,0.524390
2,Yh78Ll6-ODQ_-2_semitones_2.wav,2474,0.108387,0.094971,0.708333
3,Yh78Ll6-ODQ_1_semitones_3.wav,2756,0.120228,0.130859,0.708333
4,xhmtXrtLkgo_3_semitones_9.wav,1048,0.045510,0.044434,0.897436
...,...,...,...,...,...
23260,IISA6t-9zzc_4_semitones_7.wav,917,0.040217,0.040771,0.829545
23261,lE_747E_Sdg_-5_semitones_4.wav,1292,0.056818,0.052734,0.883333
23262,RCJx5VW-fQI_5_semitones_2.wav,1656,0.072199,0.067139,0.724638
23263,Z5gvqq3ChII_3_semitones_3.wav,2307,0.101030,0.099854,0.875000
