# Music Informatics: Project

- Vasileios Katsaitis  (1115202000073)
- Dimokritos Kolitsos  (1115201900085)
- Konstantinos Chousos (1115202000215) 

In [6]:
import os
import librosa
import numpy as np
import pandas as pd
import re

sample_rate = 44100
duration = 120 # 2 minutes

def trim_starting_zeros(audio):
    # Find the first non-zero sample
    non_zero_index = np.nonzero(audio)[0]
    if len(non_zero_index) > 0:
        return audio[non_zero_index[0]:]
    return audio

def load_and_trim_audio(file_path):
    audio, sr = librosa.load(file_path, sr=sample_rate)
    
    audio = librosa.util.normalize(audio)
    
    # Trim any starting silence
    trimmed_audio = trim_starting_zeros(audio)
    
    # Take only the first 'duration' seconds after trimming
    trimmed_audio = trimmed_audio[:duration * sample_rate]
    
    return trimmed_audio

def load_audio_dataset(base_path):
    data = []
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith('.mp3'):
                file_path = os.path.join(root, file)
                # Load, trim silence, and get the first 'duration' seconds
                audio = load_and_trim_audio(file_path)
                genre = os.path.basename(os.path.dirname(file_path))
                song_name =  re.sub(r'\(.*\)', '', os.path.basename(file).split('.')[0])
                data.append({
                    'song_name': song_name,
                    'genre': genre,
                    'audio': audio,
                })
    return pd.DataFrame(data)

# Load the dataset
df = load_audio_dataset('./data')

In [7]:
import pickle

with open('./data/df.pkl', 'wb') as f:
   pickle.dump(df, f)

del df

In [8]:
with open('./data/df.pkl', 'rb') as f:
   df = pickle.load(f)

In [9]:
display(df)

Unnamed: 0,song_name,genre,audio
0,Soen - Monarch,rock,"[-2.4001781e-12, -1.7760772e-12, -1.2497233e-1..."
1,The Sun,rock,"[3.1816948e-18, 2.9824925e-16, 2.6774146e-15, ..."
2,Stay,rock,"[1.6969752e-05, 3.9227478e-05, -6.531706e-05, ..."
3,Starman,rock,"[1.0315105e-06, -2.3840616e-06, 2.1909225e-06,..."
4,Greta Van Fleet - Watching Over,rock,"[-1.1536949e-28, -1.551013e-26, -2.100204e-25,..."
5,Fleetwood Mac - The Chain,rock,"[6.259764e-06, -2.915813e-05, -4.9381688e-05, ..."
6,Alter Bridge - Open Your Eyes,rock,"[5.717757e-21, 5.401322e-21, 5.0718765e-21, 4...."
7,The Last Song,rock,"[4.8422044e-05, -3.5529763e-05, 7.3554065e-06,..."
8,Pink Floyd - Breathe,rock,"[0.0004476671, 0.0006660364, 0.00060010894, 0...."
9,No Surprises,rock,"[-2.8215513e-19, -2.3695892e-17, -3.3114269e-1..."


### A. Harmony and/or Melody
---

#### Chromagramms & Key detection

Definitions of the provided key templates.

In [10]:
# key templates

edma = np.array([[1., 0.2875, 0.5020, 0.4048, 0.6050, 0.5614, 0.3205, 0.7966, 0.3159, 0.4506, 0.4202, 0.3889], [1., 0.3096, 0.4415, 0.5827, 0.3262, 0.4948, 0.2889, 0.7804, 0.4328, 0.2903, 0.5331, 0.3217]])
edmm = np.array([[1., 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], [1., 0.2321, 0.4415, 0.6962, 0.3262, 0.4948, 0.2889, 0.7804, 0.4328, 0.2903, 0.5331, 0.3217]])
krumhansl = np.array([[6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88], [6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]])
temperley99 = np.array([[5.0, 2.0, 3.5, 2.0, 4.5, 4.0, 2.0, 4.5, 2.0, 3.5, 1.5, 4.0], [5.0, 2.0, 3.5, 4.5, 2.0, 4.0, 2.0, 4.5, 3.5, 2.0, 1.5, 4.0]])
temperley05 = np.array([[0.748, 0.060, 0.488, 0.082, 0.67, 0.46, 0.096, 0.715, 0.104, 0.366, 0.057, 0.4], [0.712, 0.084, 0.474, 0.618, 0.049, 0.46, 0.105, 0.747, 0.404, 0.067, 0.133, 0.33]])
temperley_essen = np.array([[0.184, 0.001, 0.155, 0.003, 0.191, 0.109, 0.005, 0.214, 0.001, 0.078, 0.004, 0.055], [0.192, 0.005, 0.149, 0.179, 0.002, 0.144, 0.002, 0.201, 0.038, 0.012, 0.053, 0.022]])
thpcp = np.array([[0.95162, 0.20742, 0.71758, 0.22007, 0.71341, 0.48841, 0.31431, 1.00000, 0.20957, 0.53657, 0.22585, 0.55363], [0.94409, 0.21742, 0.64525, 0.63229, 0.27897, 0.57709, 0.26428, 1.0000, 0.26428, 0.30633, 0.45924, 0.35929]])
shaath = np.array([[6.6, 2.0, 3.5, 2.3, 4.6, 4.0, 2.5, 5.2, 2.4, 3.7, 2.3, 3.4], [6.5, 2.7, 3.5, 5.4, 2.6, 3.5, 2.5, 5.2, 4.0, 2.7, 4.3, 3.2]])
gomez = np.array([[0.82, 0.00, 0.55, 0.00, 0.53, 0.30, 0.08, 1.00, 0.00, 0.38, 0.00, 0.47], [0.81, 0.00, 0.53, 0.54, 0.00, 0.27, 0.07, 1.00, 0.27, 0.07, 0.10, 0.36]])
faraldo = np.array([[7.0, 2.0, 3.8, 2.3, 4.7, 4.1, 2.5, 5.2, 2.0, 3.7, 3.0, 3.4], [7.0, 3.0, 3.8, 4.5, 2.6, 3.5, 2.5, 5.2, 4.0, 2.5, 4.5, 3.0]])
pentatonic = np.array([[1.0, 0.1, 0.25, 0.1, 0.5, 0.7, 0.1, 0.8, 0.1, 0.25, 0.1, 0.5], [1.0, 0.2, 0.25, 0.5, 0.1, 0.7, 0.1, 0.8, 0.3, 0.2, 0.6, 0.2]])
# noland = np.array([[0.0629, 0.0146, 0.061, 0.0121, 0.0623, 0.0414, 0.0248, 0.0631, 0.015, 0.0521, 0.0142, 0.0478], [0.0682, 0.0138, 0.0543, 0.0519, 0.0234, 0.0544, 0.0176, 0.067, 0.0349, 0.0297]])

In [22]:
import librosa
import numpy as np

chroma_to_key = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
key_templates = [edma, edmm, krumhansl, temperley99, temperley05, temperley_essen, thpcp, shaath, gomez, faraldo, pentatonic]

def detect_key(audio_signal, sr, method='mean'):
    # Compute the Chroma Short-Time Fourier Transform (chroma_stft)
    chromagram = librosa.feature.chroma_stft(y=audio_signal, sr=sr)
    # Calculate the mean or median chroma feature across time
    if method == 'mean':
        chroma = np.mean(chromagram, axis=1)
    elif method == 'median':
        chroma = np.median(chromagram, axis=1)
    else:
        raise ValueError("Method must be 'mean' or 'median'.")
    
    # Normalize the chroma feature
    norm_chroma = chroma / np.sum(chroma)
    
    # Compute the similarity between the chroma feature and each key template
    similarities = []
    for template in key_templates:
        # dot product
        similarity = np.dot(norm_chroma, template[0]) + np.dot(norm_chroma, template[1])
        similarities.append(similarity)
    
    # Find the key template with the highest similarity
    max_similarity_idx = np.argmax(similarities)
    max_similarity = similarities[max_similarity_idx]
    
    # Determine the key and mode (major or minor)
    key_template = key_templates[max_similarity_idx]
    if max_similarity_idx % 2 == 0:
        mode = 'major'
    else:
        mode = 'minor'
    
    # Find the key by selecting the maximum chroma feature
    estimated_key_index = np.argmax(norm_chroma)
    estimated_key = chroma_to_key[estimated_key_index]
    
    return estimated_key, mode

df['key'], df['mode'] = zip(*df['audio'].apply(lambda x: detect_key(x, sample_rate, method='mean')))

In [21]:
display(df)

Unnamed: 0,song_name,genre,audio,key,mode
0,Soen - Monarch,rock,"[-2.4001781e-12, -1.7760772e-12, -1.2497233e-1...",E,minor
1,The Sun,rock,"[3.1816948e-18, 2.9824925e-16, 2.6774146e-15, ...",D,minor
2,Stay,rock,"[1.6969752e-05, 3.9227478e-05, -6.531706e-05, ...",A,minor
3,Starman,rock,"[1.0315105e-06, -2.3840616e-06, 2.1909225e-06,...",F,minor
4,Greta Van Fleet - Watching Over,rock,"[-1.1536949e-28, -1.551013e-26, -2.100204e-25,...",E,minor
5,Fleetwood Mac - The Chain,rock,"[6.259764e-06, -2.915813e-05, -4.9381688e-05, ...",E,minor
6,Alter Bridge - Open Your Eyes,rock,"[5.717757e-21, 5.401322e-21, 5.0718765e-21, 4....",D,minor
7,The Last Song,rock,"[4.8422044e-05, -3.5529763e-05, 7.3554065e-06,...",C#,minor
8,Pink Floyd - Breathe,rock,"[0.0004476671, 0.0006660364, 0.00060010894, 0....",B,minor
9,No Surprises,rock,"[-2.8215513e-19, -2.3695892e-17, -3.3114269e-1...",A,minor
