In [1]:
import ast
from collections import Counter
import math
import zlib
import pandas as pd

In [3]:
# Path of the CSV file that is loaded later with pd.read_csv.
# NOTE(review): despite the name, this holds a file path, not a directory.
output_directory = "music_metrics.csv"

In [4]:
def extract_features(notes_dict):
    """Split a notes mapping into parallel pitch, duration and velocity lists.

    Each value of ``notes_dict`` is indexed positionally: element 0 goes to
    the pitch list, element 1 to the duration list and element 2 to the
    velocity list. The keys are ignored; output order follows the mapping's
    iteration order.

    Returns:
        A ``(pitches, durations, velocities)`` tuple of three lists.
    """
    pitches, durations, velocities = [], [], []
    for note in notes_dict.values():
        pitches.append(note[0])
        durations.append(note[1])
        velocities.append(note[2])
    return pitches, durations, velocities
def calculate_entropy(sequence):
    """Return the Shannon entropy (in bits) of the values in ``sequence``.

    The probability of each distinct value is its relative frequency in the
    sequence. An empty sequence yields 0 because there is nothing to sum.
    """
    frequencies = Counter(sequence)
    accumulated = 0
    for occurrences in frequencies.values():
        share = occurrences / len(sequence)
        if share > 0:
            # Each term p * log2(p) is <= 0; the sign is flipped on return.
            accumulated += share * math.log2(share)
    return -accumulated
def calculate_compression_ratio(sequence):
    """Return compressed size divided by raw size for ``sequence``.

    The sequence is serialised as a comma-separated string, encoded as UTF-8
    and compressed with zlib. Lower values mean the data is more repetitive.

    Args:
        sequence: Iterable of values; each one is converted with ``str``.

    Returns:
        ``len(compressed) / len(raw)`` as a float, or ``0.0`` when the
        serialised payload is empty (e.g. an empty sequence), matching the
        zero that ``calculate_entropy`` yields for empty input.
    """
    # Encode once and reuse the bytes for both the compression and the size.
    raw = ",".join(map(str, sequence)).encode("utf-8")
    if not raw:
        # Nothing to compress: avoid dividing by a zero-length payload.
        return 0.0
    return len(zlib.compress(raw)) / len(raw)

def calculate_metrix(df):
    """Compute entropy and compression metrics for every row of ``df``.

    Each entry of the ``"Notes"`` column is a string holding a Python dict
    literal. It is parsed with ``ast.literal_eval`` and split into pitch,
    duration and velocity lists by ``extract_features``.

    Returns:
        A list with one dict per row, in row order, keyed by
        ``pitch_entropy``, ``duration_entropy``, ``velocity_entropy``,
        ``pitch_compression``, ``duration_compression`` and
        ``velocity_compression``. ``df`` itself is not modified.
    """

    def metrics_for(raw_notes):
        # Convert the stored string back into a dictionary of notes.
        pitches, durations, velocities = extract_features(ast.literal_eval(raw_notes))
        return {
            "pitch_entropy": calculate_entropy(pitches),
            "duration_entropy": calculate_entropy(durations),
            "velocity_entropy": calculate_entropy(velocities),
            "pitch_compression": calculate_compression_ratio(pitches),
            "duration_compression": calculate_compression_ratio(durations),
            "velocity_compression": calculate_compression_ratio(velocities),
        }

    return [metrics_for(raw_notes) for raw_notes in df["Notes"]]

In [7]:
# Load the dataset from the CSV path stored in output_directory.
df = pd.read_csv(output_directory)
# calculate_metrix returns one dict of metrics per row; store them as a column.
df["Metrix"] = calculate_metrix(df)
print(df["Metrix"])

0       {'pitch_entropy': 4.039494663005557, 'duration...
1       {'pitch_entropy': 3.3698373541095803, 'duratio...
2       {'pitch_entropy': 3.6740738464145557, 'duratio...
3       {'pitch_entropy': 4.404917585176534, 'duration...
4       {'pitch_entropy': 4.484162704131576, 'duration...
                              ...                        
2031    {'pitch_entropy': 4.346988230461985, 'duration...
2032    {'pitch_entropy': 4.517722105737881, 'duration...
2033    {'pitch_entropy': 4.0377172942187, 'duration_e...
2034    {'pitch_entropy': 3.960525399817489, 'duration...
2035    {'pitch_entropy': 4.226813284500464, 'duration...
Name: Metrix, Length: 2036, dtype: object


In [6]:
# (rows, columns) of the frame.
# NOTE(review): this cell's execution count (6) is lower than that of the cell
# before it (7), so the displayed shape may predate the "Metrix" column —
# confirm with Restart Kernel -> Run All.
df.shape

(2036, 7)

In [10]:
def extract_notes_from_df(df):
    """Add ``pitches``, ``durations`` and ``velocities`` columns to ``df``.

    Each entry of the ``"Notes"`` column is a string holding a Python dict
    literal whose values are indexable notes: element 0 goes to ``pitches``,
    element 1 to ``durations`` and element 2 to ``velocities``.

    Args:
        df: DataFrame with a ``"Notes"`` column. It is modified in place.

    Returns:
        The same ``df`` object, now with the three extra list-valued columns.
    """
    # Parse every Notes string once and reuse the result for all three
    # columns, instead of running ast.literal_eval three times per row.
    parsed_notes = df["Notes"].apply(lambda raw: list(ast.literal_eval(raw).values()))
    df["pitches"] = parsed_notes.apply(lambda notes: [note[0] for note in notes])
    df["durations"] = parsed_notes.apply(lambda notes: [note[1] for note in notes])
    df["velocities"] = parsed_notes.apply(lambda notes: [note[2] for note in notes])
    return df


# Adds the pitches / durations / velocities columns; the function mutates df
# and returns it, so the rebinding keeps the same object.
df = extract_notes_from_df(df)

In [16]:
#l = [df["pitches"][0],df["durations"][0],df["velocities"][0]]


print(df["pitches"][0][0]) # each df["pitches"] entry is itself a list, hence the two indices (row, then note)

55


Le dataset ne contient que des partitions de piano, avec une seule piste par morceau (c'est-à-dire par ligne) : c'est exactement ce que nous voulions.