# Data Analysis

In [None]:
from pathlib import Path
from pytube import YouTube
import json

In [None]:
class color:
    """
    Terminal escape codes used to emphasize parts of the printed results.
    """

    # Switches bold text on
    BOLD = '\033[1m'
    # Resets the text formatting
    END = '\033[0m'

### Raw data total duration

In [None]:
def read_jsonl(link_file):
    """
    Read a JSON Lines file and return its records as a list.

    Args:
        link_file: Path (string or path-like) of the jsonl file. Every
            non-blank line must hold exactly one JSON document.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    link_info_dict_list = []
    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
    with open(link_file, "r", encoding="utf-8") as opened_link_file:
        # For each line in the link file
        for line in opened_link_file:

            # Blank lines (e.g. an empty line at the end of the file) carry no record
            # and would make json.loads fail, so skip them
            if not line.strip():
                continue

            # Decode the record and add it to the list
            link_info_dict_list.append(json.loads(line))
    return link_info_dict_list

In [None]:
# Build the path of the links file: <links root>/<data split>/links.jsonl
data_split = 'train'
jsonl_path_main = Path('../Dataset/youtube_music_links/')
jsonl_full_path = jsonl_path_main.joinpath(data_split, 'links.jsonl')

In [None]:
# Load every link record of the selected split into a list
link_info_dict_list = read_jsonl(jsonl_full_path)

In [None]:
def get_valid_raw_duration(link_line_dict):
    """
    Given the link dictionary info, return the duration of the valid music piece (interval split applied).
    Meaning that split: "0-50" of a 2 minute music will return 50 seconds, not 120 seconds.

    Args:
        link_line_dict: Dictionary with the keys 'link' (the video url) and 'split'.
            The split is a comma separated list of "start-end" intervals in seconds,
            where 'end' may be used in place of the video length. An empty (or None)
            split means that the whole video is valid.

    Returns:
        The valid duration in seconds.

    Raises:
        KeyError: If 'link' or 'split' is missing from the dictionary.
        ValueError: If an interval is not of the form "start-end" with integer bounds.
    """
    # Get url and split from the dictionary (a None split is treated like an empty one)
    url = link_line_dict['link']
    split = (link_line_dict['split'] or '').strip()

    # The video length is only needed when there is no split or when an interval
    # runs until 'end'. Skipping the lookup for fully numeric splits avoids
    # querying YouTube for those links.
    if not split or 'end' in split:
        length = YouTube(url).length

        # If no split, then the whole video counts
        if not split:
            return length

        # Prepare the split by replacing end with the length
        split = split.replace('end', str(length))

    # For each interval, add its duration (int() tolerates spaces around the numbers)
    duration = 0
    for interval in split.split(','):
        dur_start, dur_end = interval.split('-')
        duration += int(dur_end) - int(dur_start)

    return duration

In [None]:
def get_total_data_duration(link_info_dict_list):
    """
    Sum the valid duration of every link dictionary in the list and return the total in hours.
    A progress message is printed at every 100th index (starting with index 0).
    """
    seconds = 0
    for position, entry in enumerate(link_info_dict_list):
        # Progress report, printed before the entry at this index is processed
        if not position % 100:
            print(f'The index {position} done!')
        seconds += get_valid_raw_duration(entry)

    # Seconds -> hours
    hours = seconds / 3600
    return hours

In [None]:
# Total valid duration (in hours) over all links of the split
raw_data_total_duration = get_total_data_duration(link_info_dict_list)

In [None]:
# For our data this comes to 62.8 hours
print(f'The raw data is in total {color.BOLD}{raw_data_total_duration}{color.END} hour long')

### Clipped data total duration with slide = 15 seconds

In [None]:
# Root directory of the raw music files (the data split is appended separately)
music_main_path = Path('../Dataset/raw_music/')

In [None]:
# Directory of the selected data split inside the raw music root
music_full_path = music_main_path / data_split

In [None]:
# Path.glob returns a one-shot generator: once it has been iterated (for example
# after re-running a later cell) it silently yields no files any more.
# Materialize the matches as sorted lists so they can be iterated repeatedly
# and in a deterministic order.
json_files_path = sorted(music_full_path.glob('*.json'))
wav_files_path = sorted(music_full_path.glob('*.wav'))

In [None]:
import wave

def get_wav_length(filepath):
    """
    Return the playing time of the wav file at `filepath`, expressed in hours.
    """
    with wave.open(filepath, 'rb') as wav_reader:
        # Number of frames divided by frames per second gives the length in seconds
        seconds = wav_reader.getnframes() / wav_reader.getframerate()

    # Seconds -> hours
    return seconds / 3600

In [None]:
# Running total of the wav lengths, in hours
total_hour_duration = 0

# For wav file in all wav files
for wav in wav_files_path:
    # The path is converted to a plain string before it is handed to get_wav_length
    total_hour_duration += get_wav_length(str(wav))

In [None]:
# For our data this comes to 120 hours
print(f'Clipped data is in total {color.BOLD}{total_hour_duration}{color.END} hour long')

### Calculate the frequencies of different instruments

In [None]:
from collections import defaultdict

In [None]:
def _split_tags(raw):
    """
    Split a comma separated string into its stripped, non-empty entries.
    """
    return [tag.strip() for tag in raw.split(',') if tag.strip()]


def get_variable_counts(json_files_path):
    """
    Function for counting all unique instruments, genres and moods from the total data (json files)

    Every json file must contain an object with the keys 'instrument' and 'genre'
    (comma separated strings, counted case-insensitively) and 'moods' (an iterable
    of mood names, or a comma separated string; counted as written).

    Args:
        json_files_path: Iterable of the json file paths.

    Returns:
        tuple: (instrument_counter, genre_counter, moods_counter), each one a
        defaultdict(int) mapping a name to the number of times it occurred.
    """
    instrument_counter = defaultdict(int)
    genre_counter = defaultdict(int)
    moods_counter = defaultdict(int)
    for i, json_path in enumerate(json_files_path):
        if i % 1000 == 0:
            print(f'Checkpoint {i} passed!')
        with open(json_path, "r", encoding="utf-8") as json_file:
            info_dict = json.load(json_file)

        # Get instruments, genres and moods.
        # Splitting on ',' and stripping handles "a,b" as well as "a, b", and an
        # empty field no longer gets counted as an entry named ''.
        instruments = _split_tags(info_dict['instrument'].lower())
        genres = _split_tags(info_dict['genre'].lower())
        moods = info_dict['moods']
        # A plain string would otherwise be counted character by character
        if isinstance(moods, str):
            moods = _split_tags(moods)

        for instrument in instruments:
            instrument_counter[instrument] += 1

        for genre in genres:
            genre_counter[genre] += 1

        for mood in moods:
            moods_counter[mood] += 1

    return instrument_counter, genre_counter, moods_counter

In [None]:
# Count the instruments, genres and moods over all json files of the split
instrument_counter, genre_counter, moods_counter = get_variable_counts(json_files_path)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Exclude this genre from the statistics. The default value keeps the cell
# re-runnable: without it a second run (key already removed) raises a KeyError.
genre_counter.pop('armenian traditional music', None)

In [None]:
# Categories to plot. The insertion order matters: plot_most_frequents uses the
# position of each entry to pick its subplot and its color.
counters = {'Instruments' : instrument_counter,
            'Moods': moods_counter,
            'Genres': genre_counter}

In [None]:
def plot_most_frequents(counters, colors=('skyblue', 'tomato', 'limegreen'), take_best_n=10):
    """
    Function for plotting most frequent classes for each category in one plot.

    One bar chart is drawn per category (stacked vertically); all charts share
    the same y scale, and every bar is labelled with its count.

    Args:
        counters: Dictionary mapping a category name to a dictionary of
            {class name: count}. Any number of categories is supported.
        colors: Bar colors, one per category. They are reused cyclically when
            there are more categories than colors.
        take_best_n: Number of most frequent classes shown per category.
    """
    n_plots = len(counters)
    if n_plots == 0:
        # Nothing to plot
        return

    # squeeze=False keeps axs two-dimensional, also for a single category
    _, axs = plt.subplots(n_plots, 1, figsize=(10, 4 * n_plots), sharey=True, squeeze=False)

    # Highest bar over all categories, used for the shared y limit
    tallest = 0

    # For each index, category and dictionary:
    for index, (key, dictionary) in enumerate(counters.items()):
        ax = axs[index, 0]

        # Sort the count in the dictionary into list of tuples and take the best N
        best_n_count = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)[:take_best_n]

        # Separate keys and values
        keys = [item[0] for item in best_n_count]
        values = [item[1] for item in best_n_count]
        tallest = max([tallest, *values])

        # Plot barplot
        bars = ax.bar(keys, values, color=colors[index % len(colors)])

        # For bar, get the height and write the corresponding value above it
        for bar in bars:
            yval = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, yval, round(yval, 2), va='bottom', ha='center', size=12)

        # Style the subplot
        ax.grid(axis='y', linestyle='--')
        ax.set_xlabel('')
        ax.set_ylabel('Frequency', size=13)
        ax.set_title(f'Top {take_best_n} {key} appearing in the dataset (frequency)', size=15)
        ax.tick_params(axis='x', rotation=45, labelsize=13)

    # The y axis is shared, so a limit set on one subplot applies to all of them.
    # Set it once from the overall highest bar (plus 20% room for the labels);
    # setting it per subplot would let the last category clip the others.
    if tallest > 0:
        axs[0, 0].set_ylim(0, tallest * 1.2)

    plt.tight_layout()
    plt.show()
        

In [None]:
# Draw the most frequent instruments, moods and genres
plot_most_frequents(counters)

# Evaluation Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the evaluation answers (path is relative to the working directory)
data = pd.read_csv('./additional_tools/Evaluation Data.csv')

In [3]:
# Map every textual answer to its numeric score.
# Each label starts with its score, followed by ' - ' and the description.
replacements = {
    label: int(label.split(' - ')[0])
    for label in (
        # Quality scale
        '5 - Very good',
        '4 - Good',
        '3 - Moderate',
        '2 - Poor',
        '1 - Very poor',
        # Alignment scale
        '5 - Very well aligning',
        '4 - Well aligning',
        '3 - Moderately aligning',
        '2 - Poorly aligning',
        '1 - Very poorly aligning',
    )
}

In [4]:
# Turn the textual answers into their numeric scores, then discard the
# timestamp and e-mail columns
data = data.replace(replacements).drop(columns=['Timestamp', 'Email (Optional)'])

  data = data.replace(replacements)


In [5]:
def find_first_hashtag_number(text):
    """
    Return the first question tag found in the text (a '#' directly followed by
    digits, ex. #1, #2, #3), or None when the text contains no such tag.
    """
    found = re.search(r'#\d+', text)
    return found.group() if found else None

In [6]:
# These are the correct categories for each question:
# question tag -> '<category>_<index of the piece within that category>'
category_mapping = {

    '#1': 'Generated_1',
    '#2': 'MusicGen_1',
    '#3': 'Original_1',
    '#4': 'Original_2',
    '#5': 'Generated_2',
    '#6': 'MusicGen_2',
    '#7': 'MusicGen_3',
    '#8': 'Generated_3',
    '#9': 'Original_3',
    '#10': 'MusicGen_4',
    '#11': 'Original_4',
    '#12': 'Generated_4'
}

In [7]:
# Separate the questions into quality and aligning categories.
# Every question tag is expected to occur in two columns: in column order, the
# first occurrence is the quality rating and the second is the aligning rating.
seen = set()
to_replace = {}

# For column
for col in data.columns:

    # Find the number
    found_number = find_first_hashtag_number(col)

    # A column without a known question tag is not a rated question: leave it
    # unrenamed instead of failing with a KeyError on the category lookup
    if found_number not in category_mapping:
        continue

    # If the first time, then it's quality
    if found_number not in seen:
        seen.add(found_number)
        suffix = '_quality'
    # If the second time, then it's aligning
    else:
        suffix = '_aligning'

    # Map the previous column name to the new one that should be replaced
    to_replace[col] = category_mapping[found_number] + suffix

In [8]:
# Rename the columns to their '<category>_quality' / '<category>_aligning' names
data = data.rename(columns=to_replace)

In [9]:
# Separate the quality and alignment columns (matched by substring of the column name)
quality_columns = [col for col in data.columns if 'quality' in col]
alignment_columns = [col for col in data.columns if 'aligning' in col]

In [10]:
# Split the data into a quality frame and an alignment frame
quality_data = data[quality_columns]
alignment_data = data[alignment_columns]

In [11]:
# Define three categories (these are the prefixes of the values in category_mapping)
categories = ['Generated', 'MusicGen', 'Original']

In [12]:
# Group the column names by category: a column belongs to a category when the
# category name occurs in the column name
quality_separated = {key: [col for col in quality_data.columns if key in col] for key in categories}
alignment_separated = {key: [col for col in alignment_data.columns if key in col] for key in categories}

In [13]:
# Calculate the average for quality.
# The average is pooled over all columns of a category: the sum of all
# non-missing answers divided by the number of non-missing answers.
for category, columns in quality_separated.items():
    # count: number of non-missing answers, total: sum of those answers
    count = 0
    total = 0
    for col in columns:
        count += data[col].count()
        total += data[col].sum(skipna=True)
    print(f'{category} music quality average is equal to = {total/count}')

Generated music quality average is equal to = 3.8452380952380953
MusicGen music quality average is equal to = 3.201183431952663
Original music quality average is equal to = 4.141176470588236


In [14]:
# Calculate the average for alignment.
# The average is pooled over all columns of a category: the sum of all
# non-missing answers divided by the number of non-missing answers.
for category, columns in alignment_separated.items():
    # count: number of non-missing answers, total: sum of those answers
    count = 0
    total = 0
    for col in columns:
        count += data[col].count()
        total += data[col].sum(skipna=True)
    print(f'{category} music alignment average is equal to = {total/count}')

Generated music alignment average is equal to = 3.9583333333333335
MusicGen music alignment average is equal to = 3.1736526946107784
Original music alignment average is equal to = 4.158823529411765
