In [None]:
import gc

import requests
import tempfile  # For temporary file storage
import pandas as pd
from tqdm import tqdm
import plotly.express as px
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import librosa
import numpy as np
from pydub import AudioSegment
from pydub.utils import which

# Explicitly set the path to ffmpeg and ffprobe
AudioSegment.converter = which("ffmpeg")
AudioSegment.ffprobe = which("ffprobe")

In [None]:
import os
os.cpu_count()

In [None]:
mtg_genre_tags_filepath = 'assets/autotagging_genre.tsv'

In [None]:
columns = ['track_id', 'artist_id', 'album_id', 'path', 'duration', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6', 'tag7', 'tag8', 'tag9']


In [None]:
data = pd.read_csv(mtg_genre_tags_filepath, sep='\t', names=columns, engine='python')

In [None]:
data.sample(5)

In [None]:
# Build a single 'tags' string per track from the tag columns (position 5 onward).
def _join_genre_tags(tag_row):
    """Return the row's genre tags, comma-separated, without the 'genre---' marker.

    Missing cells are skipped and only values containing 'genre---' are kept.
    """
    present_tags = tag_row.dropna().astype(str)
    return ', '.join(
        tag.replace('genre---', '') for tag in present_tags if 'genre---' in tag
    )


data['tags'] = data.iloc[:, 5:].apply(_join_genre_tags, axis=1)

# Keep only the metadata columns plus the combined tag string
kept_columns = ['track_id', 'artist_id', 'album_id', 'path', 'duration', 'tags']
data = data[kept_columns]

In [None]:
data = data.drop(index=0)

In [None]:
data.head()

In [None]:
data['duration'] = data['duration'].astype(float)

In [None]:
# Split the comma-separated tag string into a per-track list of genres.
# Empty fragments are dropped so a track with an empty tag string gets []
# instead of [''].
data['genres_list'] = data['tags'].apply(
    lambda x: [genre for genre in x.split(', ') if genre]
)

# One-hot encode all genres in one step. get_dummies returns one 0/1 integer
# column per distinct genre, in sorted order, so the column layout is the same
# on every run (iterating a set of strings is not), and attaching the columns
# with a single concat avoids inserting them one by one, which fragments the frame.
genre_indicators = data['tags'].str.get_dummies(sep=', ')
unique_genres = set(genre_indicators.columns)

# Remove indicator columns left over from an earlier run of this cell before
# attaching the fresh ones, so re-running the cell does not duplicate columns.
data = pd.concat(
    [data.drop(columns=genre_indicators.columns, errors='ignore'), genre_indicators],
    axis=1,
)

In [None]:
genre_distribution = data[list(unique_genres)].sum().sort_values(ascending=False)

In [None]:
fig = px.bar(
    x=genre_distribution.index,  # Genre names (x-axis)
    y=genre_distribution.values,  # Count of tracks (y-axis)
    labels={"x": "Genres", "y": "Number of Tracks"},  # Axis labels
    title="Genre Distribution",
    template="plotly_white"
)

fig.show()

In [None]:
fig = px.box(data,
             x='duration',
             points='outliers', 
             title="Box Plot of Track Durations",
             labels={"duration": "Duration (seconds)"},
             template="plotly_white") 

fig.show()

In [None]:
lyrics_genres = ['pop', 'heavymetal','metal', 'rock', 'hiphop', 'rap', 'indie', 'jazz', 'country']

In [None]:
data_genres = data[['track_id', 'artist_id', 'album_id', 'path', 'duration']+lyrics_genres]

In [None]:
data_genres.head()

In [None]:
# Merge metal and heavy metal (due to similarity and small amount of data for heavy metal).
# assign() builds a new frame instead of writing a column into `data_genres`,
# which is a column subset of `data`; writing into such a subset is what makes
# pandas emit SettingWithCopyWarning.
data_genres = data_genres.assign(
    metal=(data_genres['metal'] | data_genres['heavymetal']).astype(int)
)

# Drop the old 'heavymetal' column
data_genres_filtered = data_genres.drop(columns=['heavymetal'])

In [None]:
common_genres = ['pop', 'metal', 'rock', 'hiphop', 'rap', 'indie', 'jazz', 'country']

In [None]:
genre_columns = common_genres

# Filter out rows where the sum of genre values is 0
data_genres_filtered = data_genres_filtered[data_genres_filtered[genre_columns].sum(axis=1) > 0]

In [None]:
data_genres_filtered.sample(10)

In [None]:
# Delete duplicate tracks
data_genres_filtered = data_genres_filtered.drop_duplicates(subset=['path'])

In [None]:
genre_distribution_filtered = data_genres_filtered[common_genres].sum().sort_values(ascending=False)

In [None]:
# Get the distribution of genres divided by 10 rounded to integer
genre_distribution_filtered_d10_int = genre_distribution_filtered.apply(lambda x: int(x / 10))

In [None]:
genre_distribution_filtered_d10_int['pop']

In [None]:
fig = px.bar(
    x=genre_distribution_filtered.index,  # Genre names (x-axis)
    y=genre_distribution_filtered.values,  # Count of tracks (y-axis)
    labels={"x": "Genres", "y": "Number of Tracks"},  # Axis labels
    title="Genre Distribution Filtered",
    template="plotly_white"
)

fig.show()

In [None]:
fig = px.box(data_genres_filtered,
             x='duration',
             points='outliers',
             title="Box Plot of Track Durations",
             labels={"duration": "Duration (seconds)"},
             template="plotly_white")

fig.show()

In [None]:
data_genres_filtered.to_csv('assets/autotagging_genres_filtered.csv', index=False)

In [None]:
data_genres_filtered

### Download a track from a given remote path into a local directory

Based on MTG_Jamendo repository

In [None]:
CHUNK_SIZE = 512 * 1024  # 512 KB
BASE_URL = "https://cdn.freesound.org/mtg-jamendo/raw_30s/audio/"

# Function to download a track
def download_track(path, save_directory, timeout=(10, 60)):
    """
    Downloads one track from BASE_URL into save_directory.

    The response is streamed to disk in CHUNK_SIZE pieces while a tqdm bar
    reports progress. Any failure is printed and reported as None rather than
    raised, so a caller looping over many tracks can keep going.

    Args:
        path (str): Track path relative to BASE_URL; its base name is used as
            the local file name.
        save_directory (str): Directory to save into (created if missing).
        timeout (float | tuple): Passed to requests.get as the timeout
            (a (connect, read) pair in seconds by default). Without a timeout
            a stalled connection would block the caller indefinitely.

    Returns:
        str | None: Local file path on success, None if anything failed.
    """
    save_location = None
    started_writing = False
    try:
        # Ensure save directory exists (no error if it is already there)
        os.makedirs(save_directory, exist_ok=True)

        # Extract file name and construct URL
        file_name = os.path.basename(path)
        save_location = os.path.join(save_directory, file_name)
        url = BASE_URL + path

        # Stream the file from the URL
        with requests.get(url, stream=True, timeout=timeout) as res:
            res.raise_for_status()
            total = int(res.headers.get('content-length', 0))
            with open(save_location, 'wb') as f:
                # From here on a failure leaves a truncated file behind
                started_writing = True
                with tqdm(total=total, unit='B', unit_scale=True, desc=path) as progressbar:
                    for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
                        f.write(chunk)
                        progressbar.update(len(chunk))

        return save_location  # Return the local file path

    except Exception as e:
        print(f"Error downloading {path}: {e}")
        # Remove the partial file so a truncated download is never mistaken
        # for a complete one. Files this call did not open are left alone.
        if started_writing and os.path.exists(save_location):
            try:
                os.remove(save_location)
            except OSError:
                pass
        return None

### Cut the files to 30s

In [None]:
from pydub import AudioSegment

def cut_audio_to_30s_in_memory(input_path):
    """
    Loads an audio file and keeps at most its central 30 seconds, in memory.

    Audio longer than 30 seconds is sliced to the 30-second window centred on
    its midpoint; anything shorter (or exactly 30 seconds) is returned as loaded.

    Args:
        input_path (str): Path to the input audio file.

    Returns:
        AudioSegment: The (possibly trimmed) audio, or None if loading or
        slicing raised an exception.
    """
    window_ms = 30 * 1000  # pydub lengths and slice indices are in milliseconds
    try:
        clip = AudioSegment.from_file(input_path)
        total_ms = len(clip)

        # Nothing to trim: hand the clip back untouched
        if total_ms <= window_ms:
            print("Audio is already less than or equal to 30 seconds.")
            return clip

        # Start 15 seconds before the midpoint and take 30 seconds from there
        window_start = (total_ms // 2) - (15 * 1000)
        return clip[window_start:window_start + window_ms]

    except Exception as e:
        print(f"Error during cutting: {e}")
        return None


### Extract features from the audio files

In [None]:
def extract_audio_features(input_audio_path):
    """
    Summarises an audio file as a flat mapping of scalar features.

    The file is loaded at its original sampling rate and each feature is
    reduced to a mean: zero crossing rate, harmonic and percussive components,
    tempo, spectral centroid/bandwidth/rolloff, 13 MFCCs, the chroma bins,
    STFT magnitude and RMS.

    Args:
        input_audio_path (str): Path to the audio file to analyse.

    Returns:
        dict: Feature name -> value, in the order listed above, or None if
        loading or any feature computation raised an exception.
    """
    try:
        # sr=None keeps the file's own sampling rate instead of resampling
        signal, sample_rate = librosa.load(input_audio_path, sr=None)

        zcr_frames = librosa.feature.zero_crossing_rate(signal)[0]
        harmonic_part, percussive_part = librosa.effects.hpss(signal)

        # Tempo is estimated from the onset strength envelope
        onset_envelope = librosa.onset.onset_strength(y=signal, sr=sample_rate)
        tempo_estimate = librosa.feature.tempo(onset_envelope=onset_envelope, sr=sample_rate)[0]

        summary = {
            "zero_crossing_rate": np.mean(zcr_frames),
            "harmonic_mean": np.mean(harmonic_part),
            "percussive_mean": np.mean(percussive_part),
            "tempo_bpm": tempo_estimate,
        }

        # Spectral descriptors share one call shape, so compute them in a loop
        spectral_extractors = (
            ("spectral_centroid_mean", librosa.feature.spectral_centroid),
            ("spectral_bandwidth_mean", librosa.feature.spectral_bandwidth),
            ("spectral_rolloff_mean", librosa.feature.spectral_rolloff),
        )
        for feature_name, extractor in spectral_extractors:
            summary[feature_name] = np.mean(extractor(y=signal, sr=sample_rate)[0])

        # One mean per MFCC coefficient, numbered from 1
        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
        summary.update(
            {f"mfcc_{idx}_mean": np.mean(band) for idx, band in enumerate(mfcc_matrix, 1)}
        )

        # One mean per chroma bin, numbered from 1
        chroma_means = np.mean(librosa.feature.chroma_stft(y=signal, sr=sample_rate), axis=1)
        summary.update(
            {f"chroma_{idx}_mean": bin_mean for idx, bin_mean in enumerate(chroma_means, 1)}
        )

        summary["stft_magnitude_mean"] = np.mean(np.abs(librosa.stft(signal)))
        summary["rms_mean"] = np.mean(librosa.feature.rms(y=signal)[0])

        return summary

    except Exception as e:
        print(f"Error extracting features: {e}")
        return None

### Download the file, cut it to 30s and extract features and then delete - repeat for all files in autotagging_genres_filtered.csv

In [None]:
import tempfile  # For temporary file storage

def _remove_if_exists(file_path):
    """Deletes file_path, treating an already-missing file as success."""
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass


def process_single_track(row, save_directory):
    """
    Processes a single track: downloads, cuts to 30s, extracts features, and deletes the files.

    Both the downloaded file and the temporary WAV are removed in `finally`
    blocks, so they are cleaned up even when a later step raises (for example
    when exporting the WAV fails).

    Args:
        row (pd.Series): Row containing metadata and path.
        save_directory (str): Temporary directory for storing downloaded audio.

    Returns:
        dict: Extracted features combined with metadata, or None if processing fails.
    """
    path = None  # kept for the error message even if reading row['path'] fails
    try:
        path = row['path']
        metadata = row.drop('path')  # Exclude path column from metadata

        # Download the track
        local_path = download_track(path, save_directory)
        if not local_path:
            return None

        # Cut the audio to 30 seconds in memory; the download is no longer
        # needed once it has been loaded, whether or not cutting succeeded.
        try:
            audio_segment = cut_audio_to_30s_in_memory(local_path)
        finally:
            _remove_if_exists(local_path)

        if audio_segment is None:
            return None

        # Write the 30-second segment to a temporary file for librosa processing
        temp_audio_path = None
        try:
            # Only reserve a unique name here; the handle is closed before
            # export() reopens the path to write the audio.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
            audio_segment.export(temp_audio_path, format="wav")

            # Extract features
            features = extract_audio_features(temp_audio_path)
        finally:
            if temp_audio_path is not None:
                _remove_if_exists(temp_audio_path)

        if features is None:
            return None

        # Combine metadata and features
        return {**metadata, **features}

    except Exception as e:
        print(f"Error processing track {path}: {e}")
        return None

def process_tracks_sequentially(autotagging_genres_filtered, save_directory):
    """
    Runs process_single_track over every row, one at a time, and tabulates the results.

    Rows whose processing returns nothing, or raises, are left out of the
    result (a raised error is printed); a tqdm bar tracks progress.

    Args:
        autotagging_genres_filtered (pd.DataFrame): DataFrame containing metadata and paths.
        save_directory (str): Temporary directory for storing downloaded audio.

    Returns:
        pd.DataFrame: One row per successfully processed track (features plus metadata).
    """
    collected = []

    progress = tqdm(
        autotagging_genres_filtered.iterrows(),
        total=len(autotagging_genres_filtered),
        desc="Processing Tracks",
    )
    for _index, track_row in progress:
        try:
            outcome = process_single_track(track_row, save_directory)
            if not outcome:
                continue
            collected.append(outcome)
        except Exception as e:
            print(f"Error during processing: {e}")

    # Build the frame once from the accumulated records
    return pd.DataFrame(collected)


In [None]:
data_genres_filtered['num_genres'] = data_genres_filtered[common_genres].sum(axis=1)

In [None]:
# Distribution of number of genres
data_genres_filtered['num_genres'].value_counts()

In [None]:
data_genres_one_genre = data_genres_filtered[data_genres_filtered['num_genres'] == 1]

In [None]:
# Introduce column named 'genre' with text value of the genre.
# Each row has exactly one genre flag set, so idxmax over the genre columns
# returns that genre's name. assign() returns a new frame rather than writing
# a column into a boolean-filtered slice of `data_genres_filtered`, which is
# what makes pandas emit SettingWithCopyWarning.
data_genres_one_genre = data_genres_one_genre.assign(
    genre=data_genres_one_genre[common_genres].idxmax(axis=1)
)

In [None]:
data_genres_one_genre.sample(5)

In [None]:
# Save to csv
data_genres_one_genre.to_csv('assets/genres_one_genre.csv', index=False)

In [None]:
# Randomly choose up to 125 single-genre tracks from each genre into one dataframe.
# A fixed random_state makes the subset reproducible across kernel restarts, and
# capping the request at the number of available rows keeps sample() from raising
# for a genre that has fewer than 125 single-genre tracks.
tracks_per_genre = 125

test_subset = pd.concat([
    genre_tracks.sample(min(tracks_per_genre, len(genre_tracks)), random_state=42)
    for genre_tracks in (
        data_genres_one_genre[data_genres_one_genre[genre] == 1]
        for genre in common_genres
    )
])

test_subset.to_csv('assets/test_subset.csv', index=False)

In [None]:
genre_distribution_test_subset = test_subset[common_genres].sum().sort_values(ascending=False)

In [None]:
fig = px.bar(
    x=genre_distribution_test_subset.index,  # Genre names (x-axis)
    y=genre_distribution_test_subset.values,  # Count of tracks (y-axis)
    labels={"x": "Genres", "y": "Number of Tracks"},  # Axis labels
    title="Genre Distribution Test Subset",
    template="plotly_white"
)

fig.show()

In [None]:
data_genres_one_genre = data_genres_one_genre.drop(columns=['num_genres'])

In [None]:
# Splitting the data into two parts due to memory issues
data_genres_one_genre_subset_1 = data_genres_one_genre.iloc[:8000]
data_genres_one_genre_subset_2 = data_genres_one_genre.iloc[8000:]

In [None]:
save_directory = "temp_audio"
processed_data = process_tracks_sequentially(data_genres_one_genre_subset_1, save_directory)

# Save to file
output_file = "assets/genres_with_extracted_features_subset_1.csv"
processed_data.to_csv(output_file, index=False)

In [None]:
save_directory = "temp_audio"
processed_data = process_tracks_sequentially(data_genres_one_genre_subset_2, save_directory)

# Save to file
output_file = "assets/genres_with_extracted_features_subset_2.csv"
processed_data.to_csv(output_file, index=False)

In [None]:
processed_data #12885

In [None]:
# Randomly choose tracks from each genre in proportion to its size: one tenth of
# the genre's track count in the filtered data, combined into one dataframe.
# The request is capped at the number of single-genre rows available so sample()
# cannot raise, and a fixed random_state makes the subset reproducible.

test_subset_inbalanced = pd.concat([
    genre_tracks.sample(
        min(int(genre_distribution_filtered_d10_int[genre]), len(genre_tracks)),
        random_state=42,
    )
    for genre, genre_tracks in (
        (genre, data_genres_one_genre[data_genres_one_genre[genre] == 1])
        for genre in common_genres
    )
])
test_subset_inbalanced.to_csv('assets/test_subset_inbalanced_big.csv', index=False)
genre_distribution_test_subset_inbalanced = test_subset_inbalanced[common_genres].sum().sort_values(ascending=False)

In [None]:
# Bar chart of how many tracks each genre contributes to the imbalanced test subset
fig = px.bar(
    x=genre_distribution_test_subset_inbalanced.index,  # Genre names (x-axis)
    y=genre_distribution_test_subset_inbalanced.values,  # Count of tracks (y-axis)
    labels={"x": "Genres", "y": "Number of Tracks"},  # Axis labels
    # Distinct title so this chart is not confused with the balanced test-subset chart
    title="Genre Distribution Test Subset (Imbalanced)",
    template="plotly_white"
)

fig.show()

In [None]:
save_directory = "temp_audio"
processed_data = process_tracks_sequentially(test_subset_inbalanced, save_directory)

# Save to file
output_file = "assets/genres_with_extracted_features_inbalanced_big.csv"
processed_data.to_csv(output_file, index=False)

In [None]:
data.sample()

In [None]:
subset_1 = pd.read_csv('assets/genres_with_extracted_features_subset_1.csv')
subset_2 = pd.read_csv('assets/genres_with_extracted_features_subset_2.csv')

In [None]:
# Where heavymetal is 1, metal should also be 1
subset_1

In [None]:
# Fold 'heavymetal' into 'metal'. The frames passed to feature extraction in this
# notebook already have 'heavymetal' merged and dropped, so a freshly written CSV
# has no such column; only merge when an older file still carries it.
if 'heavymetal' in subset_1.columns:
    subset_1['metal'] = (subset_1['metal'] | subset_1['heavymetal']).astype(int)

In [None]:
# Show the heavy-metal rows; fall back to an empty selection when the CSV has no
# 'heavymetal' column, instead of raising KeyError.
subset_1[subset_1['heavymetal'] == 1] if 'heavymetal' in subset_1.columns else subset_1.iloc[0:0]

In [None]:
# errors='ignore' keeps this cell re-runnable and tolerant of CSVs that were
# written without a 'heavymetal' column.
subset_1 = subset_1.drop(columns='heavymetal', errors='ignore')

In [None]:
subset_1

In [None]:
# Same heavymetal -> metal merge for the second subset. The column is only present
# in CSVs produced before 'heavymetal' was dropped upstream, so guard the merge and
# make the drop a no-op when the column is absent.
if 'heavymetal' in subset_2.columns:
    subset_2['metal'] = (subset_2['metal'] | subset_2['heavymetal']).astype(int)
subset_2 = subset_2.drop(columns='heavymetal', errors='ignore')

In [None]:
# Combine subset 1 and 2 into one
combined_df = pd.concat([subset_1, subset_2])

In [None]:
combined_df

In [None]:
# index=False matches every other export in this notebook; without it the
# concatenated frame's index (which restarts at 0 for the second subset) would be
# written as an extra unnamed column.
combined_df.to_csv('assets/data_one_genre_with_extracted_features_full.csv', index=False)