## Dataset Related

This notebook collects the functions and processes used in our work to curate the Music4All and AudioSet datasets.

### Music4All

To gain access to the contents of this dataset visit [this page](https://sites.google.com/view/contact4music4all) to contact the dataset's creators. 

The final train-val-test split for this dataset can be found in this [kaggle dataset](https://kaggle.com/datasets/e78330360ccc38240615520a24abff467392708cb1426d10f841520cf1fb35ff) or this [google drive folder](https://drive.google.com/drive/folders/1vafYhkNSFiJ8w0YmCLmZWRvABZhKVK-c?usp=drive_link). The json files contain id, lyrics, genre label (primary_short) and emotion label for each entry.

Tokenized lyrics and audio spectrograms can be found in the following kaggle datasets: [tokens dataset](https://kaggle.com/datasets/f7bc1aca99a5c94c47418b505754ae54881a6c351478bb68f517d90150a0a256), [spectrogram dataset](https://kaggle.com/datasets/b9ae5846ec8b60e6de1e2b20c87920abbb739afb196914e6dbcac9611dbfafc9) or these drive files and folders: [tokenized lyrics file](https://drive.google.com/file/d/1RgGpqTdik-MKFBOMRA4lp2ad0lIrAKsr/view?usp=sharing) and [spectrograms folder](https://drive.google.com/drive/folders/10cUVpCQbOqm_gqboucYzwYdbdeNmr-fb?usp=sharing)

#### Genre Mapping



In [None]:
# Read the tab-separated id -> genres table and fan the comma-separated
# genre string out into one column per tag (genre1, genre2, ...).
genre_table = pd.read_csv("./id_genres.csv", sep="\t")
genre_parts = genre_table["genres"].str.split(pat=",", expand=True)
m4a_df = pd.concat([genre_table["id"], genre_parts], axis=1)
m4a_df.columns = ["id"] + [f"genre{position}" for position in range(1, m4a_df.shape[1])]

# Keep only the rows whose 'lang' value is 'en', then join them to the genre table on 'id'.
id_lang_df = pd.read_csv("./id_lang.csv", sep="\t")
id_lang_df_en = id_lang_df.loc[id_lang_df["lang"] == "en"]
merged_df = m4a_df.merge(id_lang_df_en, on="id")
merged_df

In [None]:
# Reset the label column first: the masks below look at every column from
# position 2 onwards, and that slice includes 'primary_genre' itself. Without
# the reset, re-running this cell would match against the previous run's labels.
merged_df['primary_genre'] = 'other'

genre1 = merged_df['genre1']
genre2 = merged_df['genre2']
# Every column after 'id' and 'genre1' (the secondary tags plus any trailing columns).
secondary = merged_df.iloc[:, 2:]
# One space-joined string per row, built once and shared by every substring
# rule instead of re-running a row-wise apply for each of them.
secondary_text = secondary.apply(lambda row: ' '.join(map(str, row)), axis=1)


def has_tag(*tags):
    """Boolean mask: some secondary column equals one of `tags` exactly."""
    return secondary.isin(list(tags)).any(axis=1)


def mentions(word):
    """Boolean mask: `word` occurs as a substring of the joined secondary columns."""
    return secondary_text.map(lambda text: word in text)


pop_mask = (
    genre1.isin(['k-pop', 'indie pop', 'electropop', 'soft rock', 'power pop',
                 'indietronica', 'chillwave', 'art pop', 'j-pop', 'pop rock',
                 'electroclash', 'alternative pop',
                 'nu gaze', 'uk pop', 'operatic pop', 'europop', 'swedish pop',
                 'canadian country', 'romanian pop', 'french pop',
                 'indonesian indie', 'swedish synthpop'])
    | (genre1.eq('pop') & ~has_tag('rock', 'alternative rock', 'indie rock', 'electronic', 'folk'))
    | (genre1.isin(['singer-songwriter', 'disco', 'soundtrack', 'eurovision', 'disney']) & has_tag('pop'))
    | (genre1.isin(['synthpop', 'shoegaze', 'dream pop', 'lo-fi']) & mentions('pop'))
    | (genre1.eq('britpop') & ~mentions('rock'))
    | (genre1.eq('indie folk') & ~has_tag('folk'))
    | (genre1.eq('synthpop') & genre2.isna())
)

# Rows with genre1 == 'rock' that fail the rule below are not claimed by any
# other bucket and therefore end up as 'other'.
rock_mask = (
    genre1.isin(['classic rock', 'progressive rock', 'psychedelic rock', 'glam rock',
                 'garage rock', 'folk rock', 'southern rock'])
    | (genre1.eq('rock')
       & ~genre2.isin(['alternative rock', 'indie rock'])
       & ~has_tag('punk', 'pop', 'electronic', 'folk'))
    # Same rows as the singer-songwriter rule in pop_mask, which np.select
    # checks first, so these rows are never labelled 'rock'. Kept so the mask
    # itself is unchanged.
    | (genre1.eq('singer-songwriter') & has_tag('pop'))
    | (genre1.eq('hard rock') & ~mentions('metal'))
    | (genre1.eq('soundtrack') & has_tag('rock'))
)

# 'dream pop' and 'shoegaze' land here only when pop_mask did not claim them
# first. 'indie rock' used to carry an extra "primary_genre == 'other'" term,
# which is always true right after the reset at the top of this cell.
alternative_rock_mask = (
    genre1.isin(['alternative rock', 'pop punk', 'noise rock', 'post-rock', 'math rock',
                 'dream pop', 'shoegaze', 'indie rock'])
    | (genre1.eq('grunge') & ~mentions('metal'))
    | (genre1.eq('britpop') & mentions('rock'))
    | (genre1.isin(['lo-fi', 'experimental']) & has_tag('indie rock', 'alternative rock'))
)

rhythm_and_blues_mask = (
    genre1.isin(['rhythm and blues', 'soul', 'funk', 'new jack swing', 'motown',
                 'neo soul', 'acid jazz', 'quiet storm'])
    | (genre1.eq('disco') & ~has_tag('pop'))
)

electronic_mask = (
    genre1.isin(['house', 'dubstep', 'trance', 'ambient', 'downtempo', 'eurodance', 'electronica',
                 'new age', 'drum and bass', 'grime', 'techno', 'deep house', 'electro', 'trip hop',
                 'lounge', 'witch house', 'electro house',
                 'minimal', 'remix', 'beats',
                 'industrial', 'ebm', 'futurepop', 'noise', 'krautrock',
                 'progressive house', 'uk garage', 'edm', 'electronic rock', 'chiptune', 'glitch',
                 'vaporwave', 'minimal wave', 'future garage',
                 'dark techno', 'tribal house', 'hyperpop', 'jumpstyle', 'bass house', 'acid house',
                 'neurofunk', 'aggrotech', 'phonk', 'electropunk', 'vocaloid',
                 'italo house', 'forro', 'kizomba', 'funk carioca', 'tecnobrega', 'noise punk'])
    | (genre1.eq('electronic') & has_tag('pop', 'rock'))
    | (genre1.isin(['synthpop', 'experimental', 'soundtrack']) & has_tag('electronic'))
    | (genre1.eq('synthpop') & has_tag('minimal wave'))
)

hip_hop_mask = genre1.isin([
    'hip hop', 'rap', 'hardcore hip hop',
    'emo rap', 'underground hip hop', 'experimental hip hop', 'drill', 'jazz rap',
    'horrorcore', 'uk hip hop',
    'old school hip hop', 'uk drill', 'underground rap', 'trap music', 'irish hip hop',
    'country rap', 'trap soul',
])

# A former "folk without pop/rock tags" clause is omitted: every genre1 == 'folk'
# row is already matched by the plain list, so it never changed the mask.
folk_mask = (
    genre1.isin(['folk', 'country', 'celtic',
                 'neofolk', 'folktronica', 'contemporary folk', 'psychedelic folk', 'folk-pop'])
    | (genre1.isin(['singer-songwriter', 'soundtrack', 'indie folk']) & has_tag('folk'))
)

blue_note_mask = genre1.isin(['jazz', 'blues', 'swing', 'gospel', 'smooth jazz',
                              'jazz fusion', 'nu jazz', 'vocal jazz',
                              'contemporary gospel', 'praise'])

metal_mask = (
    genre1.isin(['metal', 'black metal', 'death metal', 'thrash metal', 'gothic metal',
                 'alternative metal', 'progressive metal', 'power metal', 'symphonic metal',
                 'nu metal', 'doom metal', 'melodic death metal', 'stoner rock', 'deathcore',
                 'post-metal', 'groove metal', 'technical death metal', 'viking metal',
                 'stoner metal', 'folk metal', 'nwobhm', 'djent', 'atmospheric black metal',
                 'symphonic death metal', 'symphonic black metal',
                 'depressive black metal', 'post-black metal', 'progressive black metal',
                 'gothic rock', 'industrial metal', 'industrial rock',
                 'speed metal', 'progressive metalcore', 'brutal death metal', 'sludge metal',
                 'symphonic power metal', 'melodic black metal', 'melodic metalcore',
                 'deathgrind', 'technical deathcore'])
    | (genre1.isin(['hard rock', 'grunge', 'grindcore']) & mentions('metal'))
    | (genre1.eq('metalcore') & ~mentions('hardcore'))
)

punk_mask = (
    genre1.isin(['post-punk', 'new wave', 'horror punk', 'riot grrrl', 'ska punk',
                 'dark cabaret', 'psychobilly',
                 'skate punk', 'folk punk', 'emocore', 'anarcho-punk', 'punk blues', 'no wave'])
    | (genre1.eq('punk') & ~has_tag('rock'))
    | (genre1.eq('synthpop') & has_tag('new wave'))
)

hardcore_mask = (
    genre1.isin(['hardcore', 'post-hardcore', 'emo', 'melodic hardcore', 'screamo', 'mathcore',
                 'hardcore punk', 'hardstyle', 'chaotic hardcore'])
    | (genre1.eq('metalcore') & mentions('hardcore'))
    | (genre1.eq('grindcore') & ~mentions('metal'))
)

jamaican_mask = genre1.isin(['reggae', 'ska', 'dancehall', 'dub'])

# Order matters: np.select assigns the label of the FIRST mask that is true for a row.
# NOTE(review): the 'rythm and blues' spelling is kept as-is because changing it
# would change the emitted labels; confirm nothing depends on it before fixing.
genre_rules = [
    ('pop', pop_mask),
    ('rock', rock_mask),
    ('alternative rock', alternative_rock_mask),
    ('rythm and blues', rhythm_and_blues_mask),
    ('electronic', electronic_mask),
    ('hip hop', hip_hop_mask),
    ('folk', folk_mask),
    ('blue note', blue_note_mask),
    ('metal', metal_mask),
    ('punk', punk_mask),
    ('hardcore', hardcore_mask),
    ('jamaican', jamaican_mask),
]
conditions = [mask for _, mask in genre_rules]
values = [label for label, _ in genre_rules]

merged_df['primary_genre'] = np.select(conditions, values, default='other')
merged_df

#### Emotion Mapping

The `map_to_emotion` function assigns an emotion label to a song based on its valence and energy values.

In [None]:
def map_to_emotion(valence, energy):
    """Return the emotion label for a (valence, energy) pair.

    Each score is bucketed as high (> 0.65), low (< 0.35) or mid (anything
    else, including exactly 0.35 and 0.65). The pair of buckets selects one
    of nine labels.
    """
    def bucket(score):
        if score > 0.65:
            return 'high'
        if score < 0.35:
            return 'low'
        return 'mid'

    # Outer key: valence bucket; inner key: energy bucket.
    labels = {
        'high': {'high': 'Excited', 'low': 'Relaxed', 'mid': 'Happy'},
        'low': {'high': 'Angry', 'low': 'Depressed', 'mid': 'Sad'},
        'mid': {'high': 'Tense', 'low': 'Calm', 'mid': 'Neutral'},
    }
    return labels[bucket(valence)][bucket(energy)]

#### Aggregating the dataset

The following cells can be used to fetch new data from Spotify. You will need your own `client_id` and `client_secret` from [Spotify for Developers](https://developer.spotify.com/).

The first cell fetches new songs whose valence and energy values fall in a given range. As of 2024, Spotify does not allow searching for songs by their audio metadata. To find e.g. 'relaxed' songs, one has to loop through genres and years (with a varying offset), fetching 50 songs at a time and keeping only those with the required metadata.

The second cell describes how to get lyrics for the newly found songs. For optimization purposes it uses two lyric sources, together with a function that filters for English lyrics and a function that cleans the lyrics returned by `lyricsgenius`. As before, you will need a key from the [Genius API management page](https://genius.com/api-clients).

In [None]:
# Spotify client using the client-credentials flow. `client_id` and
# `client_secret` must already be defined; do not hardcode them in the notebook.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

new_tracks_relaxed = []
years = range(1970, 2024)
genres = ['smooth jazz', 'country', 'r&b', 'gospel', 'blues', 'jazz', 'jamaican', 'indie folk', 'electronic', 'chill-out', 'downtempo', 'new age']

# The crawl stops as soon as this many 'Relaxed' tracks have been collected.
RELAXED_TARGET = 10000
target_reached = False

# NOTE(review): `existing_spotify_ids` is expected to be a set of already-known
# track ids defined before this cell runs; confirm against the rest of the notebook.

try:
    for offset in range(0, 300, 50):
        print(offset)
        for genre in genres:
            for year in years:

                print(f"Tracks found - Relaxed: {len(new_tracks_relaxed)}")
                time.sleep(random.uniform(3, 9))

                try:
                    results = sp.search(q=f'genre:"{genre}" year:{year}', type='track', limit=50, offset=offset)
                except SpotifyException as e:
                    # Skip this (genre, year, offset) combination: without a fresh
                    # response there is nothing valid to process.
                    print(e)
                    print("Genre: " + genre + " Year: " + str(year) + " Offset: " + str(offset))
                    continue

                # Unseen tracks that have a preview URL, keyed by Spotify id.
                candidates = {}
                for item in results['tracks']['items']:
                    if item['id'] not in existing_spotify_ids and item['preview_url']:
                        candidates.setdefault(item['id'], item)

                if not candidates:
                    # Nothing new on this page; skip the audio-features request.
                    continue

                audio_features_list = None

                time.sleep(random.uniform(3, 9))
                try:
                    audio_features_list = sp.audio_features(list(candidates))
                except SpotifyException as e:
                    print(e)
                    print("Genre: " + genre + " Year: " + str(year) + " Offset: " + str(offset))

                if audio_features_list is None:
                    print(f"Audio features NoneType for year {year}")
                    continue

                for audio_features in audio_features_list:
                    if not audio_features:
                        continue
                    track = candidates.get(audio_features['id'])
                    if track is None:
                        # The features entry does not correspond to a requested track.
                        continue

                    emotion = map_to_emotion(audio_features['valence'], audio_features['energy'])
                    if emotion != 'Relaxed' or len(new_tracks_relaxed) >= RELAXED_TARGET:
                        continue

                    new_tracks_relaxed.append({
                        'spotify_id': track['id'],
                        'check_id': audio_features['id'],
                        'name': track['name'],
                        'artist': track['artists'][0]['name'],
                        'popularity': track['popularity'],
                        'release_date': track['album']['release_date'],
                        'danceability': audio_features['danceability'],
                        'energy': audio_features['energy'],
                        'key': audio_features['key'],
                        'mode': audio_features['mode'],
                        'valence': audio_features['valence'],
                        'tempo': audio_features['tempo'],
                        'duration_ms': audio_features['duration_ms'],
                        'preview_url': track['preview_url'],
                        'genre': genre
                    })
                    existing_spotify_ids.add(track['id'])

                if len(new_tracks_relaxed) >= RELAXED_TARGET:
                    target_reached = True
                    break

            if target_reached:
                break

        if target_reached:
            break

        # Pause between offset passes.
        time.sleep(60)
except Exception as e:
    print(f"An error occurred: {e}")
    print(f"Current year: {year}, Offset: {offset}")

In [None]:
# Genius API client; `key` is the Genius access token and must already be defined.
# The client is created with `verbose` off and `skip_non_songs` on.
genius = lyricsgenius.Genius(key, verbose=False, skip_non_songs=True)

def get_lyrics(artist, title):
    """Fetch the lyrics of a song from lyrics.lyricfind.com.

    The page URL is built from the lower-cased artist and title with spaces
    replaced by hyphens. The lyrics are read from the JSON embedded in the
    page's ``__NEXT_DATA__`` script tag, under
    ``props -> pageProps -> songData -> track -> lyrics``.

    Args:
        artist: Artist name.
        title: Song title.

    Returns:
        The lyrics, or None when the request fails, the page carries no
        usable ``__NEXT_DATA__`` payload, or the lyrics field is missing or
        empty.
    """
    url = f"https://lyrics.lyricfind.com/lyrics/{artist.lower().replace(' ', '-')}-{title.lower().replace(' ', '-')}"
    try:
        # The timeout keeps one unresponsive request from stalling a whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching lyrics: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', id='__NEXT_DATA__')
    if not script_tag or not script_tag.string:
        return None

    try:
        node = json.loads(script_tag.string)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the payload was not valid JSON.
        return None

    # Walk the nested payload one level at a time so that a missing or null
    # intermediate value yields None instead of raising.
    for field in ('props', 'pageProps', 'songData', 'track', 'lyrics'):
        if not isinstance(node, dict):
            return None
        node = node.get(field)
    return node if node else None

def is_english(text, num_trials=5, threshold=0.42):
    """Decide whether `text` is English using repeated language detection.

    `detect_langs` is called up to `num_trials` times (always at least once).
    The text counts as English as soon as one run reports 'en' with a
    probability of at least `threshold`.

    Args:
        text: The text to classify.
        num_trials: Maximum number of detection runs.
        threshold: Minimum probability for 'en' to be accepted.

    Returns:
        True if any run accepts the text as English, otherwise False. Empty,
        None or whitespace-only text is never English and is not passed to
        the detector.
    """
    if not text or not text.strip():
        return False

    # NOTE(review): retrying only helps if detect_langs can return different
    # results for the same text on different calls; confirm this is intended.
    for _ in range(max(1, num_trials)):
        if any(guess.lang == 'en' and guess.prob >= threshold for guess in detect_langs(text)):
            return True
    return False
    
def clean_lyrics(lyrics):
    """Strip the first line and leftover markup from a lyrics string.

    Everything up to and including the first newline is dropped. Bracketed
    tags followed by a newline and a trailing ``<digits>Embed`` marker are
    then removed (case-insensitively), and the result is stripped of
    surrounding whitespace.
    """
    # Text after the first newline; empty when there is no newline at all.
    body = lyrics.partition('\n')[2]
    noise = re.compile(r'\[.*?\]\n|\d*Embed$', flags=re.IGNORECASE)
    return noise.sub('', body).strip()

def get_lyrics_genius(artist, title):
    """Look a song up through the `genius` client and return its cleaned lyrics.

    A hit is accepted only when its artist equals `artist` (case-insensitive)
    and its title is contained in `title` (case-insensitive). Any exception
    is printed and turned into None.

    Returns:
        The lyrics after `clean_lyrics`, or None when there is no acceptable
        hit, the cleaned lyrics are empty, or an error occurred.
    """
    try:
        hit = genius.search_song(title, artist)
        if not hit:
            return None
        if hit.artist.lower() != artist.lower():
            return None
        if hit.title.lower() not in title.lower():
            return None
        text = clean_lyrics(hit.lyrics)
        return text if text else None
    except Exception as e:
        print(f"Error fetching lyrics: {e}")
        return None

# Example usage: attach lyrics to every track in new_tracks_relaxed.
# LyricFind is tried first; Genius is the fallback. Lyrics are kept only when
# is_english accepts them, otherwise track['lyrics'] is set to None.
for track in tqdm.tqdm(new_tracks_relaxed):
    time.sleep(1)  # pause between tracks
    artist, name = track.get('artist'), track.get('name')

    lyrics = get_lyrics(artist, name)
    if lyrics and is_english(lyrics):
        track['lyrics'] = lyrics
        continue

    lyricsg = get_lyrics_genius(artist, name)
    track['lyrics'] = lyricsg if (lyricsg and is_english(lyricsg)) else None

#### Spotify artist genres

The following mapping should be used when fetching artist genres from Spotify. The map and its reverse map match Spotify subgenres to the 9 broad genres used in our work. The artist genres are compared with the M4A genres in order to remove noisy genre labels.

In [None]:
spotify_genre_mapping = {
    'hip hop': ['trap', "polish trap", "australian trap", "canadian trap", "bounce", "melbourne bounce international", "melbourne bounce", "boom bap brasileiro", "plugg", "balkan hip hop", "swedish gangsta rap", "russian trap", "bass trap", "arab trap", "trap brasileiro", "zhenskiy rep", "swedish trap", 'boom bap', "drill and bass", "new york drill", "florida drill", "brooklyn drill", "drill espanol", "chicago drill", "swedish drill", "uk drill", "aussie drill", "viral trap", "christian trap", "dark trap", "trap triste", "atl trap", "trap latino", "detroit trap", "trap queen", "pinoy trap", "trap soul", "trap tuga", 'east coast hip hop', 'gangster rap', 'drill','rap', 'jazz rap', "israelite hip hop", "old school atlanta hip hop", "manchester hip hop", "old school uk hip hop", "chill abstract hip hop", "ghanaian hip hop", "portland hip hop", "uk hip hop", "deep underground hip hop", "norwegian hip hop", "sudanese hip hop", "uk alternative hip hop", "kentucky hip hop", "underground hip hop", "hip house", "maltese hip hop", "desi hip hop", "swedish hip hop", "experimental hip hop", "israeli hip hop", "mississippi hip hop", "indiana hip hop", "scottish hip hop", "psychedelic hip hop", "polish hip hop", "asian american hip hop", "virginia hip hop", "drumless hip hop", "sichuanese hip hop", "minnesota hip hop", "abstract hip hop", "german hip hop", "native american hip hop", "glitch hop", "nigerian hip hop", "alternative hip hop", "venezuelan hip hop", "hamburg hip hop", "old school hip hop", "south african hip hop", "middle east hip hop", "finnish hip hop", "nottingham hip hop", "irish hip hop", "canadian old school hip hop", "canadian indigenous hip hop", "bronx hip hop", "albanian hip hop", "sacramento hip hop", "australian hip hop", "lgbtq+ hip hop", "iraqi hip hop", "arabic hip hop", "queens hip hop", "hip-hop experimental", "detroit hip hop", "christian hip hop", "brazilian hip hop", "hawaiian hip hop", "danish hip hop", "hip hop", "japanese old school hip hop", 
"atl hip hop", "nashville hip hop", "industrial hip hop", "political hip hop", "buffalo hip hop", "canadian hip hop", "kansas city hip hop", "russian hip hop", "korean old school hip hop", "conscious hip hop", "seattle hip hop", "ohio hip hop", "indie hip hop", "jamaican hip hop", "tennessee hip hop", "hardcore hip hop", "miami hip hop", "birmingham hip hop", "memphis hip hop", "oakland hip hop", "north carolina hip hop", "spiritual hip hop", "nz hip hop", "indonesian hip hop", "cologne hip hop", "gulf hip hop", "estonian hip hop", "golden age hip hop", "trip hop", "latin hip hop", "australian underground hip hop", "southern hip hop", "hip hop tuga", "bc underground hip hop", "south carolina hip hop", "harlem hip hop", "boston hip hop", "instrumental hip hop", "zambian hip hop", "arkansas hip hop", "chinese hip hop","comedy rap", "san diego rap", "sad rap", "florida rap", "houston rap", "maga rap", "dfw rap", "lo-fi rap", "k-rap", "german cloud rap", "korean underground rap", "rap cearense", "proto-rap", "dmv rap", "german jazz rap", "underground rap", "indonesian emo rap", "rap mineiro", "chicago rap", "oc rap", "new jersey rap", "pop rap", "st louis rap", "texas latin rap", "rap feminino nacional", "rap metalcore", "scam rap", "rap conscient", "baton rouge rap", "rap chileno", "cloud rap", "dirty south rap", "rap marseille", "philly rap", "rap nacional antigo", "new orleans rap", "scream rap", "rage rap", "melodic rap", "military rap", "indie pop rap", "nyc rap", "aesthetic rap", "ottawa rap", "russian emo rap", "rap rock", "cali rap", "josei rap", "meme rap", "rhode island rap", "alabama rap", "london rap", "viral rap", "battle rap", "rap latina", "russian gangster rap", "emo rap", "country rap", "rap kreyol", "chicano rap", "west coast rap", "pittsburgh rap", "upstate ny rap", "cloud rap francais"],
    'punk': ['punk', 'new wave', 'no wave', "skinhead oi", "riot grrrl", "cowpunk", 'post-punk','psychobilly', "protopunk", "oi", "queercore", "ukhc", "minimal wave", "solo wave", "italian new wave", "new wave pop", "crank wave", "belgian new wave", "permanent wave", "grave wave", "deep new wave", "new wave of osdm", "scottish new wave", "ethereal wave", "dark wave", "post-punk argentina", "australian garage punk", "brazilian punk", "garage punk", "spanish post-punk", "italian post-punk", "pop punk", "nz punk", "irish post-punk", "new jersey punk", "synth punk", "turkish post-punk", "russian punk", "early us punk", "surf punk", "irish punk", "austrian punk", "post-punk brasileiro", "anarcho-punk", "russian post-punk", "fast melodic punk", "canadian pop punk", "chicano punk", "virginia punk", "uk pop punk", "melbourne punk", "dance-punk", "gothic post-punk", "canadian punk", "indie punk", "italian pop punk", "celtic punk", "japanese punk rock", "german punk rock", "kentucky punk", "noise punk", "french post-punk", "chicago punk", "texas pop punk", "chain punk", "uk diy punk", "dub punk", "boston punk", "punk rock italiano", "uk post-punk revival", "minneapolis punk", "northern irish punk", "icelandic post-punk", "indonesian punk", "horror punk", "deep german punk", "chicago pop punk", "christian punk", "vancouver punk", "deep punk rock", "psychedelic punk", "dark post-punk", "art punk", "long island punk", "quebec punk", "gypsy punk", "norwegian punk rock", "american post-punk", "black punk", "acoustic punk", "street punk espanol", "texas punk", "danish post-punk", "neon pop punk", "swedish post-punk", "arizona punk", "australian post-punk", "skate punk", "socal pop punk", "power-pop punk", "crust punk", "brisbane punk", "uk post-punk", "italian punk", "punk 'n' roll", "atlanta punk", "portland punk", "punk", "german punk", "post-punk mexicano", "balkan post-punk", "deep gothic post-punk", "ska punk", "modern ska punk", "german post-punk", "glam punk", "canadian 
post-punk", "emo punk"],
    'electronic': ['house', "pop edm", "power noise", "japanese screamo", "game mood", "slow game", "japanese dub", "new jack smooth", "french indietronica", "new beat", "new rave", "new french touch", "lo-fi beats", "swancore", "neue deutsche harte", "russian rave", "big room", "grimewave", "purple sound", "drift phonk", "jump up", "deconstructed club", "sovietwave", "wonky", "klubowe", "russian edm", "russian trance", "nu-cumbia", "dub", "grime", "birmingham grime", "russian grime", "instrumental grime", "moog", "jungle", "eurodance", "hi-nrg", "nu skool breaks", "lo-fi chill", "lo-fi", "sad lo-fi", "lo-fi study", "dark techno", "uk bass", "kawaii future bass", "future garage", "atlanta bass", "electric bass", "hard bass", "drum and bass", "future bass", "bubblegum bass", "miami bass", "bmore","minimal dubstep", "psychill", "steampunk", "chillhop", "chiptune", "neurofunk", "canadian house", "funky breaks", "minimal dnb", "brazilian dnb", "austrian dnb", "uk dnb", "halftime dnb", "stateside dnb", "dancefloor dnb", "nz dnb", "video game music", "darkstep", "experimental dubstep", "deep dubstep", "classic dubstep", "melodic dubstep", "tecnobrega", "vaportrap", "complextro", "broken beat", "electronic",  "moombahton", "synthwave", "electrofox", "solipsynthm", "big beat", "funktronica", "breakbeat", "japanoise", "world fusion", "synthesizer", "rhythm game", "jazztronica", "electropowerpop", "circuit", "euphoric hardstyle", "swedish ebm", "italo dance", "bubblegum dance", "australian dance", "russian dance", "intelligent dance music", "alternative dance", "belgian dance", "dark electro", "proto-techno", "chill lounge", "azontobeats", "ghettotech", "breakcore", "vaporwave", "jamtronica", "ukg revival", "2-step", "chillstep", "rave", "futurepop", "microhouse", "schranz", "aussietronica", "nightrun", "electronicore", "livetronica", "nightcore", "gamecore", "world chill", "chill r&b", "lounge", "belgian techno", "techno kayo", "minimal techno", "israeli techno", "german 
techno", "melodic techno", "french techno", "dub techno", "swedish techno", "bleep techno", "detroit techno", "destroy techno", "danish techno", "modern ebm", "ebm", "finnish electro", "finnish edm", "hyperpop", "proto-hyperpop", "korean hyperpop", "downtempo bass", "downtempo", "downtempo fusion", "ambient", "compositional ambient", "ambient techno", "j-ambient", "experimental ambient", "ambient idm", "space ambient", "australian ambient", "ambient guitar", "dark ambient", "ritual ambient", "nordic ambient", "german trance", "german dance", "dutch edm", "brazilian edm", "belgian edm", "indian edm", "lithuanian edm", "edm", "ukrainian edm", "gaming edm", "power electronics", "electro trash", "electro swing", "electro", "dungeon synth", "synth prog", "modular synth", "swedish synth", "progressive uplifting trance", "deep uplifting trance", "italian trance", "psychedelic trance", "bubble trance", "old school hard trance", "ambient trance", "deep progressive trance", "dutch trance", "progressive trance", "vocal trance", "dream trance", "uplifting trance", "industrial", "neo-synthpop", "synthpop", "polish synthpop", "french synthpop", "dark synthpop", "early synthpop", "swedish synthpop", "british industrial", "industrial pop", "industrial rock", "electro-industrial", "ambient industrial", "indie electronica", "german electronica", "polish electronica", "electronica", "vapor pop", "electro-pop francais", "glitch pop", 'techno', 'dubstep', 'trance',"uk experimental electronic", "boston electronic", "icelandic electronic", "slovak electronic", "electronic rock", "lithuanian electronic", "electronic trap", "electronic", "hamburg electronic", "russian electronic", "turkish electronic", "canadian electronic", "swedish electronic", "indian electronic", "belgian electronic", "frankfurt electronic", "bosnian electronic", "scottish electronic", "bristol electronic", "leipzig electronic", "persian electronic", "vintage french electronic", "spanish electronic", "organic 
electronic", "electronic djent", "slovenian electronic", "finnish electronic", "nz electronic", "dusseldorf electronic", "arab electronic", "experimental electronic", "danish electronic", "bouncy house", "future house", "deep euro house", "swedish tropical house", "norwegian house", "italo house", "deep tropical house", "bass house", "saxophone house", "jazz house", "minimal tech house", "filter house", "tech house", "detroit house", "classic house", "progressive electro house", "disco house", "ambient house", "classic progressive house", "garage house", "acid house", "melodic house", "german house", "piano house", "romanian house", "australian house", "diva house", "dutch house", "fidget house", "chicago house", "swedish house", "organic house", "speed house", "float house", "outsider house", "mexican tech house", "tropical house", "vocal house", "nordic house", "soulful house", "deep disco house", "tribal house", "progressive house", "slap house", "russian witch house", "electro house", "brazilian house", "experimental house", "chill house", "deep groove house", "uk house", "stutter house", "hard house", "french tech house", "turkish deep house", "g-house", "pop house", "deep house", "deep tech house", "witch house", "lo-fi house"],
    'heavy music': ["german black metal", "german melodeath", "black sludge", "powerviolence", "emoviolence", "voidgrind", "mathgrind", "d-beat", "progressive doom", "nyhc", "blackened screamo", "straight edge", "blackgaze", "american oi", "stenchcore", "melodic doom", "brutal prog", "goregrind", "uk beatdown", "gymcore", "deathrash", "beatdown", "xtra raw", "italian gothic", "atmospheric sludge", "gotico brasileiro", "modern goth", "deathgrass", "djent", "slayer", "vegan straight edge", "melodic metalcore", "deep melodic metalcore", "deathrock", "thall", "instrumental djent", "usbm", "emo", "doomgaze", "ukrainian metalcore", "screamocore", "deathgrind", "deathstep", "blackened crust", "noisecore", "screamo", "heavy alternative", "gothic alternative", 'nwothm', "nwobhm", "death 'n' roll", "horrorcore", "finnish melodeath", "swedish melodeath", "american melodeath", "black 'n' roll", "progressive thrash", "atmospheric doom", "death doom", "gaian doom", "epic doom", "funeral doom", "psychedelic doom", "melodic thrash", "thrash core", "black thrash", "power thrash", "crossover thrash", "technical thrash", "downtempo deathcore", "melodic deathcore", "symphonic deathcore", "technical deathcore", "blackened deathcore", "brutal deathcore", "deathcore", "slamming deathcore", "progressive deathcore", "stoner rock", "gothic rock", "american grindcore", "swedish metalcore", "nu-metalcore", "metalcore", "american metalcore", "japanese metalcore", "german metalcore", "russian metalcore", "christian metalcore", "australian metalcore", "uk metalcore", "canadian metalcore", "progressive metalcore",  "british grindcore", "swedish grindcore", "grindcore", "technical grindcore", "martial industrial", "new england emo", "aussie emo", "5th wave emo", "diy emo", "anthem emo", "pop emo", "emo", "swedish emo", "indie emo", "french emo", "indonesian emo", "midwest emo", "alternative emo", "brazilian emo", "japanese emo", "industrial noise", "death industrial", "technical melodic death 
metal", "belgian black metal", "nz metal", "swedish black metal", "new york death metal", "kawaii metal", "german death metal", "sci-fi metal", "technical groove metal", "pittsburgh metal", "brutal death metal", "italian death metal", "industrial black metal", "symphonic metal", "dissonant death metal", "progressive power metal", "canadian black metal", "german thrash metal", "texas metal", "grim death metal", "belgian metal", "celtic metal", "german metal", "hungarian metal", "dutch black metal", "danish death metal", "neo-trad metal", "swedish power metal", "russian nu metal", "new england metal", "depressive black metal", "rap metal", "portland metal", "minneapolis metal", "brazilian progressive metal", "epic black metal", "cascadian black metal", "virginia metal", "uk post-metal", "arkansas metal", "portuguese black metal", "cyber metal", "spanish folk metal", "australian death metal", "south carolina metal", "grisly death metal", "irish black metal", "canadian death metal", "alabama metal", "australian black metal", "christian doom metal", "birmingham metal", "finnish death metal", "proto-metal", "taiwan metal", "irish metal", "fantasy metal", "norwegian doom metal", "metal cover", "vancouver metal", "swedish progressive metal", "cosmic black metal", "groove metal", "metal balear", "florida death metal", "native american metal", "occult black metal", "tolkien metal", "atmospheric post-metal", "canadian metal", "slam death metal", "comic metal", "norwegian death metal", "swiss black metal", "j-metal", "miami metal", "icelandic metal", "metal baiano", "polish folk metal", "austrian metal", "melodic death metal", "drone metal", "black death", "arab metal", "indian metal", "german heavy metal", "british power metal", "folk black metal", "denver metal", "brazilian thrash metal", "singaporean metal", "australian metal", "edinburgh metal", "cavernous death metal", "kentucky metal", "italian gothic metal", "saxony metal", "swedish metal", "uk doom metal", "dark black 
metal", "norwegian metal", "brazilian black metal", "spanish black metal", "japanese power metal", "post-metal", "sludge metal", "russian death metal", "viking metal", "ukrainian metal", "doom metal", "swiss metal", "mexican metal", "speed metal", "metal", "cleveland metal", "italian black metal", "nu metal", "oulu metal", "italian metal", "slavic metal", "glam metal", "west virginia metal", "oriental metal", "funk metal", "southern metal", "british death metal", "alternative metal", "brazilian power metal", "technical brutal death metal", "austrian black metal", "new wave of glam metal", "british black metal", "opera metal", "lapland metal", "industrial metal", "polish metal", "austin metal", "ambient black metal", "progressive metal", "finnish heavy metal", "retro metal", "gothenburg metal", "dub metal", "french black metal", "symphonic power metal", "metal catala", "italian power metal", "new wave of speed metal", "atmospheric black metal", "prog metal", "progressive death metal", "greek metal", "polish thrash metal", "italian doom metal", "seattle metal", "cosmic death metal", "emotional black metal", "brazilian groove metal", "icelandic black metal", "tennessee metal", "instrumental black metal", "swedish heavy metal", "post-doom metal", "polish black metal", "lithuanian metal", "power metal", "dutch death metal", "avant-garde black metal", "technical death metal", "finnish progressive metal", "post-black metal", "welsh metal", "progressive technical death metal", "death metal", "black speed metal", "gothic black metal", "pagan black metal", "finnish black metal", "finnish power metal", "new wave of thrash metal", "danish black metal", "metal mineiro", "symphonic black metal", "war metal", "metal guitar", "french metal", "symphonic melodic death metal", "gothic symphonic metal", "progressive black metal", "us power metal", "danish metal", "thrash metal", "greek black metal", "gothic metal", "viking black metal", "atlanta metal", "slavic folk metal", "turkish 
death metal", "folk metal latinoamericano", "folk metal", "utah metal", "polish death metal", "melodic black metal", "nyc metal", "melodic metal", "neo metal", "autonomous black metal", "australian thrash metal", "finnish metal", "brazilian death metal", "christian power metal", "norwegian black metal", "belgian death metal", "japanese death metal", "symphonic death metal", "latin metal", "black metal", "russian metal", "quebec death metal", "medieval black metal", "lovecraftian metal", "dutch metal", "german power metal", "buffalo ny metal", "boston metal", "swedish death metal", "swedish doom metal", "portuguese metal", "avant-garde metal", "french death metal", "north carolina metal", "melodic groove metal", "progressive groove metal", "israeli metal", "finnish doom metal", "christian metal", "nordic folk metal", "neo classical metal", "louisiana metal", "metal gaucho", "appalachian black metal", "forest black metal", "italian folk metal", "technical black metal", "scottish metal", "japanese heavy metal", "texas death metal", "stoner metal", "brazilian metal", "chicago hardcore", "brazilian post-hardcore", "nz hardcore", "la hardcore", "hardcore punk", "dark hardcore", "uk post-hardcore", "happy hardcore", "modern hardcore", "french hardcore", "boston hardcore", "german hardcore", "california hardcore", "japanese post-hardcore", "florida hardcore", "canadian hardcore", "brazilian hardcore", "post-post-hardcore", "blackened hardcore", "lion city hardcore", "ohio hardcore", "post-hardcore", "umea hardcore", "hardcore techno", "texas hardcore", "digital hardcore", "melodic hardcore", "metallic hardcore", "progressive post-hardcore", "swedish post-hardcore", "pennsylvania hardcore", "christian hardcore", "norwegian hardcore", "german post-hardcore", "swedish hardcore", "slc hardcore", "new jersey hardcore", "connecticut hardcore", "chaotic hardcore", "uptempo hardcore", "dc hardcore", "australian post-hardcore", "canadian post-hardcore", "freeform hardcore", 
"hardcore", "deep happy hardcore", "modern melodic hardcore"],
    'rhythm music': ['blues','jazz', "new orleans jazz", "indie r&b", "christian music", "old school dancehall", "christian music", "roots worship", "new orleans funk", "canadian contemporary r&b", "new orleans soul", "uk contemporary r&b", "japanese r&b", "r&b francais","neo r&b",  "r&b brasileiro", "experimental r&b", "australian r&b", "scandinavian r&b", "afro r&b", "dutch r&b", "korean r&b", "crunk", "souldies", "pinoy r&b", "r&b", "contemporary r&b", "afrobeat", "tejano",  "dark r&b", "alternative r&b", "boogie", "chicago bop", "bedroom r&b", "sami", "brazilian boogie", "boogie-woogie", "worship", "world worship", "ambient worship", "apostolic worship", "anthem worship", "naija worship", "tagalog worship", "instrumental worship", "smooth soul", "japanese soul", "soul blues", "pop soul", "memphis soul", "philly soul", "indie soul", "northern soul", "brazilian soul", "finnish soul", "classic soul", "chicago soul", "psychedelic soul", "bedroom soul", "southern soul blues", "vapor soul", "british soul", "traditional soul", "neo soul", "instrumental soul", "soul flow", "swedish soul", "retro soul", "southern soul", "soul", 'funk', "ska", "jamaican ska", "ccm", "canadian ccm", "alternative ccm", "deep ccm", "italian ska", "ska mexicano", "ska argentino", "australian ska", "ska revival", "swing revival", "swing", "new jack swing", "greek swing", "western swing", "neapolitan funk", "dancehall", "uk dancehall", "swedish dancehall", "jamaican dancehall", "dancehall queen",  "instrumental funk", "deep funk house", "liquid funk", "jazz funk", "funk mtg", "funk paulista", "synth funk", "g funk", "traditional funk", "funk rj", "funk pop", "modern funk", "funk rock", "future funk", "p funk", "afro-funk", "funk carioca", "brit funk", "modern jazz piano", "brazilian modern jazz", "danish modern jazz", "belgian modern jazz", "canadian modern jazz", "modern jazz trio", 'gospel', "rebel blues", "psychedelic blues-rock", "piedmont blues", "jazz blues", "texas blues", "classic 
female blues", "malian blues", "australian blues", "delta blues", "canadian blues", "jump blues", "piano blues", "power blues-rock", "traditional blues", "electric blues", "louisiana blues", "desert blues", "modern blues", "dutch blues", "harmonica blues", "acoustic blues", "rhythm and blues", "memphis blues", "blues rock", "modern blues rock", "country blues", "pre-war blues", "swamp blues", "british blues", "chicago blues", "punk blues", "new orleans blues", "contemporary gospel", "gospel soul", "bluegrass gospel", "gospel blues", "family gospel", "gospel singers", "southern gospel", "south african gospel", "country gospel", "gospel drill", "pacific islands gospel", "gospel reggae", "baptist gospel", "gospel r&b", "gospel rap", "gospel antigas","psychedelic jazz fusion", "harmonica jazz", "german jazz", "samba-jazz", "latin jazz", "british jazz", "jazz fusion", "nz jazz", "jazz boom bap", "straight-ahead jazz", "jazz violin", "jazz drums", "soul jazz", "free jazz", "vocal jazz", "brazilian jazz", "jazz double bass", "acid jazz", "vintage jazz", "jazz metal", "jazz pop", "cool jazz", "experimental jazz", "french jazz", "deep free jazz", "avant-garde jazz", "jazz cover", "jazz trumpet", "ecm-style jazz", "ska jazz", "jazz trombone", "jazz rock", "contemporary vocal jazz", "indie jazz", "electro jazz", "jazz harp", "nu jazz", "south african jazz", "jazz puertorriqueno", "turkish jazz", "japanese jazz", "jazz piano", "jazz vibraphone", "swedish jazz", "norwegian jazz", "jazz trio", "uk contemporary jazz", "gypsy jazz", "jazz worship", "jazz guitar", "smooth jazz", "progressive jazz fusion", "contemporary jazz", "neo soul-jazz", "jazz quartet", "indonesian jazz", "classic japanese jazz", "jazz saxophone", "spiritual jazz", "jazz clarinet", "jazz brass", "arabic jazz", "bossa nova jazz", "italian jazz fusion", "dark jazz", "ethio-jazz","east coast reggae", "reggae en espanol", "african reggae", "reggae fusion", "virgin islands reggae", "reggae", "roots reggae", "early 
reggae", "uk reggae", "reggae rock", "reggae cover", "australian reggae fusion", "swedish reggae", "french reggae", "german reggae", "brazilian reggae", "argentine reggae", "pinoy reggae", "west coast reggae", "modern reggae"],
    'pop': ['indie pop', "greek pop", "experimental pop", "idol", "tin pan alley", "classic schlager", "melodipop", "pop","modern power pop", "modern jangle pop", "modern indie pop", "modern dream pop", "modern alternative pop", "bitpop", "piano mpb", "electroclash", "korean ost", "anime", "shibuya-kei", "alt z", "australian alternative pop", "popwave", "europop", "opm", "cabaret", "bebop", "scandipop", "ballroom", "post-disco", "classic disco polo", "nu disco", "disco", "italo disco", "new italo disco", "dark disco", "deep disco", "post-disco soul", "uk alternative pop", "japanese alternative pop", "alternative pop", "uk pop", "australian electropop", "britpop revival", "britpop", "chinese electropop", "korean electropop", "swedish electropop", "electropop", "danish electropop", "canadian electropop", "japanese electropop", "russian alt pop", "russian pop", "deep dance pop", "korean dream pop", "finnish dance pop", "swedish pop", "classic romanian pop", "swedish indie pop", "classic j-pop", "dark pop", "vintage swedish pop", "classic danish pop", "deep pop edm", "swedish trap pop", "swedish pop rap", "australian pop", "korean city pop", "classic italian pop", "classic icelandic pop", "classic greek pop", "deep acoustic pop", "swedish idol pop", "classic nz pop", "korean pop", "classic country pop", "japanese dance pop", "ambient pop", "pop", "classic belgian pop", "pop ambient", "classic uk pop", "classic persian pop", "finnish pop", "classic k-pop", "pop teen brasileiro", "classic norwegian pop", "pop dance", "classic swedish pop", "canadian pop", "dance pop", "classic french pop", "finnish idol pop", "austrian pop", "icelandic pop", "indonesian pop", "faroese pop", "swiss pop", "nz pop", "latin arena pop", "polish viral pop", "polish pop", "teen pop", "christian pop",  "psychedelic pop", "pop nacional",   "belgian pop", "indie psych-pop", "italian indie pop", "japanese dream pop", "chinese idol pop",  "latin viral pop", "latin pop", "dutch pop", "japanese indie 
pop", "ukrainian pop", "pop nacional antigas", "south african pop", "italian adult pop", "pop", "indonesian indie pop", "norwegian pop", "italian pop", "p-pop", "chill pop", "candy pop", "mande pop", "joseon pop", "persian pop", "guyanese pop", "greenlandic pop", "barbadian pop", "pop r&b", "slovenian pop", "chill dream pop", "maltese pop", "romanian pop", "georgian pop", "sudanese pop", "garage pop", "irish pop", "j-pop", "v-pop", "beninese pop", "puerto rican pop", "bahamian pop", "pop chileno", "swamp pop", "shiver pop", "armenian pop", "colombian pop", "city pop", "baroque pop", "nyc pop", "spanish indie pop", "thai pop", "pacific islands pop", "syrian pop", "k-pop", "mongolian pop", "hypnagogic pop", "gauze pop", "acoustic pop", "spanish pop", "czech pop", "hip pop", "cypriot pop", "danish pop", "shimmer pop", "malaysian pop", "bow pop", "afghan pop", "k-pop girl group", "slovak pop", "moldovan pop", "pop violin", "albanian pop", "kosovan pop", "antiviral pop", "channel pop", "sophisti-pop", "viral pop", "pop costarricense", "estonian pop", "sunshine pop", "german pop", "soda pop", "pop lgbtq+ brasileira", "indie dream pop", "la pop", "thai indie pop", "t-pop", "pop argentino", "azeri pop", "danish indie pop", "pop electronico", "lithuanian pop", "mexican pop", "macedonian pop", "twee pop", "space age pop", "portuguese pop", "igbo pop", "pop quebecois", "art pop", "social media pop", "tanzanian pop", "bedroom pop", "latvian pop", "j-pop boy group", "cambodian pop", "math pop", "brill building pop", "bubblegum pop", "french pop", "k-pop reality show", "okinawan pop", "collage pop", "turkish pop", "hungarian pop", "french indie pop", "k-pop boy group", "israeli pop", "j-pop girl group", "pop", "operatic pop", "tatar pop", "jangle pop", "chamber pop", "singaporean pop", "stomp pop", "nigerian pop"],
    'folk': ['country', "folk", "polish folk", "folk", "canadian contemporary country", "canadian country", "bluegrass", "canadian folk", "neofolk", "norske viser", "cosmic american", "pagode", "ectofolk", "bush ballad", "banjo", "wyoming roots", "north carolina roots", "roots americana", "kentucky roots", "dutch americana", "swedish americana", "new americana", "gothic americana", "austin americana", "norwegian americana", "new england americana", "southern americana", "new orleans americana", "australian americana", "canadian americana", "nashville americana", "memphis americana", "midwest americana", "alternative americana", "black americana", "western americana", "uk americana", "modern psychedelic folk", "country pop", "texas country", "country boogie", "truck-driving country", "classic texas country", "traditional country", "contemporary country", "arkansas country", "oklahoma country", "neo-traditional country", "country dawn", "queer country", "alberta country", "deep contemporary country", "outlaw country", "alternative country", "country road", "australian country", "irish country", "classic australian country", "classic oklahoma country", "swedish country", "modern country pop", "british country", "string folk", "contemporary folk", "protest folk", "pop folk", "early american folk", "nz folk", "vintage country folk", "australian indie folk", "british folk", "boston folk", "folk rock", "dark folk", "indie anthem-folk", "rune folk", "icelandic folk", "korean indie folk", "indie folk argentino", "anti-folk", "german indie folk", "traditional folk", "american folk revival", "folk punk", "medieval folk", "indonesian folk", "swedish indie folk", "appalachian folk", "children's folk", "folk-pop", "christian indie folk", "geek folk", "russian folk", "arab folk", "armenian folk", "scottish indie folk", "experimental folk", "bahamian folk", "traditional scottish folk", "chamber folk", "drone folk", "indonesian folk pop", "kentucky mountain folk", "psychedelic folk 
rock", "spanish indie folk", "free folk", "traditional southern folk", "freak folk", "ambient folk", "turbo folk", "indie folk", "swiss indie folk", "michigan folk", "thai folk", "psychedelic folk", "irish folk", "french indie folk", "scottish folk", "viking folk", "nl folk", "folk brasileiro"],
    'alternative rock': ['indie rock',  "indie shoegaze", "c86", "new jersey indie", "new delhi indie", "new weird finland", "new weird america", "new hampshire indie", "new orleans indie", "nu gaze", "shoegaze brasileiro", "latin shoegaze", "grungegaze", "arkansas indie", "finnish indie", "polish indie", "bubblegrunge", "swedish indie rock", "grunge", "underground grunge", "italogaze", "spacegrunge", "modern progressive rock", "experimental rock", "italian progressive rock", "experimental indie rock", "finnish progressive rock", "progressive rock", "alternative pop rock", "swedish alternative rock", "australian alternative rock", "british post-rock",  "tunisian alternative", "polish alternative", "italian alternative", "mongolian alternative", "south african alternative", "arab alternative", "cuban alternative", "latinx alternative", "azeri alternative", "hard alternative", "ambient post-rock", "french post-rock", "post-rock", "cinematic post-rock", "canadian post-rock", "nordic post-rock", "portuguese post-rock", "american post-rock", "japanese post-rock", "instrumental post-rock", "belgian post-rock", "cosmic post-rock", "german post-rock", "german alternative rock", "australian indie rock", "german indie", "american shoegaze", "french shoegaze", "albuquerque indie", "colombian indie", "olympia wa indie", "worcester ma indie", "channel islands indie", "milwaukee indie", "welsh indie", "dundee indie", "saskatchewan indie", "kelowna bc indie", "monterrey indie", "lexington ky indie", "athens indie", "indie rockism", "oxford indie", "guadalajara indie", "wyoming indie", "northamptonshire indie", "michigan indie", "windsor on indie", "rochester ny indie", "san marcos tx indie", "ukrainian indie", "indy indie", "glasgow indie", "j-indie", "western mass indie", "north carolina indie", "dusseldorf indie", "brighton indie", "triangle indie", "baton rouge indie", "belgian indie", "dallas indie", "cincinnati indie", "firenze indie", "k-indie", "indie surf", "hokkaido 
indie", "auckland indie", "argentine indie", "reading indie", "idaho indie", "nashville indie", "denton tx indie", "slc indie", "north east england indie", "cardiff indie", "duluth indie", "san diego indie", "belo horizonte indie", "orebro indie", "indie veneto", "hampton roads indie", "leeds indie", "greek indie", "lund indie", "slovenian indie", "york indie", "munich indie", "vegas indie", "indie poptimism", "gothenburg indie", "bristol indie", "rhode island indie", "kentucky indie", "philly indie", "lithuanian indie", "connecticut indie", "kent indie", "malmo indie", "manchester indie", "canadian indie", "christian indie", "bergen indie", "springfield mo indie", "fort wayne indie", "yogyakarta indie", "sacramento indie", "rennes indie", "dayton indie", "newcastle nsw indie", "tulsa indie", "el paso indie", "ann arbor indie", "mississippi indie", "warrington indie", "buffalo ny indie", "estonian indie", "tampa indie", "indie viet", "northern irish indie", "rva indie", "zurich indie", "danish indie", "columbus ohio indie", "dutch indie", "modern indie folk", "morelos indie", "indie curitibano", "alaska indie", "melbourne indie", "fremantle indie", "london indie", "nz indie", "lancashire indie", "cologne indie", "oc indie", "seattle indie", "icelandic indie", "vermont indie", "derby indie", "asheville indie", "maine indie", "st petersburg fl indie", "brooklyn indie", "boston indie", "vancouver indie", "fort worth indie", "stuttgart indie", "pov: indie", "sheffield indie", "dalarna indie", "oth indie", "cork indie", "montreal indie", "stl indie", "dunedin indie", "west yorkshire indie", "nice indie", "devon indie", "trondheim indie", "birmingham indie", "south carolina indie", "utah indie", "jacksonville indie", "gainesville indie", "detroit indie", "limerick indie", "montana indie", "canadian indie folk", "indonesian indie", "sydney indie", "chicago indie", "edinburgh indie", "adelaide indie", "baltimore indie", "ohio indie", "dominican indie", "okc indie", 
"washington indie", "eugene indie", "bay area indie", "knoxville indie", "swansea indie", "kansas indie", "cape town indie", "eau claire indie", "lebanese indie", "asbury park indie", "hamilton on indie", "isle of wight indie", "norman ok indie", "victoria bc indie", "chattanooga indie", "charlotte nc indie", "portuguese indie", "ecuadorian indie", "israeli indie", "chihuahua indie", "rochester mn indie", "ontario indie", "edmonton indie", "kc indie", "lafayette indie", "australian indie", "ok indie", "ghent indie", "irish indie", "indiana indie", "denver indie", "omaha indie", "east anglia indie", "essex indie", "oakland indie", "halifax indie", "lawrence ks indie", "milan indie", "austrian indie", "calgary indie", "atlanta indie", "nantes indie", "nottingham indie", "brazilian indie", "oulu indie", "christchurch indie", "mexican indie", "indie game soundtrack", "kolkata indie", "leicester indie", "tucson indie", "indie electropop", "oslo indie", "delaware indie", "perth indie", "scottish indie", "cornwall indie", "indie fuzzpop", "pittsburgh indie", "tempe indie", "leipzig indie", "memphis indie", "cleveland indie", "london on indie", "la indie", "stockholm indie", "cambridgeshire indie", "liverpool indie", "houston indie", "swiss indie", "coventry indie", "aberdeen indie", "alabama indie", "indie quebecois", "toronto indie", "south dakota indie", "grand rapids indie", "uae indie", "singaporean indie", "bath indie", "iowa indie", "brisbane indie", "minneapolis indie", "hamburg indie", "louisville indie", "chinese indie", "phoenix indie", "latvian indie", "luxembourgian indie", "norwegian indie", "lancaster pa indie", "dc indie", "portland indie", "manitoba indie", "miami indie", "quebec indie", "indian indie", "southampton indie", "charlottesville indie", "ottawa indie", "wisconsin indie", "newcastle indie", "albany ny indie", "west virginia indie", "hong kong indie", "belfast indie", "kingston on indie", "rio grande do sul indie", "rome indie", "nordic 
shoegaze", "german shoegaze", "korean shoegaze", "australian shoegaze", "japanese shoegaze", "canadian shoegaze", "shoegaze", 'alternative rock', "noise pop", "dream pop", "alternative rock", "danish alternative rock", "norwegian alternative rock", "polish alternative rock", "christian alternative rock", "british alternative rock", "alternative roots rock", "indonesian alternative rock", "japanese alternative rock", "finnish alternative rock", "russian indie rock", "british indie rock", "indie garage rock", "belgian indie rock", "irish indie rock", "thai indie rock", "english indie rock", "indonesian indie rock", "indie rock mexicano", "brazilian indie rock", "indie psychedelic rock", "chinese indie rock", "dutch indie rock", "korean indie rock", "scottish indie rock", "portuguese indie rock", "little rock indie",],
    'rock': ['classic rock', "pop rock", "canadian rockabilly", "japanese psychedelic", "swedish prog", "dutch prog", "norwegian prog", "swedish melodic rock", "neo-psychedelic", "neo-psicodelia brasileira", "chamber psych", "shimmer psych", "garage psych", "australian psych", "experimental psych", "cascadia psych", "heavy psych", "psych gaze", "afro psych", "classic canadian rock", "melodic hard rock", "canadian rock", "neo-rockabilly", "rockabilly", "beatlesque", "paisley underground", "rocksteady", "traditional rockabilly", "german rockabilly", "danish pop rock", "post-grunge", "german hard rock", "finnish hard rock", "christian hard rock",  "german pop rock", "australian surf rock", "modern hard rock", "swedish psychedelic rock", "austin rock", "dance rock", "swedish rock-and-roll", "swedish hard rock", "german rock",   "swedish garage rock", "german prog", "german ebm", "rock", "hard rock brasileiro", "swedish stoner rock", "german indie rock", "australian rock", "hard rock", "german stoner rock", "power pop", "underground power pop", "italian pop rock", "symphonic rock", "country rock", "modern alternative rock", "french rock", "rock progressif francais", "rock en espanol", "celtic rock", "rock nacional", "mexican rock", "latin rock", "art rock", "space rock", "rock brasiliense", "acid rock", "deep indie rock", "rock drums", "classic j-rock", "rock goiano", "british math rock", "cambodian rock", "heartland rock", "turkish rock", "idol rock", "rock independant francais", "rock potiguar",   "dutch rock", "rock", "instrumental math rock",  "varmland rock",  "future rock", "soft rock", "norwegian rock",  "lovers rock", "rock of gibraltar", "deep soft rock", "crack rock steady", "garage rock revival", "rock cristiano", "kindie rock", "south african rock",   "swiss rock", "belgian rock", "pub rock", "deep classic garage rock", "psychedelic rock",  "boston rock", "garage rock mexicano", "comedy rock", "danish rock",  "hong kong rock",  "medieval rock", "african 
rock", "brazilian stoner rock", "dark rock", "psychedelic space rock",  "scottish rock", "roots rock", "modern folk rock", "mexican rock-and-roll",  "rock gotico", "japanese math rock", "instrumental rock", "yacht rock",  "southern rock", "mexican classic rock", "piano rock", "argentine rock",  "japanese garage rock", "welsh rock",  "faroese rock", "rock alternatif francais", "rock keyboard", "finnish psychedelic rock", "classic garage rock", "novo rock gaucho", "brazilian rock", "rock abc paulista", "spanish rock", "glam rock",  "uk stoner rock", "j-rock", "rock alternativo brasileiro", "ukrainian rock", "swamp rock", "anime rock", "irish rock", "album rock", "geek rock", "flute rock",  "latvian rock", "portuguese rock", "detroit rock", "japanese psychedelic rock", "modern southern rock", "k-rock",  "action rock", "modern country rock", "instrumental stoner rock", "uk noise rock", "avant-rock", "rock nacional brasileiro", "greek rock", "christian rock", "rock baiano",  "slovenian rock", "rock-and-roll", "drone rock", "garage rock", "rock cearense", "acoustic rock", "samba-rock", "noise rock", "estonian rock", "indonesian rock", "icelandic rock", "french stoner rock", "rock gaucho", "trop rock", "math rock",  "modern rock", "rock paraibano",  "sleaze rock",  "rock quebecois", "greek psychedelic rock", "classic finnish rock",  "suomi rock", "israeli rock", "chilean rock", "kiwi rock"],
    'other': ["british contemporary classical","australian talent show", 'latin', "deep talent show", "big band", "jug band", "brass band", "jam band", "boy band", "one-person band", "string band", "beat poetry", "harp", "pink noise", "french movie tunes", "new age piano", "new age", "new tribe", "new isolationism", "french dub", "french soundtrack", "new romantic", "noise", "classic afrobeat", "soca", "motown", "smooth saxophone", "hel", "scottish hush", "musique militaire", "nintendocore", "turntablism", "xenharmonic", "impressionism", "danseband", "pluggnb", "barrelhouse piano", "autoharp", "hurdy-gurdy", "microtonal", "acoustic cover", "britcore", "trondersk musikk", "contrabass", "old school bassline", "tape club", "adult standards", "latin afrobeat", "parody", "nu age", "j-idol", "canadian classical", "canadian classical piano", "easycore",  "axe", "psicodelia brasileira", "sound team", "avant-prog", "quiet storm", "indian percussion", "russian indie", "umbanda", "countrygaze", "ragtime", "corrosion", "hindustani instrumental", "honky tonk", "irish pub song", "nursery", "velha guarda", "double drumming", "praise", "italian tenor", "j-division", "alte", "dainuojamoji poezija", "classify", "baroque", "madchester", "polish prog", "eastern bloc groove", "wrock", "nwocr", "early romantic era", "dark cabaret", "dark plugg", "medway sound", "epicore", "furry", "pixel", "chutney", "lo star", "anime score", "bossa nova cover", "folkmusik", "hawaiian", "bossa nova", "modern mod", "modern cello", "easy listening", "modern salsa", "modern old-time", "meme", "slam poetry", "gbvfi", "necrotrap", "melancholia", "choral", "novelty", "filmi", "opera", "classic soundtrack", "italian soundtrack", "soundtrack", "german soundtrack", "oceania soundtrack", "nordic soundtrack", "canadian soundtrack", "british soundtrack", "orchestral soundtrack", "pops orchestra", "nederpop", "bagpipe", "native american traditional", "pastoral", "american orchestra", "man's orchestra", "british 
orchestra", "orchestra", "fingerstyle", "laiko", "talent show", "latin talent show", "cartoon", "drone", "beatboxing", "college a cappella", "vintage italian soundtrack", "vintage hollywood", "broadway", "vintage chanson", "vintage schlager", "vintage broadway", "sda a cappella", "a cappella", "christian a cappella", "wu fam", "poetry", "post-minimalism", "tabla", "tropical alternativo",  "freakbeat", "british comedy", "talent show", "black comedy", "clean comedy", "comedy", "salsa", "lds", "mpb", "fantasy", "pixie", "afrofuturism", "sped up", "hollywood", "minimalism", "australian indigenous music", "mambo", "movie tunes", "bossbeat", "talentschau", "hammond organ", "deep liquid", "australian children's music", "abstractro", "substep", "mountain dulcimer", "supergroup", "exotica", "rogaland musikk", "ramonescore", "merseybeat", "bboy", "deep orgcore", "speedrun", "sung poetry", "elephant 6", "gay chorus", "parody", "c86", "vbs", "scorecore", "mellow gold", "british comedy", "black comedy", "clean comedy", "comedy", "strut", "dixieland", "draga", "spanish invasion", "tropical", "canzone napoletana", "cyberpunk", "oud", "hauntology", "audiophile vocal", "balkan brass", "sunnlensk tonlist", "bleakgaze", "red dirt", "belly dance", "gregorian dance", "british dance band", "christian dance", "calypso", "trova", "papuri", "shamanic", "halloween", "polish psychedelia", "dansband", "redneck", "torch song", "khmer", "histoire pour enfants", "cymraeg", "stride", "multidisciplinary", "cantautor", "spectra", "abstract", "illbient", "futuristic swag", "scratch", "bongo flava", "singer-songwriter", "israeli singer-songwriter", "deep latin alternative", "latin alternative", "irish singer-songwriter", "indonesian singer-songwriter", "singer-songwriter pop", "nashville singer-songwriter", "neo-singer-songwriter", "japanese singer-songwriter", "scottish singer-songwriter", "british singer-songwriter", "icelandic singer-songwriter", "dutch singer-songwriter", "swedish 
singer-songwriter", "swiss singer-songwriter", "belgian singer-songwriter", "korean singer-songwriter", "german singer-songwriter", "indie singer-songwriter", "czech singer-songwriter", "danish singer-songwriter", "austin singer-songwriter", "australian singer-songwriter", "nz singer-songwriter", "gen z singer-songwriter", "singaporean singer-songwriter", "canadian singer-songwriter", "norwegian singer-songwriter", "chinese classical performance", "danish experimental", "experimental", "experimental guitar", "british experimental", "arab experimental", "icelandic experimental", "brazilian experimental", "experimental vocal", "australian experimental", "canadian experimental", "norwegian experimental", "tennessee experimental", "experimental indie", "nordic classical piano", "classical", "neo-classical", "belgian contemporary classical", "classical bass", "french classical piano", "classical soprano", "indian classical", "chinese classical piano", "classical countertenor", "classical baritone", "latin classical", "hindustani classical", "american contemporary classical", "american 21st century classical", "classical piano", "brazilian classical", "polish classical", "baltic classical", "classical tenor", "contemporary classical", "american modern classical", "classical trumpet", "classical mandolin", "finnish classical", "classical performance", "ukrainian classical", "21st century classical", "icelandic classical", "african-american classical"]
}
# Invert spotify_genre_mapping so that each subgenre string points at its broader genre label.
# A subgenre listed under more than one broader genre (e.g. "c86" appears under both
# 'alternative rock' and 'other') keeps only the LAST broader genre encountered, because
# later entries of a dict comprehension overwrite earlier ones with the same key.
reverse_mapping = {subgenre: broader for broader, subgenres in spotify_genre_mapping.items() for subgenre in subgenres}

### AudioSet

A helpful repository for processing AudioSet can be found [here](https://github.com/aoifemcdonagh/audioset-processing/tree/master).

Our use case involved five steps: (1) selecting the AudioSet entries that fit our genre categories, (2) getting the video titles for those entries, (3) filtering the video titles and extracting artist-song pairs, (4) getting lyrics for these pairs, and (5) fetching audio for the pairs that have lyrics.

In our work we fetched data for 'hip hop', 'country', 'folk', 'electronic', 'jazz', 'pop', 'reggae', 'rhythm & blues' and 'rock'. Rock also contains metal music, which we separate out. We combine country and folk under one label, and jazz, reggae and rnb under the 'rhythm music' label.

In [None]:
#1. This function from the provided repo can be used to fetch entries from AudioSet that belong to certain classes
def create_csv(class_name, args):
    """
    Write a CSV file containing every clip of the dataset that belongs to a class.

    A row of the source CSV is copied when its label column (4th column) contains
    one of the label IDs of ``class_name`` and none of the label IDs of the
    blacklisted classes. An existing output file is overwritten.

    :param class_name: name of the class to extract; also used as the output file name
    :param args: namespace with ``destination_dir``, ``csv_dataset`` and ``blacklist``
                 attributes (each may be None); it is also passed on to ``get_label_id``
    :return: path of the CSV file that was written
    """
    # Fall back to the module defaults when no explicit locations are given
    dst_dir = args.destination_dir if args.destination_dir is not None else DEFAULT_DEST_DIR
    csv_dataset = args.csv_dataset if args.csv_dataset is not None else DEFAULT_CSV_DATASET
    # Report the dataset that is actually read (it may differ from the default)
    print("Looking into: ", csv_dataset)

    new_csv_path = os.path.join(str(dst_dir), class_name + '.csv')
    print(new_csv_path)

    # An existing file is not an error: it is simply replaced
    if os.path.isfile(new_csv_path):
        print("A CSV file for class " + class_name + ' already exists.')
        print("*** Overwriting " + str(new_csv_path) + " ***")

    label_id = get_label_id(class_name, args)  # Label IDs which match class_name

    # Collect the label IDs of every blacklisted class into one flat set
    blacklisted_ids = set()
    if args.blacklist is not None:
        for blacklisted_class in args.blacklist:
            blacklisted_ids.update(get_label_id(blacklisted_class, args))

    with open(csv_dataset) as dataset, open(new_csv_path, 'w', newline='') as new_csv:
        reader = csv.reader(dataset, skipinitialspace=True)
        writer = csv.writer(new_csv)

        for row in reader:
            # Rows without a label column (e.g. comment lines) cannot match anything
            if len(row) < 4:
                continue
            labels = row[3]
            # Write each row at most once, even when several wanted labels match it
            if not any(label in labels for label in label_id):
                continue
            # Drop the row if it carries any blacklisted label
            if blacklisted_ids.intersection(labels.split(",")):
                continue
            writer.writerow(row)

    print("Finished writing CSV file for " + class_name)

    return new_csv_path

#2. The following functions can be used to get video titles. A YouTube Data API developer key is required.
def initialize_youtube_api():
    """
    Build the client object used to query YouTube (service 'youtube', version 'v3').

    :return: the client returned by ``build``
    """
    # Placeholder: replace with your own developer key before running
    api_key = 'YOUR_API_KEY'
    # The key has to be passed explicitly; an empty `developerKey=` is a syntax error
    youtube = build('youtube', 'v3', developerKey=api_key)
    return youtube

def get_titles(class_name, args):
    """
    Create the class CSV and write a copy of it enriched with video titles.

    The clips are selected with ``create_csv``, their titles are fetched with
    ``get_video_titles`` and the result is written to
    ``<destination_dir>/<class_name>_titles_enhanced.csv`` with a header row.
    Clips whose title could not be fetched get the title "Title Not Found".

    :param class_name: name of the class to process
    :param args: namespace with a ``destination_dir`` attribute (may be None);
                 it is also passed on to ``create_csv``
    :return: None
    """
    youtube = initialize_youtube_api()
    new_csv = create_csv(class_name, args)
    dst_dir_root = args.destination_dir if args.destination_dir is not None else DEFAULT_DEST_DIR
    enhanced_csv_path = os.path.join(dst_dir_root, f"{class_name}_titles_enhanced.csv")

    # Read the clips once. The file written by create_csv has no header row,
    # so every row is data and none may be skipped.
    with open(new_csv, 'r', encoding='utf-8') as infile:
        clip_rows = [row for row in csv.reader(infile) if row]

    video_ids = [row[0] for row in clip_rows]
    titles = get_video_titles(youtube, video_ids)

    with open(enhanced_csv_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Video ID', 'Start Time', 'End Time', 'Ontology IDs', 'Video Title'])

        for row in clip_rows:
            video_id, start_time, end_time, ontology_ids = row[:4]
            title = titles.get(video_id, "Title Not Found")
            writer.writerow([video_id, start_time, end_time, ontology_ids, title])

    print("Titles fetched and CSV enhanced for class: " + class_name)




def get_video_titles(youtube, video_ids):
    """
    Fetch the titles of the given videos.

    The IDs are requested in batches of 50 through ``youtube.videos().list``.
    Videos that are missing from a response are simply absent from the result.

    :param youtube: client object as returned by ``initialize_youtube_api``
    :param video_ids: list of video ID strings
    :return: dict mapping video ID to video title
    """
    titles = {}
    # The notebook imports the tqdm *module* (`import tqdm`), so the progress bar
    # class has to be reached as tqdm.tqdm; calling the module itself fails.
    for i in tqdm.tqdm(range(0, len(video_ids), 50), desc="Fetching video titles"):
        batch_ids = video_ids[i:i+50]
        request = youtube.videos().list(
            part='snippet',
            id=','.join(batch_ids)
        )
        response = request.execute()
        for item in response.get('items', []):
            video_id = item['id']
            title = item['snippet']['title']
            titles[video_id] = title
    return titles

#3. & 4. We use clean_title to remove strings that might cause trouble. The code that follows outlines how someone might fetch lyrics (LyricFind first, with the Genius API as a fallback)
# Placeholders: fill these in before running the cell
base = ...
input_path = ...
output_path = ...

# Artist/song pairs to look up; both text columns are lower-cased
data = pd.read_csv(input_path)
data = data.assign(Artist=data['Artist'].str.lower(), Song=data['Song'].str.lower())

# Rows already stored at output_path (extended and written back further below)
output_data = pd.read_csv(output_path)

def song_exists(output_data, song_id):
    """Return True when ``output_data`` has at least one row whose 'Video ID' equals ``song_id``."""
    id_matches = output_data['Video ID'] == song_id
    return bool(id_matches.any())

# Rows accepted during this run. They are concatenated onto output_data once at the
# end: DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row copies it on every iteration.
new_rows = []
# Video IDs accepted during this run, so a repeated ID is not fetched twice
added_ids = set()

for idx, row in tqdm.tqdm(data.iterrows(), total=data.shape[0]):
    song_id = row['Video ID']
    original_artist, original_song = row['Artist'], row['Song']

    # The artist name is used as-is; only the song title is cleaned
    cleaned_artist = original_artist
    if pd.isna(original_song) or pd.isna(cleaned_artist):
        continue
    cleaned_song = clean_title(original_song)

    # Only rows that cleaning actually changed are looked up
    if original_artist == cleaned_artist and original_song == cleaned_song:
        continue
    # Skip songs that are already stored or were accepted earlier in this run
    if song_exists(output_data, song_id) or song_id in added_ids:
        continue

    lyrics = get_lyrics(cleaned_artist, cleaned_song)
    if not (lyrics and is_english(lyrics)):
        # Fall back to Genius, pausing after each request
        lyrics = get_lyrics_genius(cleaned_artist, cleaned_song)
        time.sleep(1.5)
        if not (lyrics and is_english(lyrics)):
            continue

    row['Artist'] = cleaned_artist
    row['Song'] = cleaned_song
    row['Lyrics'] = lyrics
    new_rows.append(row)
    added_ids.add(song_id)

if new_rows:
    output_data = pd.concat([output_data, pd.DataFrame(new_rows)], ignore_index=True)

output_data.to_csv(output_path, index=False)

#5. Finally to download the audio data someone could combine yt-dlp with ffmpeg. 
def download(class_name, args, audio_root=None):
    """
    Download the audio segment of every row of a class CSV.

    Reads ``<csv_dir>/<class_name>.csv`` (class name lower-cased, spaces replaced by
    underscores), calls ``download_audio_segment`` for each row and stores the
    returned file path in an 'Audio Filename' column. The updated table is written
    back to the same CSV file.

    :param class_name: name of the class to download
    :param args: not read by this function
    :param audio_root: folder under which a per-class audio folder is created;
                       None keeps the original hard-coded location
    :return: None
    """
    csv_dir = ...  # placeholder: folder holding the per-class CSV files
    class_slug = class_name.replace(' ', '_').lower()
    csv_file_name = f"{class_slug}.csv"
    csv_path = os.path.join(csv_dir, csv_file_name)

    data = pd.read_csv(csv_path)
    if audio_root is None:
        audio_root = os.path.join('D:', r'\Diploma Thesis', 'Datasets', 'AudioSet', 'audioset-processing', 'output', 'extracted', 'with_lyrics')
    audio_dir = os.path.join(audio_root, class_slug)
    if not os.path.isdir(audio_dir):
        # exist_ok guards against the folder appearing between the check and the call
        os.makedirs(audio_dir, exist_ok=True)
        print(f"Audio files directory: {audio_dir}")

    data['Audio Filename'] = None

    # The notebook imports the tqdm *module*, so the class is reached as tqdm.tqdm
    progress = tqdm.tqdm(data.iterrows(), total=data.shape[0], desc="Downloading audio")
    for idx, row in progress:
        video_id, start_time, end_time = row['Video ID'], row['Start Time'], row['End Time']
        # The segment end is pushed one second past the listed end time
        end_time = str(int(end_time) + 1)

        # Throttle: wait until 4 seconds have passed since the progress bar's
        # last recorded print time before starting the next download
        current_time = time.time()
        elapsed_time = current_time - progress.last_print_t
        if elapsed_time < 4:
            time.sleep(4 - elapsed_time)

        filename = download_audio_segment(video_id, start_time, end_time, audio_dir)
        if filename:
            data.at[idx, 'Audio Filename'] = filename

        progress.set_postfix({'Video ID': video_id})

    # Save the updated DataFrame back to CSV
    data.to_csv(csv_path, index=False)
    print("Completed processing all audio segments.")

def download_audio_segment(video_id, start_time, end_time, output_dir):
    """
    Download a video's audio as mp3 and cut out one segment of it.

    yt-dlp saves the audio as ``<video_id>_temp.mp3`` in ``output_dir``; ffmpeg
    then copies the part between ``start_time`` and ``end_time`` into
    ``<video_id>_<start_time>_<end_time>.mp3``. The temporary file is removed once
    trimming has been attempted, whether or not it succeeded.

    :param video_id: YouTube video ID
    :param start_time: segment start, passed to ffmpeg's ``-ss``
    :param end_time: segment end, passed to ffmpeg's ``-to``
    :param output_dir: folder receiving the temporary and the final file
    :return: path of the trimmed mp3, or None if downloading or trimming failed
    """
    temp_output_template = os.path.join(output_dir, f"{video_id}_temp.%(ext)s")
    temp_output_path = os.path.join(output_dir, f"{video_id}_temp.mp3")
    final_output_path = os.path.join(output_dir, f"{video_id}_{start_time}_{end_time}.mp3")

    download_command = [
        'path\\to\\yt-dlp.exe',
        '-x', '--audio-format', 'mp3', '--audio-quality', '0',
        f'https://www.youtube.com/watch?v={video_id}',
        '--output', temp_output_template
    ]

    download_process = subprocess.run(download_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # errors='replace': tool output is not guaranteed to be valid UTF-8
    print(download_process.stderr.decode(errors='replace'))

    if download_process.returncode != 0:
        print("Download failed. Skipping post-processing.")
        return None

    if not os.path.exists(temp_output_path):
        print("Downloaded file not found.")
        return None

    trim_command = [
        'ffmpeg',
        '-y',  # overwrite an existing output instead of waiting for a confirmation
        '-loglevel', 'error',
        '-i', temp_output_path,
        '-ss', str(start_time), '-to', str(end_time),
        '-acodec', 'copy',
        final_output_path
    ]

    try:
        trim_process = subprocess.run(trim_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(trim_process.stderr.decode(errors='replace'))

        if trim_process.returncode != 0:
            print("Error trimming the audio file.")
            return None
    finally:
        # Never leave the full-length download behind, even when trimming failed
        if os.path.exists(temp_output_path):
            os.remove(temp_output_path)

    return final_output_path if os.path.exists(final_output_path) else None

In [None]:
# clean_title function and functions to fetch lyrics

import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
from langdetect import detect_langs
import lyricsgenius
import tqdm
import os
import time


# Multi-word or bracketed phrases. They are applied before the bare tokens below,
# longest pattern first, so that a specific phrase such as "(Official Video Clip)"
# is removed as a whole instead of being broken up by a shorter pattern it contains.
# All patterns are matched case-insensitively.
_TITLE_PHRASE_PATTERNS = [
    r'\(with lyrics\)', r'Visual by [^\s]+', r'prod\.? by [^\s]+',
    r'\(official video\)', r'\(OFFICIAL VIDEO - HD\)',
    r'\[OFFICIAL MUSIC VIDEO - HD\]', r'\(Official Music Video\)', r'- Music Video',
    r'OFFICIAL VIDEO \(HD\)', r'\[Official Video\]', r'OFFICIAL VIDEO HD',
    r'Unofficial Video', r'\.mov', r'\[Free Download\]', r'\(full\)',
    r'\(directed by JD Films\)', r'\(OFFICIAL VIDEO 2010 \)',
    r'\(Official Video with Lyrics\)', r'\(official audio\+lyrics\)', r'\(Official Release\)',
    r'\(Official Video Clip\)',
    r'\(Official Video 2012\)', r'Visual by @MasterMindRichy',
    r'Directed By Manmeet Singh \[HD\]', r'Directed by: @SureShotGunny',
    r'\(Directed by F5 x Cherry Sky Studios\)',
    r'\(Official Music Video Directed By Robert Gallardo\)',
    r'DIRECTED BY: I SUPPOSE', r'\( Directed by @A_royal_Payne \)',
    r'With Lyrics!!', r'\+ Lyrics \(1995\)',
    r'\+ Lyrics', r'\+lyrics', r'w/ Lyrics', r'\(Lyrics & Translation\)',
    r'with Lyrics', r'\*with Lyrics\*', r'\(Lyrics video\)',
    r'\(With Lyrics On Screen\)', r'\(LYRICS ON SCREEN\)', r'\(lyrics\)',
    r'\[Lyrics\]', r'\[ New Video \+ Lyrics \+ Download \]',
    r'\[with Lyrics\]', r'\(w/ Lyrics\)', r'\[CLIP\] \[Dubstep Lyrics\]', r'DUBSTEP LYRICS',
    r'\( lyrics on screen \)', r'\[Lyrics included\]', r'\[Original Techno \+ LYRICS ON SCREEN\]',
    r'lyrics \(HQ\)', r'\(Lyrics On Screen Video HD\)',
    r'\[Video \+ Lyrics\]', r'New!!!',
    r'- Lyrics \+ Download \(Official 2010 Song\)',
    r'\(lyrics in description\)', r'\(w\. Official Lyrics\)',
    r'\( lyrics on screen\)', r'\+ Lyrics \[HQ\] 2009',
    r'\[HD\]', r'\(MUSIC\) Video', r'\*HD CDQ\*',
    r'\(NEW WIZ KHALIFA 2011 HD\)',
    r'\(LETRA\)\(2013\)\(HD\)\(link p/ download\)',
    r'Preview 1080HD', r'\[HD/HQ\]', r'\(New 2012 HD\)',
    r'\[Official HD-Video\]', r'Prod\.ByEray\.flv', r'HD Video',
    # Whole "(... prod ...)" / "[... produced ...]" groups. The character classes keep
    # a match inside ONE pair of brackets so that neighbouring groups are not swallowed.
    r'\([^()]*?\bprod\.?\b[^()]*?\)', r'\[[^\[\]]*?\bprod\.?\b[^\[\]]*?\]',
    r'\([^()]*?\bproduced\b[^()]*?\)', r'\[[^\[\]]*?\bproduced\b[^\[\]]*?\]',
    r'produced by [^\s]+', r'prod by [^\s]+',
    r'\(high quality\)', r'\(\s?\d{4}\s?\)', r'\(\d{4}\)',
]

# Bare tokens, applied last and in this order. The short ones are anchored on word
# boundaries so they no longer eat letters inside ordinary words
# ("Daft" -> "Da", "Birthday" -> "Birtay", "Mr.Brightside" -> "Mrghtside").
_TITLE_TOKEN_PATTERNS = [
    r'lyrics', r'official video', r'\baudio\b',
    r'\bft\b\.?', r'\bfeat\b\.?', r'\bHD\b', r'\bHQ\b',
    r'\bprod\b\.?',
    r'\.\w{3}\b',  # three-character file extensions such as ".mp3"
    r'►', r'♫',
]

_TITLE_CLEANUP_REGEXES = [
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in sorted(_TITLE_PHRASE_PATTERNS, key=len, reverse=True) + _TITLE_TOKEN_PATTERNS
]


def clean_title(title):
    """
    Remove common unwanted substrings from a video/song title.

    Every pattern is removed case-insensitively, specific phrases first and bare
    tokens afterwards; finally leading and trailing whitespace is stripped.
    Whitespace or empty brackets left in the middle of the title are kept.

    :param title: title string to clean
    :return: the cleaned title
    """
    for regex in _TITLE_CLEANUP_REGEXES:
        title = regex.sub('', title)
    return title.strip()

# Shared Genius client used by get_lyrics_genius below.
# NOTE(review): `key` is not defined in this cell; it presumably holds the Genius API
# access token and must be set before this line runs — confirm where it comes from.
genius = lyricsgenius.Genius(key)
# presumably silences the client's progress messages — confirm against lyricsgenius docs
genius.verbose = False
# presumably makes searches ignore results that are not songs — confirm against lyricsgenius docs
genius.skip_non_songs = True

def get_lyrics(artist, title, timeout=10):
    """
    Look up lyrics on lyrics.lyricfind.com.

    The page URL is built from the lower-cased artist and title with spaces
    replaced by hyphens. The lyrics are read from the JSON stored in the page's
    ``__NEXT_DATA__`` script tag, at props -> pageProps -> songData -> track -> lyrics.

    :param artist: artist name
    :param title: song title
    :param timeout: seconds to wait for the server before giving up
    :return: the lyrics, or None when the request fails or the page has no lyrics
    """
    url = f"https://lyrics.lyricfind.com/lyrics/{artist.lower().replace(' ', '-')}-{title.lower().replace(' ', '-')}"
    try:
        # Without a timeout one unresponsive request would stall the whole run
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Same best-effort policy as get_lyrics_genius: report and carry on
        print(f"Error fetching lyrics: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', id='__NEXT_DATA__')
    if not script_tag or not script_tag.string:
        return None

    try:
        node = json.loads(script_tag.string)
    except json.JSONDecodeError:
        return None

    # Walk down the nested JSON; any missing, null or non-dict level means "no lyrics"
    for field in ('props', 'pageProps', 'songData', 'track', 'lyrics'):
        if not isinstance(node, dict):
            return None
        node = node.get(field)
    return node or None

def is_english(text, num_trials=5, threshold=0.42):
    """
    Decide whether ``text`` is English according to ``detect_langs``.

    Detection is run up to ``num_trials`` times (always at least once) and the
    text is accepted as soon as one run reports 'en' with a probability of at
    least ``threshold``.

    :param text: text to classify
    :param num_trials: maximum number of detection runs
    :param threshold: minimum probability required for 'en'
    :return: True if any run accepted the text as English, otherwise False
    """
    for _ in range(max(1, num_trials)):
        candidates = detect_langs(text)
        if any(c.lang == 'en' and c.prob >= threshold for c in candidates):
            return True
    return False
    
def clean_lyrics(lyrics):
    """
    Strip non-lyric parts from a lyrics string.

    Drops the first line, removes bracketed tags that are directly followed by a
    line break (e.g. "[Chorus]") and a trailing "<digits>Embed" marker, then trims
    surrounding whitespace.

    :param lyrics: raw lyrics text
    :return: the cleaned lyrics
    """
    # Everything after the first line break; empty when there is only one line
    _header, _newline, body = lyrics.partition('\n')
    body = re.sub(r'\[.*?\]\n|\d*Embed$', '', body, flags=re.IGNORECASE)
    return body.strip()

def normalize_string(s):
    """
    Reduce a string to a form suited for loose comparison.

    Zero-width characters (U+200B to U+200D and U+FEFF) are removed, the text is
    lower-cased, and all whitespace and periods are dropped.

    :param s: string to normalize
    :return: the normalized string
    """
    without_invisible = re.sub(r'[\u200b-\u200d\ufeff]', '', s)
    lowered = without_invisible.lower()
    return re.sub(r'\s+|\.', '', lowered)

def get_lyrics_genius(artist, title):
    """
    Look up lyrics through the shared ``genius`` client.

    A search result is accepted only when its normalized artist name equals the
    normalized requested artist and its title (lower-cased) occurs inside the
    requested title (lower-cased). Any exception is printed and treated as
    "no lyrics".

    :param artist: artist name
    :param title: song title
    :return: the cleaned lyrics, or None when nothing acceptable was found
    """
    try:
        song = genius.search_song(title, artist)
        if not song:
            return None

        wanted_artist = normalize_string(artist)
        found_artist = normalize_string(song.artist)
        if found_artist != wanted_artist:
            return None
        if song.title.lower() not in title.lower():
            return None

        # An empty result after cleaning counts as "not found"
        return clean_lyrics(song.lyrics) or None
    except Exception as e:
        print(f"Error fetching lyrics: {e}")
        return None

#### Preprocessing

While for small amounts of data this step can be performed on the fly during training or testing, for the M4A dataset we convert lyrics to tokens and audio files to spectrograms ahead of time to optimize our pipeline. The processes used are the following.

For lyrics we use the roberta-large tokenizer and save token ids and attention masks

For audio files we use ASTFeatureExtractor, process the files in batches and save the results in pickle files.

In [None]:
# Lyrics Preprocessing Step
from transformers import AutoTokenizer
import string
import json
import tqdm

class DataPreprocessor:
    """Normalizes lyrics text and turns it into fixed-length tokenizer output."""

    # Translation table that deletes every character of string.punctuation
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, tokenizer, max_len=256):
        """
        :param tokenizer: object providing ``encode_plus``
        :param max_len: length every encoded text is padded or truncated to
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

    def preprocess(self, text):
        """
        Lower-case ``text``, delete punctuation and collapse all whitespace
        (including line breaks and tabs) into single spaces.

        :param text: raw text
        :return: the normalized text
        """
        without_punctuation = text.lower().translate(self._PUNCTUATION_TABLE)
        # split() with no argument breaks on any run of whitespace
        return ' '.join(without_punctuation.split())

    def tokenize_text(self, text):
        """
        Encode ``text`` with the tokenizer, padded/truncated to ``max_len``.

        :param text: text to encode
        :return: tuple ``(input_ids, attention_mask)`` taken from the encoding
        """
        encoding = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        return encoding['input_ids'], encoding['attention_mask']

# Load the roberta-large tokenizer and wrap it (max_len keeps its default of 256).
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
preprocessor = DataPreprocessor(tokenizer)

# Maps each entry's 'id' to its token ids and attention mask.
# NOTE(review): `df` is not defined in this cell; it must already hold a table with
# 'id' and 'lyrics' columns — confirm which earlier cell creates it.
encoded_data_dict = {}
for index, row in tqdm.tqdm(df.iterrows()):
    # Normalize the raw lyrics first, then encode the normalized text
    preprocessed_text = preprocessor.preprocess(row['lyrics'])
    input_ids, attention_mask = preprocessor.tokenize_text(preprocessed_text)
    encoded_data_dict[row['id']] = {'input_ids': input_ids, 'attention_mask': attention_mask}
    
# Store all encodings in a single JSON file
with open("./tokenized_texts.json", "w") as f:
    json.dump(encoded_data_dict, f)

In [None]:
# Audio Preprocessing Step
import os
import pickle

import librosa
import pandas as pd
import tqdm
from transformers import ASTFeatureExtractor

# Folders holding the "<id>.mp3" files; process_audio_files picks the first or the
# second one for each track.
audio_folders = ['./audios', './new_audios']
# Feature extractor loaded from the pretrained AST checkpoint, with its sampling rate
# set to 44100 — the same rate process_audio_files asks librosa to load audio at.
# NOTE(review): confirm that 44.1 kHz is what the downstream model expects.
feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593",sampling_rate= 44100)

def process_audio_files(audio_files, first):
    """
    Run the feature extractor on a collection of audio files.

    Each file is looked up as ``<id>.mp3`` in ``audio_folders[0]`` when the
    matching entry of ``first`` is truthy, otherwise in ``audio_folders[1]``.
    Files that fail to load or to convert are reported and left out of the result.

    :param audio_files: iterable of audio file IDs
    :param first: per-file flags, read by position with ``.iloc``
    :return: dict mapping audio file ID to the extractor's ``input_values``
    """
    input_values_dict = {}
    for i, audio_file_id in enumerate(tqdm.tqdm(audio_files)):
        folder_index = 0 if first.iloc[i] else 1
        audio_file_path = os.path.join(audio_folders[folder_index], f"{audio_file_id}.mp3")
        try:
            waveform, sample_rate = librosa.load(audio_file_path, sr=44100)
            input_values_dict[audio_file_id] = feature_extractor(
                waveform,
                sampling_rate=sample_rate,
                padding="max_length",
                return_tensors="pt",
            ).input_values
        except Exception as e:
            # Best effort: one bad file must not stop the whole batch
            print(f"Error processing {audio_file_path}: {e}")
    return input_values_dict

# Number of tracks stored per pickle file
batch_size = 5000
spectrogram_dir = './spectogram_pikles'
# open() does not create missing folders, so make sure the target exists first
os.makedirs(spectrogram_dir, exist_ok=True)

# NOTE(review): `df` is not defined in this cell; it must already hold a table with
# 'id' and 'spotify_id' columns — confirm which earlier cell creates it.
for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size]
    # Tracks whose id differs from their spotify_id are read from the first audio folder
    input_values_dict_batch = process_audio_files(batch_df['id'], batch_df['id'] != batch_df['spotify_id'])
    # Files are named after the index of the batch's first row
    output_file_path = os.path.join(spectrogram_dir, f"batch_{i}.pkl")
    with open(output_file_path, 'wb') as f:
        pickle.dump(input_values_dict_batch, f)