In [5]:
import pandas as pd
from numpy.random import shuffle

# Load the genre-tagged lyrics dataset that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer='data/song_lyrics_reduced_rap_rock_pop_rb_country_80000.csv')

In [6]:
# Show every distinct genre tag, then one "<tag>: <song count>" line per tag.
tags = df['tag'].unique()
print(', '.join(tags))
for tag in tags:
    # Summing the boolean mask counts the rows carrying this tag.
    print(tag, (df['tag'] == tag).sum(), sep=': ')

rap, rock, pop, rb, country
rap: 80000
rock: 80000
pop: 80000
rb: 80000
country: 80000


In [3]:
def make_data(genres, data, lim=-1):
    '''Write a reduced, per-genre-capped copy of the dataset to a CSV file.

    Only rows whose 'tag' column is one of `genres` are kept. For each genre
    (in the order given) the first `lim` rows are taken, so the result is
    balanced whenever every genre has at least `lim` songs.

    Parameters:
        genres: list of genre names (values of the 'tag' column) to keep.
        data:   DataFrame with at least a 'tag' column.
        lim:    maximum number of songs per genre. The sentinel -1 (default)
                means "take every song of each genre". Any other value is
                applied as a plain slice `[:lim]`.

    Side effects:
        Prints a short summary and writes
        'data/song_lyrics_reduced_<genres joined by _>_<lim or "all">.csv'
        (the 'data' directory must already exist).

    Returns:
        None.
    '''
    # Select only the given genres
    print('\nCreating new reduced dataset')
    new_df = data.loc[data['tag'].isin(genres)]

    # Output the total songs per genre
    amounts = [f'{genre}: {len(new_df.loc[new_df["tag"] == genre])}' for genre in genres]
    print(f'Total genre sizes - {", ".join(amounts)}')
    print(f'Old dataset size: {len(data)}, old dataset size with only these genres: {len(new_df)}')

    # -1 is a sentinel for "no limit". It must NOT be used as a slice bound:
    # `[:-1]` would silently drop the last song of every genre.
    take_all = lim == -1

    # Collect the per-genre selections and concatenate once, instead of
    # growing a DataFrame inside the loop.
    parts = []
    for genre in genres:
        genre_rows = new_df.loc[new_df['tag'] == genre]
        parts.append(genre_rows if take_all else genre_rows[:lim])

    if parts:
        new_data = pd.concat(parts, ignore_index=True)
    else:
        # pd.concat refuses an empty list; keep the header-only result.
        new_data = pd.DataFrame(columns=new_df.columns)

    # Save to csv
    label = 'all' if take_all else lim
    new_data.to_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{label}.csv')
    print(f'Succesfully wrote {genres} with size {len(new_data)} to file ({label} songs per genre)\n')

In [7]:
# Genres to keep and the per-genre song caps; one CSV is written per cap.
genres = 'rap rock rb country'.split()
lims = [80_000]

for lim in lims:
    make_data(genres=genres, data=df, lim=lim)

# genres = ['rap', 'rock', 'pop']
# lims = [100, 1000, 10000, 100000, 600000, -1]

# for lim in lims:
#     make_data(genres, df, lim)


Creating new reduced dataset
Total genre sizes - rap: 80000, rock: 80000, rb: 80000, country: 80000
Old dataset size: 400000, old dataset size with only these genres: 320000
Succesfully wrote ['rap', 'rock', 'rb', 'country'] with size 320000 to file (80000 songs per genre)



In [None]:
# # Print Bohemian Rhapsody
# test = pd.read_csv('data/song_lyrics_reduced_rap_rock_pop_10000.csv')
# print(test['lyrics'][10001])