This notebook keeps the information of duplicated songs and identifies which songs are covers (duplicate lyrics by different artists) and duplicates (duplicate lyrics by the same artists).

In [1]:
# mount GDrive
from google.colab import drive
#drive.mount('/content/drive')
drive._mount('/content/drive')

Mounted at /content/drive


In [2]:
# load dataset_10
!cp -r "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/data_lyrics_group_decades" .
!cp -r "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/data_lyrics_person_decades" .

# load results of duplicate detection
!cp "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/cliques_exact_duplicate_lyrics.json" . 

# load all artists info
!cp "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/data/artists_info.json.gz" .


In [3]:
%pylab inline
import pandas as pd
import glob

from tqdm.auto import tqdm
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib


In [4]:
# get number of songs across time

data_folders = ['data_lyrics_person_decades/', 'data_lyrics_group_decades/']

song_lyrics = []

for data_folder in data_folders:
    type_ = data_folder.split('_')[-2]
    for file in glob.glob(data_folder+'*_[!.]*.json.gz'): # skip songs with no dates

        data_chunk = pd.read_json(file, orient='records', lines=True, chunksize=5000 )
        for chunk in data_chunk:        

            chunk = chunk[['song_id', 'song_title', 'language_detect', 'lyrics', 'n_words', 'n_lines', 'artist_id', 'artist_name',
                            'song_pubdate_combined', 'song_year_combined', 'song_decade_combined', 'other_artist_info']]
            song_lyrics.append(chunk)

song_lyrics = pd.concat(song_lyrics)
print('Total number of song lyrics: ', song_lyrics.shape[0])
song_lyrics.head()

Total number of song lyrics:  460339


Unnamed: 0,song_id,song_title,language_detect,lyrics,n_words,n_lines,artist_id,artist_name,song_pubdate_combined,song_year_combined,song_decade_combined,other_artist_info
0,5714dec325ac0d8aee380b13,Tears All Over Town,english,Ride the subway home\nPretend that you're not ...,192,33,56d7e9416b60c09814f93eb7,A Girl Called Eddy,2004-08-10,2004,2000,"{'n_albums': 2, 'n_songs': 16, 'languages': {'..."
1,5714dec325ac0d8aee380b14,Kathleen,english,If I don't laugh i'll cry\nAt all the years go...,201,39,56d7e9416b60c09814f93eb7,A Girl Called Eddy,2004-08-10,2004,2000,"{'n_albums': 2, 'n_songs': 16, 'languages': {'..."
2,5714dec325ac0d8aee380b15,Girls Can Really Tear You Up Inside,english,There's a girl whose stuck inside a picture fr...,233,38,56d7e9416b60c09814f93eb7,A Girl Called Eddy,2004-08-10,2004,2000,"{'n_albums': 2, 'n_songs': 16, 'languages': {'..."
3,5714dec325ac0d8aee380b16,The Long Goodbye,english,You stormed my winter palace\nYou rid my heart...,202,36,56d7e9416b60c09814f93eb7,A Girl Called Eddy,2004-08-10,2004,2000,"{'n_albums': 2, 'n_songs': 16, 'languages': {'..."
4,5714dec325ac0d8aee380b17,Somebody Hurt You,english,Boy\nSomebody hurt you\nBoy\nI wish I knew who...,210,42,56d7e9416b60c09814f93eb7,A Girl Called Eddy,2004-08-10,2004,2000,"{'n_albums': 2, 'n_songs': 16, 'languages': {'..."


In [5]:
# load cliques
cliques = pd.read_json("cliques_exact_duplicate_lyrics.json")
cliques.head()

Unnamed: 0,song_id,group_id
0,5714deec25ac0d8aee573522,0
1,5714deec25ac0d8aee5735f8,0
2,5714ded925ac0d8aee493c75,1
3,5714ded925ac0d8aee493ca1,1
4,5714decd25ac0d8aee3fa956,1


In [6]:
# group all lyrics of a clique
duplicated_lyrics = cliques.groupby("group_id").apply(lambda rows: 
                                  pd.Series({'song_ids':rows.song_id.tolist()}))
duplicated_lyrics.head()

Unnamed: 0_level_0,song_ids
group_id,Unnamed: 1_level_1
0,"[5714deec25ac0d8aee573522, 5714deec25ac0d8aee5..."
1,"[5714ded925ac0d8aee493c75, 5714ded925ac0d8aee4..."
2,"[5714decb25ac0d8aee3e44c1, 5714deca25ac0d8aee3..."
4,"[5714ded225ac0d8aee43d44c, 5714ded225ac0d8aee4..."
5,"[5714ded125ac0d8aee42bb07, 5714ded125ac0d8aee4..."


In [7]:
# load artists
artists = pd.read_json("artists_info.json.gz", orient='records', lines=True)
artists.head()

Unnamed: 0,lifeSpan,nameVariations,labels,deezerFans,n_unknown,gender,abstract,id_artist_discogs,urlWikipedia,subject,urlPureVolume,artist_name,recordLabel,urlMusicBrainz,urls,urlSoundCloud,id_artist_deezer,urlDeezer,disambiguation,urlOfficialWebsite,location,urlYouTube,name_accent_fold,urlMySpace,urlWikidata,urlFacebook,languages,type,urlTwitter,urlRateYourMusic,members,locationInfo,dbp_abstract,genres,urlAmazon,id_artist_musicbrainz,dbp_genre,n_male,n_songs,urlDiscogs,n_female,urlWikia,nameVariations_fold,artist_id,urlITunes,n_albums,n_members,urlAllmusic,urlSpotify,urlBBC,urlInstagram,urlLastFm,urlSecondHandSongs,urlGooglePlus
0,"{'ended': False, 'begin': '1995', 'end': ''}",['A'],[],6519.0,0.0,,"Alternative rock band formed in Leeds, England...",72848.0,http://en.wikipedia.org/wiki/A_(band),"[Musical groups established in 1995, English a...",,A,[Warner Bros. Records],http://musicbrainz.org/artist/55c6eb6e-8388-49...,[http://www.myspace.com/officialA],,3412.0,http://www.deezer.com/artist/3412,British band,http://www.a-communication.com/,{'id_city_musicbrainz': '6e2d2d30-dbc9-4d27-99...,,A,,https://www.wikidata.org/wiki/Q300307,,"{'english': 98, 'unknown': 3, 'spanish': 1}",Group,,http://rateyourmusic.com/artist/a,[{'id_member_musicbrainz': '3ec05e94-bf6e-439f...,"[England, West Yorkshire, Leeds]",A (later changed to A + R) are a British alter...,[],http://www.amazon.com/asdf/e/B000APPUE6?tag=wi...,55c6eb6e-8388-497c-acaf-dbff584d0c3a,"[Alternative rock, Pop punk, Hard rock]",6.0,102,http://www.discogs.com/artist/72848,0.0,A,['A'],56d7e91b6b60c09814f93e4a,https://itunes.apple.com/us/artist/id635168856,6,6.0,http://www.allmusic.com/artist/mn0000474971,,,,,,
1,"{'ended': False, 'begin': '2010-04-18', 'end':...",,[],,1.0,,,,,,,A (エース) (ACE),,http://musicbrainz.org/artist/51257cf7-1672-45...,,,,,Japanese Band,http://a-rock.jp/index.php,"{'id_city_musicbrainz': '', 'country': 'Japan'...",,A,,,,"{'english': 24, 'hausa': 4, 'unknown': 3, 'tur...",Group,,,[{'id_member_musicbrainz': '82bd3da4-7085-40b8...,[Japan],,"[J-Rock, Visual Kei]",,51257cf7-1672-4580-ae5c-93eefe3684fb,,0.0,34,,0.0,A_(%E3%82%A8%E3%83%BC%E3%82%B9)_(ACE),[],56d7e91c6b60c09814f93e4c,https://itunes.apple.com/us/artist/id4328888,7,1.0,,,,,,,
2,"{'ended': False, 'begin': '', 'end': ''}","[a balladeer, A BALLADEER AND FRIENDS, a balla...",[],423.0,1.0,,A Balladeer (stylised as 'a balladeer') is Dut...,472300.0,https://en.wikipedia.org/wiki/A_Balladeer,,,A Balladeer,,http://musicbrainz.org/artist/8cb0ebc9-db95-47...,[http://www.aballadeer.com/],,242156.0,http://www.deezer.com/artist/242156,,http://www.aballadeer.com/,"{'id_city_musicbrainz': '', 'country': 'Nether...",,A Balladeer,https://myspace.com/aballadeer,https://www.wikidata.org/wiki/Q4655340,https://www.facebook.com/aballadeer,{'english': 29},Group,https://twitter.com/aballadeerhere,,[{'id_member_musicbrainz': '2931cbb9-56a0-4a96...,[],,[],http://www.amazon.com/asdf/e/B003BF7QWG?tag=wi...,8cb0ebc9-db95-4748-81df-8e1e24e70541,,0.0,29,http://www.discogs.com/artist/472300,0.0,A_Balladeer,"[a balladeer, A BALLADEER AND FRIENDS, a balla...",56d7e91d6b60c09814f93e4e,https://itunes.apple.com/us/artist/id130037087,4,1.0,http://www.allmusic.com/artist/mn0001591642,https://play.spotify.com/artist/5MUNbMtqB3EOKx...,,,,,
3,"{'ended': False, 'begin': '', 'end': ''}",,[],0.0,,,,,,,,A Beautiful Silence,,http://musicbrainz.org/artist/4616c4f1-fe79-40...,,,4708137.0,http://www.deezer.com/artist/4708137,,,"{'id_city_musicbrainz': '', 'country': '', 'ci...",,A Beautiful Silence,,,,{'english': 23},,,,[],"[United States, Michigan, Marquette]",,[],http://www.amazon.com/asdf/e/B001LI3SMC?tag=wi...,4616c4f1-fe79-40f0-ac8d-2b319528b683,,,23,,,A_Beautiful_Silence,[],56d7e91e6b60c09814f93e50,https://itunes.apple.com/us/artist/id115104139,2,,http://www.allmusic.com/artist/mn0001930454,https://play.spotify.com/artist/2FcgcBYwiCDG37...,,,,,
4,"{'ended': False, 'begin': '2001', 'end': ''}",[],[],32.0,4.0,,,407539.0,http://en.wikipedia.org/wiki/A_Band_Called_Pain,"[Musical groups from Oakland, California, Afri...",,A Band Called Pain,[Hieroglyphics Imperium Recordings],http://musicbrainz.org/artist/e5fd8fd1-9073-45...,[http://abandcalledpain.com],,1006041.0,http://www.deezer.com/artist/1006041,,http://www.abandcalledpain.com/,"{'id_city_musicbrainz': '', 'country': 'United...",,A Band Called Pain,https://myspace.com/abandcalledpain,https://www.wikidata.org/wiki/Q4655349,,"{'english': 1, 'unknown': 32}",Group,,,[{'id_member_musicbrainz': '2f647f4c-6272-4ad9...,"[United States, California, Oakland]",A Band Called Pain (abbreviated ABCP) is an Am...,[],http://www.amazon.com/asdf/e/B001LHOG4M?tag=wi...,e5fd8fd1-9073-4586-a741-e44164e543db,[Heavy metal music],0.0,33,http://www.discogs.com/artist/407539,0.0,A_Band_Called_Pain,[],56d7e91e6b60c09814f93e52,https://itunes.apple.com/us/artist/id83305886,2,4.0,http://www.allmusic.com/artist/mn0000843313,https://play.spotify.com/artist/4g3RlzXVHjXaPp...,,,,,


In [8]:
# get all the name of members of each group
group_id_members_map = artists[artists['type']=='Group'].apply(lambda row: pd.Series({'artist_id':row.artist_id,
                                                      'member_names':[mbr['name'] for mbr in row.members]}), axis=1)

group_id_members_map = group_id_members_map[group_id_members_map.artist_id.isin(song_lyrics.artist_id)]
group_id_members_map.head()

Unnamed: 0,artist_id,member_names
0,56d7e91b6b60c09814f93e4a,"[Adam Perry, Mark Chapman, Jason Perry, John M..."
18,56d7e9276b60c09814f93e6c,"[Anders Hernestam, Nathan Larson, Mark Linkous..."
50,56d7e93e6b60c09814f93eae,"[Frank Maudsley, Mike Score, Paul Reynolds, Al..."
67,56d7e94a6b60c09814f93ecc,[Dave Couse]
81,56d7e9536b60c09814f93ee6,[Matty Arsenault]


In [9]:
# there are artists belonging to more than 1 groups
group_id_members_map.explode('member_names').member_names.duplicated().sum()

2719

In [10]:
# artist names are keys, values are the list of groups the artist belongs to
group_id_members_map = group_id_members_map.explode('member_names').reset_index().groupby('member_names').apply(lambda rows:
                                                                                         rows.artist_id.tolist())
group_id_members_map.head()

member_names
1,000,000 Light Years    [56d8306153a7ddfc01f95195]
2D                       [56d837f153a7ddfc01f95b97]
2Mex                     [56d99156cc2ddd0c0f6be890]
3D                       [56d93a9fce06f50c0fed82d3]
50 Cent                  [56d8331653a7ddfc01f95536]
dtype: object

In [11]:
def member_of_group(artist_name, group_id_members_map, is_group):

    if is_group or artist_name not in group_id_members_map.index:
        return np.nan

    groups = group_id_members_map[artist_name]
    groups = list(set(groups))
    return groups
    
def get_covers_and_duplicates(song_ids):
    '''
    Given a group of duplicated lyrics, this function determines whether each song_id refers to a duplicated or cover song.
    It returns a dictionary with two items:
    'duplicated_lyrics_artist' : list of dictionaries, grouping the group of duplicated lyrics by artist. Each item contains song_ids performed by the same artist/group
    'cover_songs_dict' : dict, contains the oldest song and all the others are considered as covers. All of them are performed by different artists
    
    Note: we know the publication date of songs at the level of year, thus we can not distinguish the "origina" song if two of them were performed in the same year.
    '''

    # it has 5 columns: artist_id, type, belong_to_groups, song_id, song_year
    song_ids_df = pd.DataFrame(song_ids)
    
    # split groups and person (explode persons' belong_to_groups)
    song_ids_group_df = song_ids_df[song_ids_df['type']=='Group']
    song_ids_person_df = song_ids_df[song_ids_df['type']=='Person']
    song_ids_person_df_ = song_ids_person_df.explode('belong_to_groups')

    # artists may belong to more than one group. Let us consider only the belonging to one of them
    # This allows us to avoid fake duplicates
    song_ids_person_df_ = song_ids_person_df_.drop_duplicates(subset=['song_id'], keep='first')
    
    #.explode('belong_to_groups')

    # GET DUPLICATED SONGS
    # 1. duplicated of grouops and single artists being members of group
    # loop across group_ids
    duplicated_lyrics_ids = [] # the song ids considered as duplicates
    duplicated_lyrics_artist = [] # list containing for each artist, the list of duplicated songs
    artist_ids_in_groups = []
    for group_id, rows in song_ids_group_df.groupby('artist_id'):

        rows_cp = rows.copy()
        # take the persons belonging to this group
        song_ids_persons_belong_group = song_ids_person_df_[song_ids_person_df_.belong_to_groups==group_id]
        artist_ids_in_groups.extend(song_ids_persons_belong_group.artist_id.tolist())

        # concat this to rows (sort by year)
        # this contains all the song lyrics performed by the group group_id, or one of the members of group_id
        rows_cp = pd.concat([rows_cp, song_ids_persons_belong_group]).sort_values('song_year')

        # the first row is the first "original song"
        # all the others are duplicates performed by the same artists
        original_song = rows_cp.iloc[0]
        duplicated_songs = rows_cp[1:].song_id.tolist()

        if len(duplicated_songs)>0:
            duplicated_lyrics_artist.append({'song_id':original_song.song_id,
                                            'artist_id':original_song.artist_id,
                                            'song_year':original_song.song_year,
                                            'duplicated_songs':duplicated_songs})
            duplicated_lyrics_ids.extend(duplicated_songs)

        # 

    # 2. duplicated of single artists not being members of group
    song_ids_person_remaining_df = song_ids_person_df[~song_ids_person_df.artist_id.isin(artist_ids_in_groups)]
    for artist_id, rows in song_ids_person_remaining_df.groupby('artist_id'):

        rows = rows.sort_values('song_year')
        original_song = rows.iloc[0]
        duplicated_songs = rows[1:].song_id.tolist()

        if len(duplicated_songs)>0:
            duplicated_lyrics_artist.append({'song_id':original_song.song_id,
                                            'artist_id':original_song.artist_id,
                                            'song_year':original_song.song_year,
                                            'duplicated_songs':duplicated_songs})
            duplicated_lyrics_ids.extend(duplicated_songs)



    # GET COVERS
    song_ids_covers_df = song_ids_df[~song_ids_df.song_id.isin(duplicated_lyrics_ids)].sort_values('song_year')

    # the first one is the "original song", all the others are covers
    original_song = song_ids_covers_df.iloc[0]
    cover_songs = song_ids_covers_df[1:].drop(columns=['type', 'belong_to_groups']).to_dict(orient='records')

    cover_songs_dict = {'song_id':original_song.song_id,
                        'artist_id':original_song.artist_id,
                        'song_year':original_song.song_year,
                        'covers':cover_songs}

    return {'duplicated_lyrics_artist':duplicated_lyrics_artist, 'cover_songs_dict':cover_songs_dict}

In [12]:
# add the artis id to the song id
song_lyrics = song_lyrics.set_index('song_id')

# extract some info of all the lyrics in the duplicated group
duplicated_lyrics.loc[:, 'song_ids'] = duplicated_lyrics.song_ids.apply(lambda ids:
                                                    [{'artist_id':song_lyrics.loc[id_].artist_id, 
                                                      'type':song_lyrics.loc[id_].other_artist_info['type'], 
                                                      'belong_to_groups':member_of_group(song_lyrics.loc[id_]['artist_name'], group_id_members_map, song_lyrics.loc[id_].other_artist_info['type']=='Group'),
                                                      'song_id':id_,
                                                      'song_year':song_lyrics.loc[id_]['song_year_combined']} for id_ in ids])

# For each group, I want to identify the covers and the duplicated from the same artist
# STEPS: start with the group of duplicated lyrics defined in the previous step
# for each artist_id, group their lyrics. In that way, the oldest is the "original and firs", while all the others are duplicates of the same artist
# remove the song_ids corresponding to artist duplicates from the initial group of duplicated lyrics.
# sort the lyrics in chronological order
# Here we have a list of song_id from unique artists. The oldest is the original one, the others are cover
# These steps performed by the function get_covers_and_duplicates


duplicated_lyrics = duplicated_lyrics.merge(duplicated_lyrics.song_ids.progress_apply(lambda ids: 
                                                                              pd.Series(get_covers_and_duplicates(ids))),
                                            left_index=True, right_index=True)


# Now take all the duplicated or cover songs and label them as cover or duplicated (incliding the song_id they are duplicated or cover of)
duplicated_and_cover_songs = []
for idx, row in duplicated_lyrics.iterrows():

    duplicated_lyrics_, cover_lyrics = row.duplicated_lyrics_artist, row.cover_songs_dict

    for duplicated_lyric in duplicated_lyrics_:
        original_song = duplicated_lyric['song_id']
        duplicates = duplicated_lyric['duplicated_songs']
        for duplicate in duplicates:
            duplicated_and_cover_songs.append({'song_id':duplicate, 
                                               'duplicated_of':original_song,
                                               'cover_of':np.nan})

    if len(cover_lyrics['covers'])>0:
        for cover in cover_lyrics['covers']:
            duplicated_and_cover_songs.append({'song_id':cover['song_id'], 
                                                'duplicated_of':np.nan,
                                                'cover_of':cover_lyrics['song_id']})

duplicated_and_cover_songs = pd.DataFrame(duplicated_and_cover_songs)
duplicated_and_cover_songs.loc[:, 'is_duplicated'] = ~duplicated_and_cover_songs.duplicated_of.isna()
duplicated_and_cover_songs.loc[:, 'is_cover'] = ~duplicated_and_cover_songs.cover_of.isna()

song_lyrics = song_lyrics.reset_index()
duplicated_lyrics.shape[0]

  0%|          | 0/54838 [00:00<?, ?it/s]

54838

In [13]:
print("Total number of songs: ", song_lyrics.shape[0])
print("Number of duplicated songs: ", duplicated_and_cover_songs.is_duplicated.sum())
print("Number of cover songs: ", duplicated_and_cover_songs.is_cover.sum())
print("Are there duplicated songs in the list? ", duplicated_and_cover_songs.song_id.duplicated().any())

Total number of songs:  460339
Number of duplicated songs:  82531
Number of cover songs:  7524
Are there duplicated songs in the list?  False


In [14]:
duplicated_and_cover_songs.head()

Unnamed: 0,song_id,duplicated_of,cover_of,is_duplicated,is_cover
0,5714deec25ac0d8aee5735f8,5714deec25ac0d8aee573522,,True,False
1,5714dec625ac0d8aee3a5ee7,5714dec625ac0d8aee3a5d8d,,True,False
2,5714ded925ac0d8aee493bfd,5714ded925ac0d8aee493af6,,True,False
3,5714ded925ac0d8aee493c75,5714ded925ac0d8aee493af6,,True,False
4,5714ded925ac0d8aee493ca1,5714ded925ac0d8aee493af6,,True,False


In [15]:
duplicated_and_cover_songs.to_json("final_duplicates_and_covers.json")

In [16]:
!cp final_duplicates_and_covers.json "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10"
