This notebook builds a copy of dataset_10 from which the songs marked as duplicates or covers have been removed. The resulting dataset (`dataset_10_no_duplicates`) should be used whenever the presence of duplicated songs could affect the results of an analysis.

In [1]:
# mount GDrive at /content/drive; the cells below copy the dataset from it
from google.colab import drive
# NOTE(review): the public call is kept commented out below and the
# underscore-prefixed (private by convention) drive._mount is used instead.
# The reason is not visible in this notebook — confirm it is still needed.
#drive.mount('/content/drive')
drive._mount('/content/drive')

Mounted at /content/drive


In [2]:
# load dataset_10: copy the two lyrics folders from Drive into the current directory
# NOTE(review): the source paths are relative ("drive/MyDrive/..."), so this
# presumably relies on the working directory being /content, where Drive is
# mounted — confirm before running from elsewhere.
!cp -r "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/data_lyrics_group_decades" .
!cp -r "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/data_lyrics_person_decades" .

# load duplicated songs info (the table of songs marked as duplicates or covers)
!cp "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/dataset_10/final_duplicates_and_covers.json" .


In [8]:
import re
import pandas as pd
import glob
import json

In [4]:
# Read the table listing every song lyric flagged as a duplicate or a cover,
# and keep the flagged song ids in a set for fast membership checks.
duplicated_songs_info = pd.read_json("final_duplicates_and_covers.json")

ids_duplicated_songs = set(duplicated_songs_info["song_id"])

n_flagged_songs = len(duplicated_songs_info)
print("Total number of covers and duplicates: ", n_flagged_songs)
duplicated_songs_info.head()

Total number of covers and duplicates:  90055


Unnamed: 0,song_id,duplicated_of,cover_of,is_duplicated,is_cover
0,5714deec25ac0d8aee5735f8,5714deec25ac0d8aee573522,,True,False
1,5714dec625ac0d8aee3a5ee7,5714dec625ac0d8aee3a5d8d,,True,False
2,5714ded925ac0d8aee493bfd,5714ded925ac0d8aee493af6,,True,False
3,5714ded925ac0d8aee493c75,5714ded925ac0d8aee493af6,,True,False
4,5714ded925ac0d8aee493ca1,5714ded925ac0d8aee493af6,,True,False


In [5]:
# make new folders
# -p creates the parent folder as needed and does not fail when the folders
# already exist, so this cell can be re-run safely.
!mkdir -p dataset_10_no_duplicates/data_lyrics_person_decades
!mkdir -p dataset_10_no_duplicates/data_lyrics_group_decades

In [6]:
def write_json_rows(file, df):
    """Append every row of ``df`` to ``file`` as one JSON object per line.

    Parameters
    ----------
    file : str or path-like
        Destination path. It is opened in append mode, so successive calls
        (for example one call per chunk) accumulate rows in the same file.
        The file is created if it does not exist yet, even when ``df`` is
        empty.
    df : pandas.DataFrame
        Rows to serialise. Column labels become the JSON keys.

    Raises
    ------
    TypeError
        Propagated from ``json.dumps`` when a cell value is not JSON
        serialisable.
    """
    # Build the records column-wise. iterrows() creates one Series per row and
    # therefore does not preserve dtypes across columns: an integer column
    # sitting next to a float column would be written as 1.0 instead of 1.
    records = df.to_dict(orient="records")

    with open(file, 'a') as ww:
        ww.writelines(json.dumps(record) + "\n" for record in records)

In [9]:
# Remove the songs flagged as duplicates or covers from every decade file and
# write the remaining rows, as JSON lines, under dataset_10_no_duplicates/.
# Two counters keep track of how many rows were kept and how many were dropped.

data_folders = ['data_lyrics_person_decades/', 'data_lyrics_group_decades/']

n_discarded_lyrics = 0
n_final_dataset = 0

for data_folder in data_folders:
    type_ = data_folder.split('_')[-2]  # 'person' or 'group'; not used further in this cell
    for file in glob.glob(data_folder+'*_[!.]*.json.gz'): # skip songs with no dates

        # Drop the trailing ".gz" by slicing (the glob pattern guarantees the
        # suffix). str.rstrip(".gz") strips any run of the characters '.', 'g'
        # and 'z', so it would also eat into a stem ending in one of them.
        file_to_save = 'dataset_10_no_duplicates/' + file[:-len(".gz")]

        print("Reading file: ", file)
        print("Saving on file: ", file_to_save)
        print()

        # write_json_rows appends to its target. Start from an empty file so
        # that re-running this cell does not write the same rows a second time.
        with open(file_to_save, 'w'):
            pass

        # Stream the file in chunks of 5000 rows to keep memory usage bounded.
        data_chunk = pd.read_json(file, orient='records', lines=True, chunksize=5000 )
        for chunk in data_chunk:

            n_rows = chunk.shape[0]

            # remove duplicate songs
            chunk = chunk[~chunk.song_id.isin(ids_duplicated_songs)]

            n_discarded_lyrics += n_rows - chunk.shape[0]
            n_final_dataset += chunk.shape[0]

            # save it
            write_json_rows(file_to_save, chunk)

print("Final dataset size: ", n_final_dataset)
print("Number of duplicated lyrics discarded: ", n_discarded_lyrics)

Reading file:  data_lyrics_person_decades/lyrics_2000.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_2000.json

Reading file:  data_lyrics_person_decades/lyrics_1980.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_1980.json

Reading file:  data_lyrics_person_decades/lyrics_1970.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_1970.json

Reading file:  data_lyrics_person_decades/lyrics_1960.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_1960.json

Reading file:  data_lyrics_person_decades/lyrics_1990.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_person_decades/lyrics_1990.json

Reading file:  data_lyrics_group_decades/lyrics_2000.json.gz
Saving on file:  dataset_10_no_duplicates/data_lyrics_group_decades/lyrics_2000.json

Reading file:  data_lyrics_group_decades/lyrics_1980.json.gz
Saving on file:  dataset_10_no_duplicates/data_

In [10]:
# gzip all files
# Only the freshly written .json files are compressed. -f lets gzip replace an
# archive left over from a previous run; without it gzip refuses to overwrite
# and the stale .json.gz would be kept next to the new .json.
!gzip -f dataset_10_no_duplicates/data_lyrics_person_decades/*.json
!gzip -f dataset_10_no_duplicates/data_lyrics_group_decades/*.json

In [11]:
# copy the dataset_10_no_duplicates folder to Drive (cp -r copies; the local folder is kept)
!cp -r dataset_10_no_duplicates "drive/MyDrive/Artistic_Content_Creation/WASABI_gender_experiments/WASABI_gender_experiments_definitive/"
