dataset_filtering.ipynb
- Filter 'song_meta.json' from about 700,000 songs to about 80,000 songs on a specific condition
(e.g., ballad, R&B, Idol, 10s- ...)

In [43]:
# Load original 'song_meta.json' to @df
import json
import pandas as pd

with open('./song_meta.json', 'r', encoding='UTF8') as f:
    data = json.load(f)

df = pd.json_normalize(data)

In [44]:
df.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4


In [48]:
# filtering dataset on a specific condition
def cleaning_dataset(df) -> pd.DataFrame:
    
    # select genre
    # 발라드, 댄스, R&B, 인디, 아이돌
    select_genre_list = ['GN0100','GN0200','GN0400','GN0500','GN2500']
    filter = df.song_gn_gnr_basket.apply(lambda x: any(genre for genre in select_genre_list if genre in x))
    df = df[filter]
    
    # except genre
    # 록/메탈, 성인가요, 포크/블루스, POP, 록/메탈, 일렉트로니카, 랩/힙합, R&B/SOUL, 포크/블루스/컨트리 ... 
    except_genre_list = ['GN600', 'GN700', 'GN800', 'GN900', 'GN1000', 'GN1100', 'GN1200', 'GN1300', 'GN1400', 
                         'GN1500', 'GN1600', 'GN1700', 'GN1800', 'GN1900', 'GN2000', 'GN2100', 'GN2200', 'GN2300', 
                         'GN2400', 'GN2600', 'GN2700', 'GN2800', 'GN2900', 'GN3000']
    
    # except detail genre, e.g., ballad in 80s ~ 90s
    except_dt_genre_list = ['GN0102', 'GN0103', 'GN0104', 'GN0202', 'GN0203', 'GN0204', 'GN0504', 'GN0507', 'GN0508']

    filter = df.song_gn_gnr_basket.apply(lambda x: any(genre for genre in except_genre_list if genre in x))
    df = df[~filter]
    
    filter = df.song_gn_dtl_gnr_basket.apply(lambda x: any(genre for genre in except_dt_genre_list if genre in x))
    df = df[~filter]

    # except outdated music (target_date: 2000.01.01)
    target_date = '20100101'
    df = df[df['issue_date'] > target_date]

    # except other outliers
    filter = df.artist_name_basket.apply(lambda x: 'Various Artists' not in x)
    df = df[filter]
    df = df[~df['album_name'].str.contains('70|80|추억의|베스트|Live|모음|컬렉션', na=False)]
    df = df[~df['song_name'].str.contains('Inst.', na=False)]

    df.reset_index(drop=True, inplace=True)
    return df

new_df = cleaning_dataset(df)

In [52]:
new_df.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,"[GN0105, GN0101]",20170320,Pastel Reflection,10047088,[753752],"사랑, 그대라는 멜로디",[GN0100],[진호],9
1,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20160226,Melting,2669407,[750053],Girl Crush,"[GN2500, GN0200]",[마마무 (Mamamoo)],17
2,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20150205,내가 부른 그림 2,2303168,[230399],무얼 기다리나 (Feat. 조원선),"[GN0500, GN0800]",[이영훈],19
3,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20120629,남몰래 듣기,2133128,[681291],찾고 있니,"[GN0500, GN0800]",[이호석],35
4,"[GN0509, GN0501, GN0304, GN0505, GN0301]",20140828,Clarity,2278112,[588331],Walk Alone,"[GN0500, GN0300]",[LHA],47


In [53]:
new_df.to_json('cleaned_song_meta.json', orient='records')

In [54]:
with open('./cleaned_song_meta.json', 'r', encoding='UTF8') as f:
    data = json.load(f)
df = pd.json_normalize(data)

In [56]:
df.describe()

Unnamed: 0,album_id,id
count,79814.0,79814.0
mean,5808014.0,354658.953354
std,4033039.0,204096.191739
min,206677.0,9.0
25%,2215349.0,178057.0
50%,2682101.0,354440.0
75%,10156660.0,531668.25
max,10422060.0,707980.0
