In [1]:
from pathlib import Path
import pandas as pd
from collections import Counter
import random

### files number check (audios vs. sections)

In [2]:
AUDIO_DIR = Path("/home/z/下载/打包结果_audio/")
SEC_DIR   = Path("/home/z/下载/打包结果_section/")

In [3]:
AUDIO_EXT = ".mp3"
SEC_SUFFIX = "_measure.sec"

In [4]:
audio_files = sorted(AUDIO_DIR.rglob(f"*{AUDIO_EXT}"))
sec_files = sorted(SEC_DIR.rglob(f"*{SEC_SUFFIX}"))

print("audio_files:", len(audio_files))
print("sec_measure_files:", len(sec_files))

audio_files: 20072
sec_measure_files: 15629


In [6]:
def audio_id(p: Path) -> str:
    # filename without .mp3
    return p.name[:-len(AUDIO_EXT)]

def sec_id(p: Path) -> str:
    # filename without _measure.sec
    return p.name[:-len(SEC_SUFFIX)]

audio_df = pd.DataFrame({
    "id": [audio_id(p) for p in audio_files],
    "audio_path": [str(p) for p in audio_files],
})
sec_df = pd.DataFrame({
    "id": [sec_id(p) for p in sec_files],
    "sec_path": [str(p) for p in sec_files],
})

audio_df.head(), sec_df.head()

(                                              id  \
 0        070-shake_guilty-conscience_kygz-KaPmKB   
 1             07th-expansion_fascism_d_gw-JNboGV   
 2  100-gecs_hand-crushed-by-a-mallet_nLgaXlzGmYp   
 3     100-orange-juice_arthurs-theme_RPxeWMB-mb_   
 4     100-orange-juice_arthurs-theme_WeglbkMLorY   
 
                                           audio_path  
 0  /home/z/下载/打包结果_audio/070-shake_guilty-conscie...  
 1  /home/z/下载/打包结果_audio/07th-expansion_fascism_d...  
 2  /home/z/下载/打包结果_audio/100-gecs_hand-crushed-by...  
 3  /home/z/下载/打包结果_audio/100-orange-juice_arthurs...  
 4  /home/z/下载/打包结果_audio/100-orange-juice_arthurs...  ,
                                               id  \
 0        070-shake_guilty-conscience_kygz-KaPmKB   
 1             07th-expansion_fascism_d_gw-JNboGV   
 2  100-gecs_hand-crushed-by-a-mallet_nLgaXlzGmYp   
 3     100-orange-juice_arthurs-theme_RPxeWMB-mb_   
 4     100-orange-juice_arthurs-theme_WeglbkMLorY   
 
                       

In [7]:
print("audio unique ids:", audio_df["id"].nunique())
print("sec unique ids:", sec_df["id"].nunique())

audio unique ids: 20072
sec unique ids: 15629


In [12]:
len(set(audio_df["id"]) - set(sec_df["id"])), len(set(audio_df["id"]).intersection(set(sec_df["id"]))), len(set(sec_df["id"]) - set(audio_df["id"])) 

(4443, 15629, 0)

### labels check

In [5]:
SEC_DIR = Path("/home/z/下载/打包结果_section/")

In [6]:
sec_files = list(SEC_DIR.rglob("*_measure.sec"))

In [7]:
def extract_label_from_line(line):
    line = line.strip()
    if not line:
        return None

    parts = line.split()
    if len(parts) < 3:
        return None
    if len(parts) > 3:
        return 'several'

    label = parts[2]
    return label if label else None

#### How many unique labels?

In [8]:
unique_labels = set()
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            unique_labels.add(label)

In [9]:
unique_labels

{'Bridge',
 'Chorus',
 'Instrumental',
 'Intro',
 'Loop',
 None,
 'Outro',
 'Pre-Chorus',
 'Pre-Outro',
 'Solo',
 'Verse',
 'several'}

#### label distribution

In [12]:
label_lst = []
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            label_lst = label_lst + [label]

In [15]:
Counter(label_lst)

Counter({'Chorus': 5411,
         'Verse': 3832,
         'Intro': 1843,
         'several': 1730,
         'Pre-Chorus': 1018,
         'Bridge': 782,
         'Instrumental': 503,
         'Outro': 264,
         'Solo': 185,
         'Pre-Outro': 47,
         None: 11,
         'Loop': 3})

#### Case Study

In [12]:
# None label to be deleted
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            if not label:
                print(fp)

/home/z/下载/打包结果_section/the-prince-karma_later-bitches_WeglMrwlxrY_measure.sec
/home/z/下载/打包结果_section/chvrches_empty-threat_yvmrqq-doOW_measure.sec
/home/z/下载/打包结果_section/yeah-aightt_yeeee_d_gw--JeoGV_measure.sec
/home/z/下载/打包结果_section/ryuichi-sakamoto_solari_zngRBKOpgJj_measure.sec
/home/z/下载/打包结果_section/charlie-puth_girlfriend_nvgyJwe_xkA_measure.sec
/home/z/下载/打包结果_section/guided-by-voices_game-of-pricks_WeglGb--mrY_measure.sec
/home/z/下载/打包结果_section/op-shop_one-day_eWxLGMnKgaK_measure.sec
/home/z/下载/打包结果_section/the-beach-boys_passing-by_zngRJqrpgJj_measure.sec
/home/z/下载/打包结果_section/john-maus_hey-moon_nZgWrABwmry_measure.sec
/home/z/下载/打包结果_section/stone-temple-pilots_trippin-on-a-hole-in-a-paper-heart_nvgy-Ar-gkA_measure.sec
/home/z/下载/打包结果_section/ninja-sex-party_cool-patrol_ZbgOQwqwonY_measure.sec


In [9]:
# several labels, regard as dirty, delete
several_parts_records = set()
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            if label == 'several':
                several_parts_records.add(fp)

len(several_parts_records)

1730

In [14]:
# Lead-In label: 1 but the label is "Lead-In Alt", also hit the several labels rule
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            if label == 'Lead-In':
                print(fp)

In [17]:
# Loop label
unique_labels = set()
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            if label == 'Loop':
                print(fp)
            unique_labels.add(label)

/home/z/下载/打包结果_section/konami_gradius-ii-nes---something-ghostly_nLgaakwEgYp_measure.sec
/home/z/下载/打包结果_section/dung-truong_allusion-world_ZwxKJKYGged_measure.sec
/home/z/下载/打包结果_section/danny-baranowsky_serenity_QLgnWqZYx-V_measure.sec


In [10]:
# Pre-Outro label
unique_labels = set()
for fp in sec_files:
    with fp.open("r", encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            label = extract_label_from_line(line)
            if label == 'Pre-Outro':
                print(fp)
            unique_labels.add(label)

/home/z/下载/打包结果_section/rebecca-sugar_other-friends---steven-universe_AQodR-DqgDl_measure.sec
/home/z/下载/打包结果_section/grouplove_shark-attack__NgbebEpmQA_measure.sec
/home/z/下载/打包结果_section/bart-peeters_allemaal-door-jou_Abm_WLpAmak_measure.sec
/home/z/下载/打包结果_section/wan-wan_yin-yang-shi-serpent_ZOxVrjaGxdq_measure.sec
/home/z/下载/打包结果_section/hirokazu-tanaka_metroid-zero-mission---title__NgbYllOmQA_measure.sec
/home/z/下载/打包结果_section/kikuo_nobore-susume-takai-tou_jDgXRAJnoKl_measure.sec
/home/z/下载/打包结果_section/katy-perry_the-one-that-got-away_YAg-yqRwgle_measure.sec
/home/z/下载/打包结果_section/nadia-ali_pressure_RZwxKpRWoed_measure.sec
/home/z/下载/打包结果_section/mariah-carey_lead-the-way_nvgyyMqZgkA_measure.sec
/home/z/下载/打包结果_section/snails-house_pixel-galaxy_jDgXpdEdgKl_measure.sec
/home/z/下载/打包结果_section/tobuscus_viral-song_ROmNrw_nmNw_measure.sec
/home/z/下载/打包结果_section/rob-cantor_shia-labeouf_DpgvaqRBmad_measure.sec
/home/z/下载/打包结果_section/sega_cyber-track-act-1_ZbgOeGAQonY_measure.sec
/