In [1]:
import glob
import math

# third-party libraries
import pandas as pd

# Show complete frames (all columns, all rows, untruncated cell text) when
# inspecting results in the notebook.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# None means "do not truncate". The former -1 spelling is deprecated since
# pandas 1.0 (it triggers a FutureWarning there) and should not be used.
pd.set_option('display.max_colwidth', None)  # or 199

  pd.set_option('display.max_colwidth', -1)  # or 199


In [2]:
# Collect every "<song folder>\<chart file>" one level below the working directory.
# NOTE: the pattern hard-codes the Windows path separator, and the later
# filename.split('\\') relies on that, so it is kept as is.
# glob returns entries in arbitrary order; sort them so the result rows are
# produced in the same order on every run.
filenames = sorted(glob.glob(r'*\\*'))
filenames

['0026_01\\easy.txt',
 '0026_01\\expert.txt',
 '0026_01\\hard.txt',
 '0026_01\\master.txt',
 '0026_01\\normal.txt',
 '0049_01\\easy #10.txt',
 '0049_01\\expert #6.txt',
 '0049_01\\hard #9.txt',
 '0049_01\\master #11.txt',
 '0049_01\\normal #7.txt',
 '0066_01\\easy #17.txt',
 '0066_01\\expert #12.txt',
 '0066_01\\hard #14.txt',
 '0066_01\\master #15.txt',
 '0066_01\\normal #13.txt']

In [3]:
def is_valid_note(line):
    """Return True when ``line`` is non-empty and made only of note characters.

    Allowed characters are digits, ``#``, ``:`` and the lowercase hex letters
    ``a``-``f``. Anything else (spaces, uppercase letters, quotes, ...) makes
    the line invalid, as does an empty line.
    """
    return bool(line) and all(c.isdigit() or c in '#:abcdef' for c in line)

In [4]:
def parse_notes(line):
    """Parse one note line of the form ``<marker><header>:<data>`` into note dicts.

    The first character of ``line`` is dropped and the rest is split on ``:``.

    Header layout:
      * chars 0-2: measure number (decimal)
      * char 3:    note class (decimal digit)
      * char 4:    start position (hex digit)
      * char 5:    hold id (hex digit); only read when the header is exactly
                   six characters long, otherwise ``hold_id`` is ``-1``

    The data part is read as consecutive two-character pairs: a property
    (decimal digit) followed by a width (hex digit). A pair whose property and
    width are both zero is skipped. A trailing unpaired character is ignored.

    Returns:
        A list of dicts with the keys ``measure``, ``note_class``,
        ``start_pos``, ``property``, ``width``, ``scaling`` (number of pairs in
        the line), ``timestamp`` (index of the pair) and ``hold_id``.

    Raises:
        ValueError: if the line does not contain exactly one ``:`` or a field
            is not a valid number in its base.
        IndexError: if the header has fewer than five characters.
    """
    notes = []

    part_a, part_b = line[1:].split(':')

    # part_a parser
    measure = int(part_a[:3])
    note_class = int(part_a[3])
    start_pos = int(part_a[4], 16)
    if len(part_a) == 6:
        # The notes during a holding period carry a hold id. The line validator
        # admits the letters a-f, so read the id as a hex digit (like
        # start_pos) instead of failing on ids above 9. Decimal ids are
        # unaffected.
        hold_id = int(part_a[5], 16)
    else:
        hold_id = -1

    # part_b parser
    note_scaling = len(part_b) // 2
    for i in range(note_scaling):
        note_property = int(part_b[i * 2])
        note_width = int(part_b[i * 2 + 1], 16)
        if note_property == note_width == 0:
            # empty slot
            continue
        notes.append({
            'measure': measure,
            'note_class': note_class,
            'start_pos': start_pos,
            'property': note_property,
            'width': note_width,
            'scaling': note_scaling,
            'timestamp': i,
            'hold_id': hold_id
        })

    return notes

In [5]:
def filename_to_notes(filename):
    """Read a chart file and return the notes parsed from its note lines.

    Every line is stripped of surrounding whitespace; lines accepted by
    ``is_valid_note`` are handed to ``parse_notes`` and the resulting note
    dicts are collected, in file order, into a single list.

    Args:
        filename: path of the chart file to read.

    Returns:
        A list with the note dicts of all valid lines (empty if there are none).
    """
    notes = []
    # Note lines consist of ASCII characters only, so the file is decoded as
    # UTF-8 with undecodable bytes replaced. Non-ASCII metadata lines can then
    # no longer abort the read with a UnicodeDecodeError under whatever the
    # platform default encoding happens to be; they are rejected by
    # is_valid_note anyway.
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        for raw_line in f:
            line = raw_line.strip()
            if is_valid_note(line):
                notes.extend(parse_notes(line))
    return notes

In [6]:
def discard_duplicated_notes(df):
    """Collapse notes that occupy the same time, position and width into one.

    Two helper columns are added to the returned frame:
      * ``real_timestamp``: ``timestamp / scaling``, i.e. the position of the
        note inside its measure as a fraction of the full measure.
      * ``priority``: ``-abs(note_class - 3.5)``, which ranks the note classes
        as 3 > 5 > 1.

    Among notes sharing ``measure``, ``real_timestamp``, ``start_pos`` and
    ``width`` only the one with the highest priority is kept; ties between
    notes of equal priority are resolved in favour of the later row.

    Args:
        df: note frame with at least the columns ``timestamp``, ``scaling``,
            ``note_class``, ``measure``, ``start_pos`` and ``width``.

    Returns:
        A new frame (the input is left untouched) without the duplicates, in
        the original row order, with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # to get the ratio of full measure
    df['real_timestamp'] = df['timestamp'] / df['scaling']

    # A workaround to make the priority of note class is 3 > 5 > 1 ....
    df['priority'] = (df['note_class'] - 3.5).abs() * (-1)

    # The priority only has an effect if the rows are ordered by it before
    # keep='last' picks the survivor. A stable sort keeps the original order
    # among rows of equal priority.
    position_key = ['measure', 'real_timestamp', 'start_pos', 'width']
    df = (
        df.sort_values('priority', kind='mergesort')
          .drop_duplicates(subset=position_key, keep='last')
          .sort_index()  # back to the original row order
          .reset_index(drop=True)
    )
    return df

In [7]:
def discard_skill_notes(df):
    """Return ``df`` without its skill notes.

    A skill note is a row with note_class == 1, start_pos == 0, property == 4
    and width == 1 (written as #xxx10:41 in the *.sus file). The number of
    skill notes found is not checked.
    """
    is_skill_note = (
        df['note_class'].eq(1)
        & df['start_pos'].eq(0)
        & df['property'].eq(4)
        & df['width'].eq(1)
    )
    return df[~is_skill_note]

In [8]:
def discard_fever_indicator_notes(df):
    """Return ``df`` without its fever indicator notes.

    A fever indicator is a row with note_class == 1, start_pos == 15,
    width == 1 and property 1 or 2 (#xxx1f:11 marks the start of the fever
    period and #xxx1f:21 its end in the *.sus file).

    Raises:
        AssertionError: if the frame does not contain exactly two such rows.
    """
    is_fever_indicator = (
        df['note_class'].eq(1)
        & df['start_pos'].eq(15)
        & df['property'].isin([1, 2])
        & df['width'].eq(1)
    )

    # exactly one start marker and one end marker are expected
    fever_rows = df[is_fever_indicator]
    assert len(fever_rows) == 2

    return df[~is_fever_indicator]

In [9]:
def holding_notes_to_holding_combos(real_holding_notes):
    """Count the combo ticks produced while holds are being held.

    Rows with property == 3 are ignored. The remaining rows are walked in
    order of (measure, real_timestamp, hold_id, property): a row with
    property == 1 opens a hold for its hold_id, any other row closes it.
    Positions are converted to a grid of eight ticks per measure, and every
    closed hold contributes the number of grid ticks lying strictly between
    its start and its end.

    Args:
        real_holding_notes: frame with the columns ``measure``,
            ``real_timestamp``, ``hold_id`` and ``property``.

    Returns:
        The total tick count over all holds as an int.

    Raises:
        KeyError: if a hold is closed whose hold_id was never opened.
        ValueError: if a hold is closed again without being reopened.
    """
    endpoints = (
        real_holding_notes
        .query('property != 3')
        .sort_values(['measure', 'real_timestamp', 'hold_id', 'property'])
        .reset_index(drop=True)
    )

    open_holds = {}
    total_ticks = 0
    for _, note in endpoints.iterrows():
        channel = note['hold_id']
        position = (note['measure'], note['real_timestamp'])

        if note['property'] == 1:
            # start of a hold: remember where it began
            open_holds[channel] = position
            continue

        # end of a hold: pair it with the remembered start
        begin_measure, begin_fraction = open_holds[channel]
        begin_tick = (begin_measure + begin_fraction) * 8
        end_tick = (position[0] + position[1]) * 8

        # Grid ticks strictly inside (begin_tick, end_tick): the first one is
        # floor(begin_tick) + 1 and the last one is ceil(end_tick) - 1.
        total_ticks += math.ceil(end_tick) - math.floor(begin_tick) - 1

        # mark the hold as closed
        open_holds[channel] = []

    return total_ticks

In [10]:
def dataframe_to_combos(df):
    """Compute the tap combo and the holding combo of one chart.

    The frame is first passed through discard_duplicated_notes,
    discard_skill_notes and discard_fever_indicator_notes, in that order.

    Rows with hold_id != -1 belong to a hold (#xxx3xx:xx in the file, six
    characters before ':' with the last one being the hold id). Of those, rows
    with property == 5 only shape the zig-zag hold line and are not counted.

    Returns:
        A tuple ``(tap_combo, holding_combo)``: ``tap_combo`` is the number of
        rows with hold_id == -1 plus the number of counted hold rows, and
        ``holding_combo`` is the result of holding_notes_to_holding_combos for
        the counted hold rows.
    """
    cleanup_steps = (
        discard_duplicated_notes,
        discard_skill_notes,
        discard_fever_indicator_notes,
    )
    for cleanup in cleanup_steps:
        df = cleanup(df)

    # hold rows, minus the zig-zag shaping rows
    belongs_to_hold = df['hold_id'] != -1
    counted_hold_notes = df[belongs_to_hold].query('property != 5')

    # everything that is not part of a hold
    plain_note_count = len(df.query('hold_id == -1'))

    tap_combo = plain_note_count + len(counted_hold_notes)
    holding_combo = holding_notes_to_holding_combos(counted_hold_notes)
    return tap_combo, holding_combo

In [11]:
def parse_difficulty(difficulty):
    """Return the difficulty name that ``difficulty`` starts with.

    The known names are tried in the order easy, normal, hard, expert, master;
    ``None`` is returned when the string starts with none of them.
    """
    known_levels = ('easy', 'normal', 'hard', 'expert', 'master')
    return next(
        (level for level in known_levels if difficulty.startswith(level)),
        None,
    )

In [16]:
# Build one result record per chart file.
combos = []
for filename in filenames:
    
    # Paths look like '<folder>\<file>'; this split relies on the backslash
    # separator returned by the glob call above.
    song_id, difficulty = filename.split('\\')
    # Drop the last three characters of the folder name (e.g. '_01' in
    # '0026_01') and read the rest as the numeric song id.
    song_id = int(song_id[:-3])
    # The file name starts with the difficulty name, e.g. 'easy #10.txt'.
    difficulty = parse_difficulty(difficulty)
    
    # Parse the chart and count its combos.
    notes = filename_to_notes(filename)
    df = pd.DataFrame(notes)
    tap_combo, holding_combo = dataframe_to_combos(df)
    combos.append({
        'song_id': song_id,
        'difficulty': difficulty,
        'tap_combo': tap_combo,
        'holding_combo': holding_combo,
        'total_combo': tap_combo + holding_combo
    })

In [17]:
# Turn the collected records into a table and display it.
# NOTE: this rebinds `combos` from the list of records to a DataFrame.
combos = pd.DataFrame(combos)
combos

Unnamed: 0,song_id,difficulty,tap_combo,holding_combo,total_combo
0,26,easy,78,70,148
1,26,expert,672,153,825
2,26,hard,386,134,520
3,26,master,847,105,952
4,26,normal,258,113,371
5,49,easy,183,193,376
6,49,expert,1218,284,1502
7,49,hard,870,246,1116
8,49,master,1463,203,1666
9,49,normal,496,255,751
