In [1]:
import pandas as pd
import pickle
import re
import ast
from collections import defaultdict, Counter

pd.set_option('display.max_rows', 200)

In [2]:
# Load the CSV file into a DataFrame
file_path = 'UltimateGuitarTabs_chords.csv'
df = pd.read_csv(file_path)

df = df.drop(columns=['id', 'tonality', 'capo'])

In [3]:
# Create a new column to hold the parsed chords as lists
df['parsed_chords'] = df['chords'].apply(lambda x: x.split(','))

In [4]:
# Function to find repeating patterns with variable window size and immediate repetition constraint
def find_immediate_repeating_patterns(chords, min_window_size=3, max_window_size=10):
    # Initialize a list to hold the identified repeating patterns
    repeating_patterns = []
    
    # Iterate through different window sizes starting from the largest
    for window_size in range(max_window_size, min_window_size - 1, -1):
        i = 0
        while i < len(chords) - window_size + 1:
            # Extract a subsequence using the current window size
            subsequence = tuple(chords[i:i + window_size])
            
            # Check for immediate repetition
            if chords[i:i + window_size] == chords[i + window_size:i + 2 * window_size]:
                if subsequence not in repeating_patterns:
                    repeating_patterns.append(subsequence)
                
                # Skip the repeated subsequences to avoid redundancy
                i += window_size
            else:
                i += 1
    
    return repeating_patterns

# Identify immediate repeating patterns for each song using the simplified parsed chords
# Start with a max window size of 10 and iterate down to a min window size of 3
df['chord_progressions'] = df['parsed_chords'].apply(
    lambda x: find_immediate_repeating_patterns(x, min_window_size=3, max_window_size=10)
)

# Function to expand DataFrame rows based on immediate repeating patterns
def expand_rows_based_on_patterns(row):
    expanded_rows = []
    for pattern in row['chord_progressions']:
        new_row = row.copy()
        new_row['chord_progression'] = list(pattern)
        expanded_rows.append(new_row)
    return expanded_rows

# Expand DataFrame to create a new row for each identified chord progression within each song
expanded_data_list = []
for _, row in df.iterrows():
    expanded_rows = expand_rows_based_on_patterns(row)
    if expanded_rows:
        expanded_data_list.append(pd.DataFrame(expanded_rows))

# Concatenate all the DataFrames in the list into a single DataFrame
df = pd.concat(expanded_data_list, ignore_index=True)


In [5]:
# loop through and replace duplicated progressions with half until no duplicates are left
for _ in range(100):
    counter = 0
    for i, row in df.iterrows():
        chords = row['chord_progression']
        if len(chords) == 1:
            continue
        if chords[:len(chords)//2] == chords[len(chords)//2:]:
            df.at[i, 'chord_progression'] = chords[:len(chords)//2]
            counter += 1
    if counter == 0:
        break

In [6]:
# update progressions to remove repeated chords
def remove_repeated_chords(progression):
    clean_progression = [progression[0]]
    for chord in progression[1:]:
        if chord != clean_progression[-1]:
            clean_progression.append(chord)
    return clean_progression

df['chord_progression'] = df['chord_progression'].apply(lambda row: remove_repeated_chords(row))

In [7]:
# # NOTE: also need dim7 converted to minor for simplified progressions
# df[df.chord_progression.apply(lambda x: 'dim' in str(x))][['song', 'artist', 'chord_progression', 'simplified_chord_progression']]

In [8]:
# Function to simplify chords to only include the root note and minor indication
def simplify_chord(chord):
    # Use regular expression to extract the root and minor indication
    match = re.match(r"([A-Ga-g])([#b])?(m)?([^/]*)?", chord)
    if match.group(4):
        if match.group(4) == 'dim' or match.group(4) == 'dim7':
            return match.group(1) + 'm'
    if match:
        return match.group(1) + (match.group(2) if match.group(2) else "") + (match.group(3) if match.group(3) else "")
    return chord

# Simplify chords and parse the chord sequences for each song again
df['simplified_chord_progression'] = df['chord_progression'].apply(lambda chords: [simplify_chord(chord) for chord in chords])

In [9]:
# Define dictionaries for major and minor keys and their associated chords
# treat 'dim' chords as 'm' chords for simplicity
key_signature = {
    'C': ['C', 'Dm', 'Em', 'F', 'G', 'Am', 'Bm'],
    'G': ['G', 'Am', 'Bm', 'C', 'D', 'Em', 'F#m'],
    'D': ['D', 'Em', 'F#m', 'G', 'A', 'Bm', 'C#m'],
    'A': ['A', 'Bm', 'C#m', 'D', 'E', 'F#m', 'G#m'],
    'E': ['E', 'F#m', 'G#m', 'A', 'B', 'C#m', 'D#m'],
    'B': ['B', 'C#m', 'D#m', 'E', 'F#', 'G#m', 'A#m'],
    'F#': ['F#', 'G#m', 'A#m', 'B', 'C#', 'D#m', 'E#m'],
    'Db': ['Db', 'Ebm', 'Fm', 'Gb', 'Ab', 'Bbm', 'Cm'],
    'Ab': ['Ab', 'Bbm', 'Cm', 'Db', 'Eb', 'Fm', 'Gm'],
    'Eb': ['Eb', 'Fm', 'Gm', 'Ab', 'Bb', 'Cm', 'Dm'],
    'Bb': ['Bb', 'Cm', 'Dm', 'Eb', 'F', 'Gm', 'Am'],
    'F': ['F', 'Gm', 'Am', 'Bb', 'C', 'Dm', 'Em'],
    'Am': ['Am', 'Bm', 'C', 'Dm', 'E', 'F', 'G#m'],
    'Em': ['Em', 'F#m', 'G', 'Am', 'B', 'C', 'D#m'],
    'Bm': ['Bm', 'C#m', 'D', 'Em', 'F#', 'G', 'A#m'],
    'F#m': ['F#m', 'G#m', 'A', 'Bm', 'C#', 'D', 'Fm'],
    'C#m': ['C#m', 'D#m', 'E', 'F#m', 'G#', 'A', 'B#m'],
    'G#m': ['G#m', 'A#m', 'B', 'C#m', 'D#', 'E', 'Gm'],
    'D#m': ['D#m', 'E#m', 'F#', 'G#m', 'A#', 'B', 'Dm'],
    'A#m': ['A#m', 'B#m', 'C#', 'D#m', 'E#', 'F#', 'Am'],
    'Dm': ['Dm', 'Em', 'F', 'Gm', 'A', 'Bb', 'C#m'],
    'Gm': ['Gm', 'Am', 'Bb', 'Cm', 'D', 'Eb', 'F#m'],
    'Cm': ['Cm', 'Dm', 'Eb', 'Fm', 'G', 'Ab', 'Bm']
}

def identify_key(chord_progression):
    
    # Count the occurrences of each chord
    chord_counts = Counter(chord_progression)
    
    # Initialize variables to store the most likely major and minor keys
    likely_key = None
    progression_length = len(chord_progression)
    
    # Check for exact match
    for key, chords in key_signature.items():
        if sum(chord_counts[chord] for chord in chords if chord in chord_counts) == progression_length:
            if likely_key:
                # prioritise first chord in progression where multiple eligible keys exist
                likely_key = key if key==chord_progression[0] else likely_key 
            else:
                likely_key = key
            
    # Return a key if there is an exact match
    if likely_key:
        return likely_key
    return 'Unknown'

# Apply the updated identify_key function to each chord progression in the DataFrame
df['key'] = df['simplified_chord_progression'].apply(identify_key)

In [10]:
# number of unknown keys
df[df.key == "Unknown"].shape[0]

3069

In [11]:
# clean up chord progression
df = df[['song', 'artist', 'chord_progression', 'key']]
df = df[df.key != "Unknown"].reset_index(drop=True)

In [12]:
## Transpose and clean the name of the chord

def clean_chordname(chord):
    return chord.replace('(b5)', 'flat5').replace('(', '').replace(')', '')

# Function to transpose a chord to the key of C, preserving any modifiers like 'm', 'sus4', etc.
def transpose_chord_to_C(chord, original_key):
    # Use regular expression to find the root note and any modifiers in the chord
    # also need to transpose any notes found after a / character (called extra in this function)
    match = re.match(r"([A-Ga-g])([#b])?(m)?([^/]*)?(?:/)?([A-Ga-g])?([#b])?(m)?([^/]*)?", chord)
    
    root_note, accidental, minor, modifiers, extra_note, extra_accidental, extra_minor, extra_modifiers = match.groups()
    
    # treat 'dim' chords as 'm' chords for simplicity
    root_diminished, extra_diminished = False, False
    if modifiers:
        root_diminished = True if modifiers=='dim' or modifiers=='dim7' else False
    if extra_modifiers:
        extra_diminished = True if extra_modifiers=='dim' or extra_modifiers=='dim7' else False
    
    if not match:
        return 'ERROR'
    
    root_chord = root_note
    if accidental:
        root_chord += accidental
    if minor or root_diminished:  
        root_chord += 'm'
    if extra_note:
        extra_chord = extra_note
        if extra_accidental:
            extra_chord += extra_accidental
        if extra_minor or extra_diminished:
            extra_chord += 'm'
    
    # Get the list of chords for the original key and the key of C or Am, as required
    original_key_chords = key_signature.get(original_key, [])
    key_chords = key_signature['C'] if original_key[-1]!='m' else key_signature['Am']
    
    try:
        # Find the index of the root chord in the original key
        index = original_key_chords.index(root_chord)
        # Find the corresponding chord in the key of C
        transposed_root_chord = key_chords[index]
        try:
            if extra_note:
                extra_index = original_key_chords.index(extra_chord)
                transposed_extra_chord = key_chords[extra_index]
        except ValueError:
            # If the extra chord is not found in the original_key_chords, just keep the root_chord
            return transposed_root_chord + modifiers
    except ValueError:
        print('ERROR:', chord, original_key)
        return 'ERROR'
    
    # Re-append any modifiers to the transposed chord, bring back dim where required
    transposed_chord = chord.replace(root_chord, transposed_root_chord, 1)
    if extra_note:
        transposed_chord = transposed_chord.replace(extra_chord, transposed_extra_chord, 1)
    
    # remove the minor used in place of the diminished
    if root_diminished: 
        transposed_chord = transposed_chord.replace(transposed_root_chord, transposed_root_chord[:-1], 1)
    if extra_diminished:
        transposed_chord = transposed_chord.replace(transposed_extra_chord, transposed_extra_chord[:-1], 1)
    
    return clean_chordname(transposed_chord)

# Function to transpose an entire chord progression to the key of C
def transpose_progression_to_C(chord_progression, original_key):
    return [transpose_chord_to_C(chord, original_key) for chord in chord_progression]

# Create a new column 'chord_progression_C_v2' that contains the improved transposed chord progressions
df['chord_progression_C'] = df.apply(lambda row: transpose_progression_to_C(row['chord_progression'], row['key']), axis=1)


ERROR: Bbdim C
ERROR: Bbdim C
ERROR: G#dim7 Eb
ERROR: Bbdim G
ERROR: Bbdim7 A
ERROR: Bbdim G
ERROR: Bbdim G


In [13]:
# inspect progressions with chords containing errors
df[df.chord_progression_C.astype(str).str.contains('ERROR')]

Unnamed: 0,song,artist,chord_progression,key,chord_progression_C
3709,You Are The Sunshine Of My Life,Stevie Wonder,"[C, F6, Em7, Bbdim, Dm7, G7, C, Dm7, G7]",C,"[C, F6, Em7, ERROR, Dm7, G7, C, Dm7, G7]"
3710,You Are The Sunshine Of My Life,Stevie Wonder,"[G7, C, F6, Em7, Bbdim, Dm7, G7, C, Dm7]",C,"[G7, C, F6, Em7, ERROR, Dm7, G7, C, Dm7]"
6178,A Little Piece Of Heaven,Avenged Sevenfold,"[Dm, Fm, G#dim7]",Eb,"[Bm, Dm, ERROR]"
6191,Lying Is The Most Fun A Girl Can Have Without ...,Panic! At the Disco,"[G, Bm, Bbdim]",G,"[C, Em, ERROR]"
10746,Friends In Low Places,Garth Brooks,"[A, Bbdim7, Bm, E]",A,"[C, ERROR, Dm, G]"
12409,Mirrors,Justin Timberlake,"[Bm, F#m, Em, G, Bm, F#m, Em, Bbdim]",G,"[Em, Bm, Am, C, Em, Bm, Am, ERROR]"
12410,Mirrors,Justin Timberlake,"[Bbdim, Bm, F#m, Em, G, Bm, F#m, Em]",G,"[ERROR, Em, Bm, Am, C, Em, Bm, Am]"


In [14]:
# remove progressions with chords containing errors - they cann't be transposed correctly, as they are not in the noted key
df = df[~df.chord_progression_C.astype(str).str.contains('ERROR')].reset_index(drop=True)

# backup clean chord progressions to pickle
df.to_pickle('df_clean.pickle')

In [27]:
df[['song', 'artist', 'chord_progression_C']].sample(22)

Unnamed: 0,song,artist,chord_progression_C
9066,Forever Reign,Hillsong Worship,"[Am, G, F]"
14456,No Longer Slaves,Bethel Music,"[Am, G, C, F]"
15452,Treat You Better,Shawn Mendes,"[Am, G, F, C]"
970,Summer Wine,Nancy Sinatra & Lee Hazlewood,"[Am, G, Am, G, Dm, Am, Dm, Am, Dm]"
7729,Valerie,Amy Winehouse,"[C, Dm, C, Dm, C, Dm]"
13454,Far From Any Road,The Handsome Family,[Am]
6846,Toes,Zac Brown Band,"[C, F, C, G, C]"
9567,Feel Good Inc,Gorillaz,"[Am, G, Dm, Em]"
59,Fool Again,Westlife,"[C, G, F, G, C]"
3530,Hallelujah,Jeff Buckley,"[C, Am]"
