In [1]:
import pandas as pd
import re
import ast
from collections import defaultdict, Counter

In [2]:
# Read the Ultimate Guitar chords export and discard the columns
# ('id', 'tonality', 'capo') that the rest of the notebook does not use.
file_path = 'UltimateGuitarTabs_chords.csv'
df = pd.read_csv(file_path).drop(columns=['id', 'tonality', 'capo'])

In [3]:
# Turn each song's comma-separated 'chords' string into a list of chord names,
# stored in a new 'parsed_chords' column.
def _split_chords(chord_string):
    """Split one comma-separated chord string into a list of chord names."""
    return chord_string.split(',')

df['parsed_chords'] = df['chords'].apply(_split_chords)

In [4]:
def find_immediate_repeating_patterns(chords, min_window_size=3, max_window_size=10):
    """Find chord subsequences that are immediately followed by a copy of themselves.

    Window sizes are tried from ``max_window_size`` down to ``min_window_size``
    (both inclusive). For each size the sequence is scanned left to right: when
    the window at the current position equals the window right after it, the
    pattern is recorded (once) and the scan jumps past the first copy;
    otherwise it advances by a single chord.

    Args:
        chords: Sliceable sequence of chord names (a list in this notebook).
        min_window_size: Smallest pattern length to search for. Values below 1
            are treated as 1, because a zero-length window never advances the
            scan position and would loop forever.
        max_window_size: Largest pattern length to search for.

    Returns:
        A list of tuples, one per distinct repeating pattern, in order of
        discovery (longer windows first). Empty if nothing repeats.
    """
    repeating_patterns = []
    total = len(chords)

    # Guard against non-positive window sizes (see docstring).
    smallest_window = max(min_window_size, 1)

    for window_size in range(max_window_size, smallest_window - 1, -1):
        # A repeat needs room for two full windows, so later start positions
        # can never match; stopping here gives the same result with less work.
        last_start = total - 2 * window_size
        i = 0
        while i <= last_start:
            window = chords[i:i + window_size]
            if window == chords[i + window_size:i + 2 * window_size]:
                pattern = tuple(window)
                if pattern not in repeating_patterns:
                    repeating_patterns.append(pattern)
                # Skip the first copy so overlapping repeats are not re-reported.
                i += window_size
            else:
                i += 1

    return repeating_patterns

# For every song, collect the immediately repeating patterns found in its
# parsed chords, trying window sizes from 10 down to 3.
df['chord_progressions'] = df['parsed_chords'].apply(
    find_immediate_repeating_patterns,
    min_window_size=3,
    max_window_size=10,
)

def expand_rows_based_on_patterns(row):
    """Return one copy of ``row`` per pattern in ``row['chord_progressions']``.

    Each copy gets a 'chord_progression' entry holding that pattern as a list.
    The input row itself is not modified. An empty pattern collection yields
    an empty list.
    """
    def with_progression(pattern):
        # Work on a copy so the caller's row stays untouched.
        duplicate = row.copy()
        duplicate['chord_progression'] = list(pattern)
        return duplicate

    return [with_progression(pattern) for pattern in row['chord_progressions']]

# Build one small DataFrame per song (one row per identified progression),
# skipping songs without any repeating pattern...
expanded_data_list = [
    pd.DataFrame(song_rows)
    for song_rows in (expand_rows_based_on_patterns(song) for _, song in df.iterrows())
    if song_rows
]

# ...then stack them into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(expanded_data_list, ignore_index=True)


In [5]:
# Collapse progressions that are just the same half played twice
# (e.g. [C, G, C, G] -> [C, G]). Passes repeat until one changes nothing,
# with a hard cap of 100 passes.
for _ in range(100):
    halved_this_pass = 0
    for row_label, progression in zip(df.index, df['chord_progression']):
        half = len(progression) // 2
        # A single chord cannot be halved; for odd lengths the two slices
        # differ in length and therefore never compare equal.
        if len(progression) != 1 and progression[:half] == progression[half:]:
            df.at[row_label, 'chord_progression'] = progression[:half]
            halved_this_pass += 1
    if not halved_this_pass:
        break

In [6]:
def simplify_chord(chord):
    """Reduce a chord name to root note, optional accidental and optional 'm'.

    Everything after that prefix (7, sus4, slash bass, ...) is dropped.
    If the text does not start with a note letter A-G (either case), the
    chord is returned unchanged.

    NOTE(review): the optional 'm' also matches the first letter of 'maj',
    so e.g. 'Cmaj7' simplifies to 'Cm'. transpose_chord_to_C parses chords
    with the same prefix pattern, so the two must be changed together.
    """
    parsed = re.match(r"([A-Ga-g])([#b])?(m)?", chord)
    if parsed is None:
        return chord
    # Unmatched optional groups come back as None; keep only the matched parts.
    return "".join(part for part in parsed.groups() if part)

# Add a 'simplified_chord_progression' column: every chord of every
# progression reduced by simplify_chord.
def _simplify_all(progression):
    """Apply simplify_chord to each chord of one progression."""
    return [simplify_chord(chord) for chord in progression]

df['simplified_chord_progression'] = df['chord_progression'].apply(_simplify_all)

In [7]:
# Chords associated with each major key, listed by scale degree (I..VII).
# The position in each list is what matters: transposition maps a chord to
# the chord at the same position in the 'C' entry.
# Note that the seventh degree is listed as a plain minor chord here.
major_keys = {
    key_name: degrees.split()
    for key_name, degrees in [
        ('C',  'C Dm Em F G Am Bm'),
        ('G',  'G Am Bm C D Em F#m'),
        ('D',  'D Em F#m G A Bm C#m'),
        ('A',  'A Bm C#m D E F#m G#m'),
        ('E',  'E F#m G#m A B C#m D#m'),
        ('B',  'B C#m D#m E F# G#m A#m'),
        ('F#', 'F# G#m A#m B C# D#m E#m'),
        ('Db', 'Db Ebm Fm Gb Ab Bbm Cm'),
        ('Ab', 'Ab Bbm Cm Db Eb Fm Gm'),
        ('Eb', 'Eb Fm Gm Ab Bb Cm Dm'),
        ('Bb', 'Bb Cm Dm Eb F Gm Am'),
        ('F',  'F Gm Am Bb C Dm Em'),
    ]
}

def identify_key(chord_progression):
    """Return the major key whose chord list contains every chord of the progression.

    A key matches only if all chords of the progression appear in that key's
    entry of ``major_keys``. When several keys match, the key whose name
    equals the first chord of the progression is preferred; otherwise the
    first matching key (in ``major_keys`` order) is kept.

    Args:
        chord_progression: Sequence of simplified chord names.

    Returns:
        The key name, or 'Unknown' if no key matches or the progression is
        empty.
    """
    # An empty progression would trivially "match" every key (0 == 0) and then
    # fail on chord_progression[0]; there is nothing to identify, so bail out.
    if len(chord_progression) == 0:
        return 'Unknown'

    chord_counts = Counter(chord_progression)
    progression_length = len(chord_progression)
    first_chord = chord_progression[0]

    likely_key = None
    for key, chords in major_keys.items():
        # Counter returns 0 for chords that do not occur, so the sum equals the
        # progression length only when every chord belongs to this key.
        if sum(chord_counts[chord] for chord in chords) != progression_length:
            continue
        # Tie-break: a later match replaces the current one only when it is
        # named after the progression's first chord.
        if likely_key is None or key == first_chord:
            likely_key = key

    return likely_key if likely_key is not None else 'Unknown'

# Label every progression with the key identified from its simplified chords
# ('Unknown' when no key fits).
df['key'] = [
    identify_key(progression)
    for progression in df['simplified_chord_progression']
]

In [8]:
# Keep only the columns needed downstream and drop progressions whose key
# could not be identified.
df_clean = df[['song', 'artist', 'chord_progression', 'key']]
df_clean = df_clean[df_clean.key != "Unknown"].reset_index(drop=True)

# Back up the clean chord progressions to CSV.
# Raw string: the Windows path contains backslashes ('\C', '\P', '\d') that are
# invalid escape sequences in a normal string literal. The path value itself
# is unchanged.
# NOTE(review): this is a machine-specific absolute path - consider a
# configurable output directory.
df_clean.to_csv(r'D:\Code\Python (D)\Chord Progressions\df_clean.csv')

In [9]:
## Transpose and clean the name of the chord

def clean_chordname(chord):
    """Normalise a chord name: '(b5)' becomes 'flat5', other parentheses are removed."""
    # Order matters: '(b5)' must be rewritten before bare parentheses are stripped.
    for old, new in (('(b5)', 'flat5'), ('(', ''), (')', '')):
        chord = chord.replace(old, new)
    return chord

def transpose_chord_to_C(chord, original_key):
    """Transpose one chord from ``original_key`` to the key of C.

    The chord is parsed as root (note, optional accidental, optional 'm'),
    modifiers (everything up to a '/'), and an optional slash bass (note,
    optional accidental, optional 'm'). Root and bass are each looked up in
    ``major_keys[original_key]`` and replaced by the chord at the same
    position in ``major_keys['C']``; modifiers are kept verbatim. The result
    is passed through ``clean_chordname``.

    Args:
        chord: Chord name, e.g. 'Em7' or 'D/F#'.
        original_key: Key name used to index ``major_keys``.

    Returns:
        The transposed chord name. Special cases:
        * text that does not start with a note letter -> 'ERROR';
        * root not found in the key (or unknown key) -> the root is replaced
          by 'ERROR' and the chord is printed;
        * bass not found in the key -> the bass is dropped and only
          root + modifiers are returned.

    NOTE(review): the optional 'm' also matches the first letter of 'maj';
    simplify_chord uses the same prefix pattern and key detection relies on
    both agreeing, so change them together.
    NOTE(review): the bass is looked up among the key's chords, so a bare bass
    note on a degree listed as minor (e.g. 'B' in G) is not found and is
    dropped.
    """
    match = re.match(r"([A-Ga-g])([#b])?(m)?([^/]*)?(?:/)?([A-Ga-g])?([#b])?(m)?", chord)

    # Check the match before unpacking: a failed match is None and has no groups.
    if not match:
        return 'ERROR'

    root_note, accidental, minor, modifiers, extra_note, extra_accidental, extra_minor = match.groups()

    root_chord = root_note + (accidental or '') + (minor or '')
    # The pattern is anchored at the start, so the root occupies chord[:root_end].
    root_end = len(root_chord)

    extra_chord = None
    if extra_note:
        extra_chord = extra_note + (extra_accidental or '') + (extra_minor or '')

    # Get the list of chords for the original key and the key of C
    original_key_chords = major_keys.get(original_key, [])
    c_key_chords = major_keys['C']

    try:
        transposed_root_chord = c_key_chords[original_key_chords.index(root_chord)]
    except ValueError:
        print('ERROR:', chord)
        # Root is not part of the key: mark it, but keep processing the rest.
        transposed_root_chord = 'ERROR'

    if extra_chord is None:
        # No slash bass: swap the root and keep everything after it untouched.
        return clean_chordname(transposed_root_chord + chord[root_end:])

    try:
        transposed_extra_chord = c_key_chords[original_key_chords.index(extra_chord)]
    except ValueError:
        # Bass is not part of the key: keep only the root and its modifiers.
        return clean_chordname(transposed_root_chord + modifiers)

    # Splice by position instead of str.replace, which could hit the already
    # transposed root (e.g. 'D/G' in G became 'C/G' instead of 'G/C').
    bass_start = match.start(5)
    bass_end = bass_start + len(extra_chord)
    transposed_chord = (
        transposed_root_chord
        + chord[root_end:bass_start]
        + transposed_extra_chord
        + chord[bass_end:]
    )
    return clean_chordname(transposed_chord)

def transpose_progression_to_C(chord_progression, original_key):
    """Transpose every chord of a progression from ``original_key`` to C.

    Returns a new list with one transposed chord per input chord, in order.
    """
    transposed = []
    for chord in chord_progression:
        transposed.append(transpose_chord_to_C(chord, original_key))
    return transposed

# Add a 'chord_progression_C' column holding each progression transposed
# from its identified key to the key of C.
def _row_to_C(row):
    """Transpose the progression of one df_clean row using that row's key."""
    return transpose_progression_to_C(row['chord_progression'], row['key'])

df_clean['chord_progression_C'] = df_clean.apply(_row_to_C, axis=1)


In [10]:
# Persist df_clean (including the transposed progressions) as a pickle
# in the working directory.
backup_path = 'df_clean.pickle'
df_clean.to_pickle(backup_path)

In [11]:
# Save an upsampled pickle: `how_many_rows` progressions drawn with replacement
# from the transposed column, stored as a Series named 'chords'.
how_many_rows = 100_000
(
    df_clean['chord_progression_C']
    # A scalar argument renames the Series itself; a dict would be applied to
    # the index labels and leave the name untouched.
    .rename('chords')
    # Fixed seed so that re-running the notebook writes the same sample.
    .sample(n=how_many_rows, replace=True, random_state=42)
    .to_pickle('chord_progressions.pickle')
)

In [12]:
# Count distinct progressions (compared via their string form) against the
# total number of rows.
unique_progressions = df_clean['chord_progression_C'].astype(str).nunique()
total_progressions = len(df_clean)
f"{unique_progressions} unique chord progressions out of {total_progressions} total chord progressions"

'5022 unique chord progressions out of 17000 total chord progressions'

In [13]:
# The 20 most frequent progressions, counted on their string form.
progression_counts = df_clean['chord_progression_C'].astype(str).value_counts()
progression_counts.head(20)

chord_progression_C
['Am', 'F', 'C', 'G']    422
['C', 'G', 'Am', 'F']    417
['F', 'C', 'G']          286
['F', 'C', 'G', 'Am']    243
['F', 'G', 'C']          186
['G', 'F', 'C']          184
['G', 'Am', 'F', 'C']    167
['C', 'G', 'F']          166
['F', 'G', 'Am']         153
['Am', 'G', 'F']         146
['C', 'F']               140
['C', 'F', 'G']          137
['Am', 'F', 'G']         134
['C', 'Am', 'F', 'G']    125
['C', 'G']               109
['F', 'Am', 'G']         109
['C', 'C', 'C']          102
['G', 'C', 'F']           99
['Em', 'C', 'G']          94
['Dm', 'F', 'C', 'G']     91
Name: count, dtype: int64

In [16]:
# Collect every distinct chord name (token) used in the transposed progressions.
tokens = set()
for progression in df_clean['chord_progression_C']:
    tokens.update(progression)

# Print the tokens alphabetically, eight per line, for a quick visual check.
for position, token in enumerate(sorted(tokens), start=1):
    print(f'{token:8}', end='\t')
    if position % 8 == 0:
        print()

Am      	Am/F    	Am11    	Am6     	Am7     	Am7/G   	Am7add11	Am9     	
Amadd9  	Amaj7   	Ammaj7  	Bm      	Bm11    	Bm6     	Bm7     	Bm9     	
Bmaj7   	Bmmaj7  	C       	C/G     	C2      	C4      	C5      	C6      	
C6add11 	C7      	C7add11 	C7sus2  	C7sus4  	C9      	Cadd#11 	Cadd11  	
Cadd2   	Cadd4   	Cadd9   	Caug    	Cdim    	Cflat5  	Csus    	Csus2   	
Csus4   	Dm      	Dm11    	Dm13    	Dm6     	Dm7     	Dm7/G   	Dm7add11	
Dm9     	Dmadd9  	Dmaj    	Dmaj7   	Dmaj9   	Em      	Em6     	Em7     	
Em9     	Emadd9  	Emaj7   	Emaj9   	F       	F/G     	F2      	F5      	
F6      	F6add9  	F6sus2  	F7      	F9      	F9b5    	Fadd#11 	Fadd2   	
Fadd4   	Fadd9   	Fdim    	Fflat5  	Fsus    	Fsus2   	Fsus4   	G       	
G/C     	G11     	G13     	G2      	G4      	G5      	G6      	G6add11 	
G6sus2  	G7      	G7b9    	G7sus   	G7sus2  	G7sus4  	G9      	Gadd11  	
Gadd4   	Gadd4add9	Gadd9   	Gaug    	Gdim    	Gsus    	Gsus2   	Gsus4   	


In [15]:
# how many tokens?
# (`tokens` is a set, so this is the number of distinct chord names)
len(tokens)

104