In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'D:/Code/Python (D)/Chord Progressions/UltimateGuitarTabs_chords.csv'
df = pd.read_csv(file_path)

df = df.drop(columns=['id', 'tonality', 'capo'])

In [2]:
# Step 1: Parse the chord sequences for each song

# Create a new column to hold the parsed chords as lists
df['parsed_chords'] = df['chords'].apply(lambda x: x.split(','))

In [3]:
from collections import defaultdict

# Function to find repeating patterns with variable window size and immediate repetition constraint
def find_immediate_repeating_patterns(chords, min_window_size=3, max_window_size=10):
    # Initialize a list to hold the identified repeating patterns
    repeating_patterns = []
    
    # Iterate through different window sizes starting from the largest
    for window_size in range(max_window_size, min_window_size - 1, -1):
        i = 0
        while i < len(chords) - window_size + 1:
            # Extract a subsequence using the current window size
            subsequence = tuple(chords[i:i + window_size])
            
            # Check for immediate repetition
            if chords[i:i + window_size] == chords[i + window_size:i + 2 * window_size]:
                if subsequence not in repeating_patterns:
                    repeating_patterns.append(subsequence)
                
                # Skip the repeated subsequences to avoid redundancy
                i += window_size
            else:
                i += 1
    
    return repeating_patterns

# Identify immediate repeating patterns for each song using the simplified parsed chords
# Start with a max window size of 10 and iterate down to a min window size of 3
df['chord_progressions'] = df['parsed_chords'].apply(
    lambda x: find_immediate_repeating_patterns(x, min_window_size=3, max_window_size=10)
)

# Function to expand DataFrame rows based on immediate repeating patterns
def expand_rows_based_on_patterns(row):
    expanded_rows = []
    for pattern in row['chord_progressions']:
        new_row = row.copy()
        new_row['chord_progression'] = list(pattern)
        expanded_rows.append(new_row)
    return expanded_rows

# Expand DataFrame to create a new row for each identified chord progression within each song
expanded_data_list = []
for _, row in df.iterrows():
    expanded_rows = expand_rows_based_on_patterns(row)
    if expanded_rows:
        expanded_data_list.append(pd.DataFrame(expanded_rows))

# Concatenate all the DataFrames in the list into a single DataFrame
df = pd.concat(expanded_data_list, ignore_index=True)

# loop through and replace duplicated progressions with half until no duplicates are left
for _ in range(100):
    counter = 0
    for i, row in df.iterrows():
        chords = row['chord_progression']
        if len(chords) == 1:
            continue
        if chords[:len(chords)//2] == chords[len(chords)//2:]:
            df.at[i, 'chord_progression'] = chords[:len(chords)//2]
            counter += 1
    if counter == 0:
        break

In [4]:
import re

# Function to simplify chords to only include the root note and minor indication
def simplify_chord(chord):
    # Use regular expression to extract the root and minor indication
    match = re.match(r"([A-Ga-g])([#b])?(m)?", chord)
    if match:
        return match.group(1) + (match.group(2) if match.group(2) else "") + (match.group(3) if match.group(3) else "")
    return chord

# Step 1: Simplify chords and parse the chord sequences for each song again
df['simplified_chord_progression'] = df['chord_progression'].apply(lambda chords: [simplify_chord(chord) for chord in chords])


In [5]:
from collections import Counter
import ast

# Define dictionaries for major and minor keys and their associated chords
major_keys = {
    'C': ['C', 'Dm', 'Em', 'F', 'G', 'Am', 'Bm'],
    'G': ['G', 'Am', 'Bm', 'C', 'D', 'Em', 'F#m'],
    'D': ['D', 'Em', 'F#m', 'G', 'A', 'Bm', 'C#m'],
    'A': ['A', 'Bm', 'C#m', 'D', 'E', 'F#m', 'G#m'],
    'E': ['E', 'F#m', 'G#m', 'A', 'B', 'C#m', 'D#m'],
    'B': ['B', 'C#m', 'D#m', 'E', 'F#', 'G#m', 'A#m'],
    'F#': ['F#', 'G#m', 'A#m', 'B', 'C#', 'D#m', 'E#m'],
    'Db': ['Db', 'Ebm', 'Fm', 'Gb', 'Ab', 'Bbm', 'Cm'],
    'Ab': ['Ab', 'Bbm', 'Cm', 'Db', 'Eb', 'Fm', 'Gm'],
    'Eb': ['Eb', 'Fm', 'Gm', 'Ab', 'Bb', 'Cm', 'Dm'],
    'Bb': ['Bb', 'Cm', 'Dm', 'Eb', 'F', 'Gm', 'Am'],
    'F': ['F', 'Gm', 'Am', 'Bb', 'C', 'Dm', 'Em']
}

def identify_key(chord_progression):
    
    # Count the occurrences of each chord
    chord_counts = Counter(chord_progression)
    
    # Initialize variables to store the most likely major and minor keys
    likely_key = None
    progression_length = len(chord_progression)
    
    # Check for exact match
    for key, chords in major_keys.items():
        if sum(chord_counts[chord] for chord in chords if chord in chord_counts) == progression_length:
            if likely_key:
                likely_key = key if key==chord_progression[0] else likely_key
            else:
                likely_key = key
            
    # Return a key if there is an exact match
    if likely_key:
        return likely_key
    return 'Unknown'

# Apply the updated identify_key function to each chord progression in the DataFrame
df['key'] = df['simplified_chord_progression'].apply(identify_key)

In [6]:
df_clean = df[['song', 'artist', 'chord_progression', 'key']]
df_clean = df_clean[df_clean.key != "Unknown"].reset_index(drop=True)

# backup clean chord progressions to pickle
df_clean.to_csv('D:\Code\Python (D)\Chord Progressions\df_clean.csv')

In [68]:
def transpose_chord_to_C(chord, original_key):
    # Use regular expression to find the root note and any modifiers in the chord
    # also need to transpose any notes found after a / character (called extra in this function)
    match = re.match(r"([A-Ga-g])([#b])?(m)?([^/]*)?(?:/)?([A-Ga-g])?([#b])?(m)?", chord)
    
    root_note, accidental, minor, modifiers, extra_note, extra_accidental, extra_minor = match.groups()
    
    if not match:
        return 'ERROR'
    
    root_chord = root_note
    if accidental:
        root_chord += accidental
    if minor:
        root_chord += minor
    if extra_note:
        extra_chord = extra_note
        if extra_accidental:
            extra_chord += extra_accidental
        if extra_minor:
            extra_chord += extra_minor
    
    # Get the list of chords for the original key and the key of C
    original_key_chords = major_keys.get(original_key, [])
    c_key_chords = major_keys['C']
    
    try:
        # Find the index of the root chord in the original key
        index = original_key_chords.index(root_chord)
        # Find the corresponding chord in the key of C
        transposed_root_chord = c_key_chords[index]
        try:
            if extra_note:
                extra_index = original_key_chords.index(extra_chord)
                transposed_extra_chord = c_key_chords[extra_index]
        except ValueError:
            # If the chord is not found in the original_key_chords, just keep the root_chord
            return transposed_root_chord + modifiers
    except ValueError:
        print('ERROR:', chord)
        # If the chord is not found in the original_key_chords, just keep the root_chord
        transposed_root_chord = 'ERROR'
    
    # Re-append any modifiers to the transposed chord
    transposed_chord = chord.replace(root_chord, transposed_root_chord, 1)
    if extra_note:
        transposed_chord = transposed_chord.replace(extra_chord, transposed_extra_chord, 1)
    
    return transposed_chord


chord = 'Gadd11/F#msus4'
original_key = 'G'

transpose_chord_to_C(chord, original_key)

'Cadd11/Bmsus4'

In [None]:
root_note, accidental, minor = 'F', '#', 'm'


In [7]:
## Transpose

# Function to transpose a chord to the key of C, preserving any modifiers like 'm', 'sus4', etc.
def transpose_chord_to_C(chord, original_key):
    # Use regular expression to find the root note and any modifiers in the chord
    match = re.match(r"([A-Ga-g])([#b])?(m)?", chord)
    if not match:
        return chord  # If the chord doesn't match the regular expression, return it as is
    
    root_note, accidental, minor = match.groups()
    root_chord = root_note
    if accidental:
        root_chord += accidental
    if minor:
        root_chord += minor
    
    # Get the list of chords for the original key and the key of C
    original_key_chords = major_keys.get(original_key, [])
    c_key_chords = major_keys['C']
    
    try:
        # Find the index of the root chord in the original key
        index = original_key_chords.index(root_chord)
        # Find the corresponding chord in the key of C
        transposed_root_chord = c_key_chords[index]
    except ValueError:
        print('ERROR:', chord, original_key)
        # If the chord is not found in the original_key_chords, keep it as is
        transposed_root_chord = root_chord
    
    # Re-append any modifiers to the transposed chord
    transposed_chord = chord.replace(root_chord, transposed_root_chord, 1)
    
    return transposed_chord

# Function to transpose an entire chord progression to the key of C
def transpose_progression_to_C(chord_progression, original_key):
    return [transpose_chord_to_C(chord, original_key) for chord in chord_progression]

# Create a new column 'chord_progression_C_v2' that contains the improved transposed chord progressions
df_clean['chord_progression_C'] = df_clean.apply(lambda row: transpose_progression_to_C(row['chord_progression'], row['key']), axis=1)


In [8]:
# backup clean chord progressions to pickle
df_clean.to_pickle('df_clean.pickle')

In [9]:
# save an upsampled pickle
how_many_rows = 100_000
df_clean['chord_progression_C'].rename({'chord_progression_C':'chords'}).sample(frac=100_000/df_clean.shape[0], replace=True).to_pickle('chord_progressions.pickle')

In [10]:
# how many different chord progressions do we have?
f"{df_clean.chord_progression_C.astype('str').nunique()} unique chord progressions out of {df_clean.shape[0]} total chord progressions"

'5300 unique chord progressions out of 17000 total chord progressions'

In [11]:
# most common chord progressions
df_clean.chord_progression_C.astype('str').value_counts().head(20)

chord_progression_C
['C', 'G', 'Am', 'F']    394
['Am', 'F', 'C', 'G']    384
['F', 'C', 'G']          274
['F', 'C', 'G', 'Am']    237
['F', 'G', 'C']          181
['G', 'F', 'C']          180
['G', 'Am', 'F', 'C']    163
['C', 'G', 'F']          162
['F', 'G', 'Am']         150
['Am', 'G', 'F']         144
['C', 'F']               140
['Am', 'F', 'G']         134
['C', 'F', 'G']          132
['C', 'Am', 'F', 'G']    125
['F', 'Am', 'G']         109
['C', 'G']               107
['C', 'C', 'C']          102
['Em', 'C', 'G']          94
['G', 'C', 'F']           94
['C', 'Am', 'F']          89
Name: count, dtype: int64

In [27]:
# what are all of the chords (tokens)?
# poop. need to also transpose the note after slashes...

tokens = set()
for _, row in df_clean.iterrows():
    tokens |= set(row.chord_progression_C)
    
tokens

{'Am',
 'Am/Ab',
 'Am/C#',
 'Am/F#',
 'Am/G#',
 'Am/Gb',
 'Am11',
 'Am6',
 'Am7',
 'Am7/Ab',
 'Am7/Cb',
 'Am7add11',
 'Am9',
 'Amadd9',
 'Amaj7',
 'Ammaj7',
 'Bm',
 'Bm/F#',
 'Bm/G#',
 'Bm11',
 'Bm6',
 'Bm7',
 'Bm9',
 'Bmaj7',
 'Bmmaj7/C#',
 'C',
 'C(2)',
 'C(9)',
 'C(add4)',
 'C(b5)',
 'C/Bb',
 'C/C#',
 'C/D#',
 'C/Eb',
 'C/F#',
 'C/G#',
 'C2',
 'C2/F#',
 'C4',
 'C5',
 'C5/F#',
 'C6',
 'C6add11',
 'C7',
 'C7(sus2)',
 'C7add11',
 'C7sus2',
 'C7sus4',
 'C9',
 'C9/C#',
 'Cadd#11',
 'Cadd11',
 'Cadd2',
 'Cadd4',
 'Cadd9',
 'Caug',
 'Cdim',
 'Csus',
 'Csus2',
 'Csus2/C#',
 'Csus2/F#',
 'Csus4',
 'Csus4/F#',
 'Dm',
 'Dm(add9)',
 'Dm/Db',
 'Dm11',
 'Dm13/C#',
 'Dm6',
 'Dm7',
 'Dm7/Eb',
 'Dm7/F#',
 'Dm7add11',
 'Dm9',
 'Dmadd9',
 'Dmaj',
 'Dmaj7',
 'Dmaj9',
 'Em',
 'Em/C#',
 'Em/Db',
 'Em/F#',
 'Em/Gb',
 'Em6',
 'Em7',
 'Em9',
 'Emadd9',
 'Emaj7',
 'Emaj9',
 'F',
 'F(add9)',
 'F(b5)',
 'F/Ab',
 'F/Bb',
 'F/C#',
 'F/F#',
 'F/G#',
 'F2',
 'F5',
 'F6',
 'F6add9',
 'F6sus2',
 'F7',
 'F9',
 'F9b5'