In [1]:
import os

# Gather the raw chord sheets: every ".txt" entry of the "songs" folder
# located under the current working directory.
folder_name = "\\songs"
files = os.listdir(os.getcwd() + folder_name)
input_songs = list(filter(lambda entry: entry.endswith('.txt'), files))
print(input_songs)  # Original songs

['keyboards_ray-charles_a-song-for-you.txt', 'keyboards_ray-charles_aint-misbehavin.txt', 'keyboards_ray-charles_aint-that-love.txt', 'keyboards_ray-charles_almost-like-being-in-love.txt', 'keyboards_ray-charles_alone-together.txt', 'keyboards_ray-charles_am-i-blue.txt', 'keyboards_ray-charles_america-the-beautiful.txt', 'keyboards_ray-charles_basin-street-blues.txt', 'keyboards_ray-charles_bewitched-bothered-and-bewildered.txt', 'keyboards_ray-charles_black-coffee.txt', 'keyboards_ray-charles_born-to-be-blue.txt', 'keyboards_ray-charles_born-to-lose.txt', 'keyboards_ray-charles_bulldog-bite-hunker-down-hairy-dawg.txt', 'keyboards_ray-charles_busted.txt', 'keyboards_ray-charles_california-here-i-come.txt', 'keyboards_ray-charles_candy.txt', 'keyboards_ray-charles_carry-me-back-to-old-virginny.txt', 'keyboards_ray-charles_come-rain-or-come-shine.txt', 'keyboards_ray-charles_cry-me-a-river.txt', 'keyboards_ray-charles_crying-time.txt', 'keyboards_ray-charles_dont-cry-for-me-argentina.txt

## Create General network

In [2]:
import re
import  numpy as np

# Every fragment is a raw string so that `\b`, `\/`, `\+` and `\w` reach the
# regex engine verbatim instead of being parsed as (invalid) string escapes.
_CHORD_PATTERN = re.compile(
    r"\b"
    r"[CDEFGAB]"                        # root note
    r"(?:#|##|b|bb)?"                   # optional accidental
    r"(?:maj|min|m|sus|aug|dim|add)?"   # optional chord quality
    r"(?:\/[CDEFGAB])?"                 # optional "/<note>" part
    r"(?:#|##|b|bb)?"                   # second optional accidental
    r"(?:[0-9]+(?:\/[0-9]+)?-?\+?)?"    # optional digits such as 7, 6/9, 7-, 7+
    r"(?:\/[0-9]?-?[CDEFGAB]?)?"        # optional second "/..." part
    r"(?:[0-9]+[CDEFGABM])?"            # optional digits followed by a letter
    r"(?![A-Za-z'])"                    # must not be followed by a letter or apostrophe
    r"(?!\w)"                           # ... nor by any other word character
)


def fetch_chords(line):
    """
    Run the chord regex on one line of a song's txt file.

    Parameters:
        line (str): a single line of text.

    Returns:
        list[str]: every non-overlapping chord-like token found in ``line``,
        in order of appearance; an empty list when nothing matches.
    """
    return _CHORD_PATTERN.findall(line)

def flat_list(in_list):
    """
    Convert a nested list (a list of lists) into a single flat list.

    Flattens exactly one level and keeps the elements as they are, so an
    empty ``in_list`` yields ``[]`` instead of raising.

    Parameters:
        in_list (iterable of iterables): e.g. the chords found on each line.

    Returns:
        list: all inner elements, in their original order.
    """
    return [item for inner in in_list for item in inner]

def write_output_file(songs_name, path, chords):
    """
    Write the final output in to a .txt file, one chord per line.

    The file is created as ``<cwd>/<path>/<songs_name>.txt``; the directory
    is created first when it does not exist yet.

    Parameters:
        songs_name (str): file name without the ``.txt`` extension.
        path (str): output directory, relative to the current working directory.
        chords (iterable of str): chords to write.

    Errors raised while writing the file are printed, not re-raised.
    """
    directory = os.path.join(os.getcwd(), path)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)
    try:
        # os.path.join keeps the file inside `directory` on every platform.
        out_file_name = os.path.join(directory, f"{songs_name}.txt")
        with open(out_file_name, 'w') as file:
            file.writelines(chord + '\n' for chord in chords)
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def write_to_csv(name, path, chords):
    """
    Write a DataFrame (e.g. source and destination chords) to a csv file.

    The file is created as ``<cwd>/<path>/<name>.csv`` with ``;`` as the
    separator and without the index column; the directory is created first
    when it does not exist yet.

    Parameters:
        name (str): file name without the ``.csv`` extension.
        path (str): output directory, relative to the current working directory.
        chords (pandas.DataFrame): the table to write.

    Any error is printed, not re-raised.
    """
    try:
        directory = os.path.join(os.getcwd(), path)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # os.path.join replaces the '\{name}' literal, which relied on an
        # invalid escape sequence and a Windows-only separator.
        chords.to_csv(os.path.join(directory, f"{name}.csv"), index=False, sep=';')
    except Exception as e:
        print(f"An error occurred: {str(e)}")

def generate_unique_chord_ids(unique_chords):
    """
    Map every chord to a numeric ID.

    IDs start at 1 and follow the order in which the chords are given.
    """
    return {chord: chord_id for chord_id, chord in enumerate(unique_chords, start=1)}

In [3]:
def extract_song_details(path):
    """
    Read the txt file and extract song name and chords.

    The first line of the file is taken as the song name; every remaining
    line is scanned with ``fetch_chords``.

    Parameters:
        path (str): path of the song's txt file.

    Returns:
        tuple[str, list] | None: ``(song_name, chords)`` on success, where
        ``chords`` is the flat list of all chords in file order (empty when
        no line contains a chord). ``None`` when the file is missing or any
        other error occurs; the error is printed.
    """
    try:
        # The context manager closes the file even when reading fails.
        with open(path, 'r') as file:
            lines = file.readlines()

        song_name = lines.pop(0).replace('\n', '')  # first line holds the song name

        # Lines without any chord (e.g. lyrics) produce empty lists; drop them.
        chords_ = [found for found in map(fetch_chords, lines) if found]
        # Guard the empty case so a song without chords is not reported as an error.
        flat_vec_chords = flat_list(chords_) if chords_ else []
        return song_name, flat_vec_chords
    except FileNotFoundError:
        print("File not found")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return None

In [4]:
import pandas as pd

# Parse every raw song, write its chord sequence to songs\final and collect
# the set of distinct chords across all songs.
unique_chords = set()
song_names = []

for song in input_songs:
    # "\\" spells the backslash explicitly; the former "\{" was an invalid escape.
    details = extract_song_details(f"songs\\{song}")
    if details is None:
        # The failure was already printed by extract_song_details; skip the
        # song instead of crashing while unpacking None.
        continue
    name, chords = details
    unique_chords.update(chords)
    write_output_file(name, "songs\\final", chords)
    song_names.append(name)

# Node table in Gephi layout: one row per distinct chord, columns Id and Label.
chord_ids = generate_unique_chord_ids(sorted(unique_chords)) # chord id dictionary
nodes = pd.DataFrame(list(chord_ids.items()), columns=['Label', 'Id'])
nodes = nodes[['Id', 'Label']]
write_to_csv('node', "songs\\final\\csv\\combined", nodes)

In [5]:
# List the per-song chord files (.txt) that were written to songs\final.
folder_name = "\\songs\\final"
final_files = os.listdir(os.getcwd() + folder_name)
final_songs = list(filter(lambda entry: entry.endswith('.txt'), final_files))
print(final_songs)

['Ray Charles - A Song For You (Keyboard chords).txt', "Ray Charles - Ain't Misbehavin' (Keyboard chords).txt", 'Ray Charles - Aint That Love (Keyboard chords).txt', 'Ray Charles - Almost Like Being In Love (Keyboard chords).txt', 'Ray Charles - Alone Together (Keyboard chords).txt', 'Ray Charles - Am I Blue (Keyboard chords).txt', 'Ray Charles - America the beautiful (Keyboard chords).txt', 'Ray Charles - Basin Street Blues (Keyboard chords).txt', 'Ray Charles - Bewitched, Bothered And Bewildered (Keyboard chords).txt', 'Ray Charles - Black Coffee (Keyboard chords).txt', 'Ray Charles - Born To Be Blue (Keyboard chords).txt', 'Ray Charles - Born to lose (Keyboard chords).txt', 'Ray Charles - Bulldog Bite Hunker Down Hairy Dawg (Keyboard chords).txt', 'Ray Charles - Busted (Keyboard chords).txt', 'Ray Charles - California Here I Come (Keyboard chords).txt', 'Ray Charles - Candy (Keyboard chords).txt', 'Ray Charles - Carry Me Back To Old Virginny (Keyboard chords).txt', 'Ray Charles - Co

In [6]:
import pandas as pd

def replace_chords_with_ids(chords, chord_dic):
    """
    Translate each chord name into its ID by looking it up in ``chord_dic``.

    The result keeps the order (and repetitions) of ``chords``.
    """
    ids = []
    for chord_name in chords:
        ids.append(chord_dic[chord_name])
    return ids

def create_src_dst(chords):
    """
    Create a dataframe consisting of source chord and target chord.

    Every pair of consecutive entries ``chords[i-1] -> chords[i]`` becomes
    one row; duplicate rows are removed, keeping the first occurrence.

    Parameters:
        chords (sequence of int): chord IDs in the order they are played.

    Returns:
        pandas.DataFrame: columns ``Source`` (int), ``Target`` and ``Type``
        (always ``'Directed'``). Row labels are the position of the target
        chord in ``chords``. Empty when fewer than two chords are given.
    """
    chords = list(chords)
    # Pair each chord with its predecessor directly. This replaces
    # shift(periods=[1]) (a list where a scalar was meant) followed by
    # assignments into an iloc slice.
    edges = pd.DataFrame(
        {'Source': chords[:-1], 'Target': chords[1:]},
        index=range(1, len(chords)),
    )
    edges['Source'] = edges['Source'].astype(int)
    return edges.assign(Type='Directed').drop_duplicates()

def convert_to_gephi_format(path):
    """
    Read a song's chord txt file and convert it into Gephi csv format.

    Each line of the file is one chord name. The names are translated to IDs
    through the notebook-level ``chord_ids`` dictionary, turned into
    source/target rows and written to ``songs\\final\\csv\\<file name>.csv``.

    Parameters:
        path (str): path of the chord txt file.

    Returns:
        pandas.DataFrame | None: the edge table that was written, or ``None``
        when the file is missing or any other error occurs (the error is
        printed).
    """
    try:
        # The context manager closes the file even when a later step fails.
        with open(path, 'r') as file:
            chords = [line.replace('\n', '') for line in file]
        trans_chord = replace_chords_with_ids(chords, chord_ids)
        df = create_src_dst(trans_chord)
        name = os.path.basename(path)
        write_to_csv(name.replace(".txt", ""), "songs\\final\\csv", df)
        return df
    except FileNotFoundError:
        print("File not found")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return None

In [7]:
# Convert every per-song chord file into an edge table and combine them all.
df_list = []
for song in final_songs:
    # The former extra open() of the same file was never used nor closed.
    df = convert_to_gephi_format(f"songs\\final\\{song}")
    if df is not None:  # None means the conversion failed and was already reported
        df_list.append(df)

concat_df = pd.concat(df_list)
write_to_csv('edge', "songs\\final\\csv\\combined", concat_df)

## Create Jazz network and other genres network


In [8]:
# Song titles treated as jazz; each must match the "<title>" part of a
# "Ray Charles - <title> (Keyboard chords).csv" file in songs\final\csv.
jazz_songs = [
    "Ain't Misbehavin'",
    "Almost Like Being In Love",
    "Alone Together",
    "Basin Street Blues",
    "Bewitched, Bothered And Bewildered",
    "Black Coffee",
    "Bulldog Bite Hunker Down Hairy Dawg",
    "California Here I Come",
    "Ev'ry Time We Say Goodbye",
    "Falling In Love With Love",
    "It Had To Be You",
    "Moon River",
    "My Funny Valentine",
    "My Old Flame",
    "My Romance",
    "Nancy (with The Laughing Face)",
    "Rays Blues",
    "She's Funny That Way",
    "Some Enchanted Evening",
    "Stella by Starlight",
    "Sweet Georgia Brown",
    "The Lady Is A Tramp",
    "The Man I Love",
    "Three Quarter Time",
    "Willow Weep For Me",
    "Â€˜deed I Do",
]
jazz_csv_list = []
jazz_df_list = []
for song in jazz_songs:
    csv_name = f"Ray Charles - {song} (Keyboard chords).csv"
    # Every backslash is written as "\\": "\c" and "\R" were invalid escapes.
    jazz_df = pd.read_csv(f"songs\\final\\csv\\{csv_name}", delimiter=';')
    jazz_csv_list.append(csv_name)
    jazz_df_list.append(jazz_df)

jazz_concat = pd.concat(jazz_df_list)
# IDs of every chord that appears as a source or a target in a jazz song.
jazz_chords = list(set(pd.concat([jazz_concat['Source'], jazz_concat['Target']]).tolist()))
all_nodes = pd.read_csv("songs\\final\\csv\\combined\\node.csv", delimiter=';')
jazz_nodes = all_nodes.loc[all_nodes['Id'].isin(jazz_chords)]

write_to_csv('jazz_edge', "songs\\final\\csv\\combined", jazz_concat)
write_to_csv('jazz_node', "songs\\final\\csv\\combined", jazz_nodes)


In [9]:
# Every per-song csv that is not in the jazz list forms the "other genres" network.
folder_name = "\\songs\\final\\csv"
files = os.listdir(os.getcwd() + folder_name)
input_all_songs = [entry for entry in files if entry.endswith('.csv')]
input_other_songs = [entry for entry in input_all_songs if entry not in jazz_csv_list]

other_df_list = [
    pd.read_csv(f"songs\\final\\csv\\{song}", delimiter=';')
    for song in input_other_songs
]

other_concat = pd.concat(other_df_list)
# IDs of every chord that appears as a source or a target in these songs.
other_endpoints = pd.concat([other_concat['Source'], other_concat['Target']])
other_chords = list(set(other_endpoints.tolist()))
other_nodes = all_nodes.loc[all_nodes['Id'].isin(other_chords)]

write_to_csv('other_edge', "songs\\final\\csv\\combined", other_concat)
write_to_csv('other_node', "songs\\final\\csv\\combined", other_nodes)

## Filtering out self-loops and selecting the dominant edges

In [10]:
# Load the combined edge list and discard self-loops (Source == Target).
all_songs_edges = pd.read_csv("songs\\final\\csv\\combined\\edge.csv", delimiter=';')
all_songs_edges = all_songs_edges[all_songs_edges['Source'] != all_songs_edges['Target']]
all_songs_uniqe_edges = all_songs_edges.drop_duplicates(subset=['Source', 'Target'])

# Whenever both A->B and B->A are present, keep only the direction with more
# rows; on a tie the direction currently being visited is the one dropped.
for _, edge in all_songs_uniqe_edges.iterrows():
    src, dst = edge['Source'], edge['Target']
    forward = (all_songs_edges['Source'] == src) & (all_songs_edges['Target'] == dst)
    backward = (all_songs_edges['Source'] == dst) & (all_songs_edges['Target'] == src)
    if not backward.any():
        continue
    weaker = backward if forward.sum() > backward.sum() else forward
    all_songs_edges = all_songs_edges[~weaker]

write_to_csv('dominant_edge', "songs\\final\\csv\\combined", all_songs_edges)

In [11]:
# Load the jazz edge list and discard self-loops (Source == Target).
jazz_songs_edges = pd.read_csv("songs\\final\\csv\\combined\\jazz_edge.csv", delimiter=';')
jazz_songs_edges = jazz_songs_edges[jazz_songs_edges['Source'] != jazz_songs_edges['Target']]
jazz_songs_uniqe_edges = jazz_songs_edges.drop_duplicates(subset=['Source', 'Target'])

# Whenever both A->B and B->A are present, keep only the direction with more
# rows; on a tie the direction currently being visited is the one dropped.
for _, edge in jazz_songs_uniqe_edges.iterrows():
    src, dst = edge['Source'], edge['Target']
    forward = (jazz_songs_edges['Source'] == src) & (jazz_songs_edges['Target'] == dst)
    backward = (jazz_songs_edges['Source'] == dst) & (jazz_songs_edges['Target'] == src)
    if not backward.any():
        continue
    weaker = backward if forward.sum() > backward.sum() else forward
    jazz_songs_edges = jazz_songs_edges[~weaker]

write_to_csv('dominant_jazz_edge', "songs\\final\\csv\\combined", jazz_songs_edges)

In [12]:
# Load the other-genres edge list and discard self-loops (Source == Target).
other_songs_edges = pd.read_csv("songs\\final\\csv\\combined\\other_edge.csv", delimiter=';')
other_songs_edges = other_songs_edges[other_songs_edges['Source'] != other_songs_edges['Target']]
other_songs_uniqe_edges = other_songs_edges.drop_duplicates(subset=['Source', 'Target'])

# Whenever both A->B and B->A are present, keep only the direction with more
# rows; on a tie the direction currently being visited is the one dropped.
for _, edge in other_songs_uniqe_edges.iterrows():
    src, dst = edge['Source'], edge['Target']
    forward = (other_songs_edges['Source'] == src) & (other_songs_edges['Target'] == dst)
    backward = (other_songs_edges['Source'] == dst) & (other_songs_edges['Target'] == src)
    if not backward.any():
        continue
    weaker = backward if forward.sum() > backward.sum() else forward
    other_songs_edges = other_songs_edges[~weaker]

write_to_csv('dominant_other_edge', "songs\\final\\csv\\combined", other_songs_edges)

## Reconstructing the general jazz edges

In [13]:
# Rebuild the general-jazz edge list from the per-song graph files (.txt)
# stored in "GENERAL JAZZ\Songs graph" under the working directory.
gen_folder_name = "\\GENERAL JAZZ\\Songs graph"
gen_files = os.listdir(os.getcwd() + gen_folder_name)
input_gen_songs = [file for file in gen_files if file.endswith('.txt')]

final_gen_list = []
for song in input_gen_songs:
    # The context manager closes the file even when reading fails.
    with open(f"{os.getcwd() + gen_folder_name}\\{song}", "r") as file: # read each song one by one
        lines = file.readlines()
    del lines[0:4] # ignore first 4 lines in txt file as that information is irrelevant
    # Each remaining line holds tab-separated integers forming one chord pair.
    # Blank lines are skipped so they do not crash int().
    chord_tup = [tuple(map(int, elem.split('\t'))) for elem in lines if elem.strip()] # create chord pairs as tuples
    chord_tup = list(set(chord_tup)) # keep each pair once per song
    final_gen_list.extend(chord_tup)
gen_df = pd.DataFrame(final_gen_list, columns=['Source', 'Target'])
gen_df['Type'] = 'Directed'
gen_df['Id'] = None
gen_df['Label'] = None
gen_df['Weight'] = None
write_to_csv('corrected_edge', "GENERAL JAZZ", gen_df) # write the new corrected edges

## Create a new dictionary combining general jazz and Ray Charles

In [16]:
# Build one common chord dictionary from the Ray Charles (RC) node table and
# the general-jazz node table, then re-number both edge lists with it.
rc_jazz_df = pd.read_csv("songs\\final\\csv\\combined\\node.csv", delimiter = ';')
general_jazz_df = pd.read_csv("GENERAL JAZZ\\node.csv")
merged_df = pd.concat([rc_jazz_df, general_jazz_df]).drop_duplicates().reset_index(drop=True)
unique_chords = set(merged_df['Label'])
new_chord_ids = generate_unique_chord_ids(sorted(list(unique_chords)))
new_nodes = pd.DataFrame(list(new_chord_ids.items()), columns=['Label', 'Id'])
new_nodes = new_nodes[['Id', 'Label']]

# Select each network's nodes by Label. The old and new Ids come from
# different numbering schemes, so comparing Id against Id picks unrelated
# rows and leaves part of the labels without a translation.
new_nodes_rc = new_nodes[(new_nodes['Label'].isin(rc_jazz_df['Label']))]
write_to_csv('new_node', "songs\\final\\csv\\combined", new_nodes_rc)

new_nodes_gen = new_nodes[(new_nodes['Label'].isin(general_jazz_df['Label']))]
write_to_csv('new_node', "GENERAL JAZZ", new_nodes_gen)

write_to_csv('common_node', "GENERAL JAZZ", new_nodes)

# Translate RC jazz edge ID's to new format

# Dictionary translator for RC music: Id_x is the old Id, Id_y the new Id.
rc_dic_trans = pd.merge(rc_jazz_df, new_nodes_rc, on='Label', how='left')
rc_dic_trans = rc_dic_trans.drop('Label', axis=1)

rc_edges = pd.read_csv("songs\\final\\csv\\combined\\edge.csv", delimiter=";")

# combine_first keeps the original Id wherever no translation was found.
merged_rc_edges = pd.merge(rc_edges, rc_dic_trans, left_on='Source', right_on="Id_x", how='left')
rc_edges['Source'] = round(merged_rc_edges['Id_y'].combine_first(rc_edges['Source']))

merged_rc_edges = pd.merge(rc_edges, rc_dic_trans, left_on='Target', right_on="Id_x", how='left')
rc_edges['Target'] = round(merged_rc_edges['Id_y'].combine_first(rc_edges['Target']))

write_to_csv('new_edge', "songs\\final\\csv\\combined", rc_edges)

# Translate general Jazz edge ID's to new format

# Dictionary translator for general Jazz music: Id_x is the old Id, Id_y the new Id.
gen_dic_trans = pd.merge(general_jazz_df, new_nodes_gen, on='Label', how='left')
gen_dic_trans = gen_dic_trans.drop('Label', axis=1)

general_edges = pd.read_csv("GENERAL JAZZ\\corrected_edge.csv", delimiter=";")

merged_gen_edges = pd.merge(general_edges, gen_dic_trans, left_on='Source', right_on="Id_x", how='left')
general_edges['Source'] = round(merged_gen_edges['Id_y'].combine_first(general_edges['Source']))

merged_gen_edges = pd.merge(general_edges, gen_dic_trans, left_on='Target', right_on="Id_x", how='left')
general_edges['Target'] = round(merged_gen_edges['Id_y'].combine_first(general_edges['Target']))

write_to_csv('new_edge', "GENERAL JAZZ", general_edges)