In [26]:
import time
import scipy
import numpy as np
import pandas as pd
import winsound

import networkx as nx
import matplotlib

from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import HeteroData

from HeteroDataFunctions import Encoder, add_types, complete_graph, flatten_lol, node_cat_dict, midi_type

print(scipy.__version__)
print(matplotlib.__version__)
print(nx.__version__)

1.7.3
3.6.2
2.8.4


# 3 Songs Example

In [2]:
G = complete_graph(".\slac\embeddings\Test Edgelists")

loading edgelists...
- full_edgelist.edgelist
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 2703
Edges: 19366


In [3]:
nodes = pd.DataFrame((list(G.nodes)), columns=['name'])
edges = pd.DataFrame(np.array(list(G.edges)), columns=['source', 'target'])

In [4]:
node_categories = node_cat_dict(nodes)

node_categories.keys()

node_cat_dict took 0.01 secs to run


dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [5]:
node_categories['tempo']

['9', '6', '11']

In [6]:
main_edge_types = ["MIDI__has__tempo",
                   "MIDI__in__time_sig",
                   "MIDI__has__program",
                   "MIDI__has__note_group",
                   "note_group__has__velocity",
                   "note_group__has__duration",
                   "note_group__contains__pitch"]



In [7]:
nodes_f, edges_f = add_types(nodes, edges, node_categories)


add_node_type took 0.54 secs to run
add_edge_type took 3.01 secs to run
add_types took 3.55 secs to run


In [8]:
edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

Unnamed: 0,source,target
0,-Albert_King_-_Born_Under_A_Bad_Sign,g1601074
1,-Albert_King_-_Born_Under_A_Bad_Sign,g1577049
2,-Albert_King_-_Born_Under_A_Bad_Sign,g1575127
3,-Albert_King_-_Born_Under_A_Bad_Sign,g-1872221027
4,-Albert_King_-_Born_Under_A_Bad_Sign,g795292196
...,...,...
18857,-B_B_King_-_Rock_Me_Baby,g1750992
18858,-B_B_King_-_Rock_Me_Baby,g415858330
18859,-B_B_King_-_Rock_Me_Baby,g1606844
18860,-B_B_King_-_Rock_Me_Baby,g1605881


## Graph Build

In [9]:
data = HeteroData()

data['MIDI'].x = node_categories['MIDI']
data['note_group'].x = node_categories['note_group']

data['MIDI', 'has', 'note_group'].edge_index = edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

In [10]:
data.metadata()

(['MIDI', 'note_group'], [('MIDI', 'has', 'note_group')])

In [11]:
data['MIDI']

{'x': ['-B_B_King_-_Rock_Me_Baby', '-Albert_King_-_Born_Under_A_Bad_Sign', '-B_B_King_-_How_Blue_Can_You_Get']}

In [12]:
node_edge_categories = node_categories.copy()
node_edge_categories['node_types'] = list(node_categories.keys())
node_edge_categories['main_edge_types'] = main_edge_types  # Dictionary containing every string that may be found in our Dataframes

In [13]:
names_list = flatten_lol(node_edge_categories.values())


In [14]:
len(names_list) == len(set(names_list))

True

In [15]:
nodes_f.iloc[0][0]

'-Albert_King_-_Born_Under_A_Bad_Sign'

In [16]:
encoder = Encoder(names_list)

In [17]:
nodes_ten = encoder.encode_nodes(nodes_f)

nodes_ten.numpy()

encode_nodes took 0.48 secs to run


array([[2639, 2706],
       [   0, 2703],
       [   1, 2703],
       ...,
       [2614, 2704],
       [2554, 2704],
       [2579, 2704]])

In [18]:
edges_ten = encoder.encode_edges(edges_f)
edges_ten.numpy()

encode_edges took 1.90 secs to run


array([[2639,    0, 2714],
       [2639,    1, 2714],
       [2639,    2, 2714],
       ...,
       [2552, 2565, 2717],
       [2553, 2609, 2717],
       [2553, 2667, 2716]])

In [19]:
edges_ten[:, :2]

tensor([[2639,    0],
        [2639,    1],
        [2639,    2],
        ...,
        [2552, 2565],
        [2553, 2609],
        [2553, 2667]], dtype=torch.int32)

In [20]:
node_type = nodes_f.iloc[:, 1]

# Get the source and target indices from the edges tensor
edge_index = edges_ten[:, :2]

# Get the edge types from the edges tensor
edge_type = edges_f.iloc[:, 2]


In [21]:
hetero_graph = HeteroData(x=nodes_ten, node_type=node_type, edge_index=edge_index, edge_type=edge_type)

In [22]:
print(hetero_graph)

HeteroData(
  x=[2703, 2],
  node_type=0             MIDI
1       note_group
2       note_group
3       note_group
4       note_group
           ...    
2698       program
2699         tempo
2700         pitch
2701         pitch
2702         pitch
Name: node_type, Length: 2703, dtype: object,
  edge_index=[19366, 2],
  edge_type=0              MIDI__has__note_group
1              MIDI__has__note_group
2              MIDI__has__note_group
3              MIDI__has__note_group
4              MIDI__has__note_group
                    ...             
19361      note_group__has__duration
19362    note_group__contains__pitch
19363    note_group__contains__pitch
19364    note_group__contains__pitch
19365      note_group__has__duration
Name: edge_type, Length: 19366, dtype: object
)


# Complete Dataset

In [2]:
C = complete_graph(".\slac\embeddings\\all")

loading edgelists...
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 93553
Edges: 786635


In [3]:
nodes_c = pd.DataFrame((list(C.nodes)), columns=['name'])
edges_c = pd.DataFrame(np.array(list(C.edges)), columns=['source', 'target'])

In [4]:
node_categories_c = node_cat_dict(nodes_c)

node_cat_dict took 0.18 secs to run


In [5]:
node_categories_c.keys()

dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [6]:
%%script false --no-raise-error

nodes_full, edges_full = add_types(nodes_c, edges_c, node_categories_c)

winsound.Beep(400, 700)

nodes_full.to_csv('nodes_complete.csv')
edges_full.to_csv('edges_complete.csv')

Couldn't find program: 'false'


In [7]:
nodes_full = pd.read_csv('.\slac\Contents of Slac\\nodes_complete.csv')
edges_full = pd.read_csv('.\slac\Contents of Slac\edges_complete.csv')


In [8]:
main_edge_types = ["MIDI__has__tempo",
                   "MIDI__in__time_sig",
                   "MIDI__has__program",
                   "MIDI__has__note_group",
                   "note_group__has__velocity",
                   "note_group__has__duration",
                   "note_group__contains__pitch"]

In [9]:
nodes_full[nodes_full['node_type'] == "MIDI"]

Unnamed: 0.1,Unnamed: 0,name,node_type
0,0,Blues_-_Modern-Albert_King_-_Born_Under_A_Bad_...,MIDI
1029,1029,Blues_-_Modern-B_B_King_-_How_Blue_Can_You_Get,MIDI
2248,2248,Blues_-_Modern-B_B_King_-_Rock_Me_Baby,MIDI
2686,2686,Blues_-_Modern-B_B_King_-_The_Thrill_Is_Gone,MIDI
3738,3738,Blues_-_Modern-Buddy_Guy_-_Don't_Answer_the_Door,MIDI
...,...,...,...
92865,92865,Rock_-_Metal-Rage_Against_the_Machine_-_Bulls_...,MIDI
92925,92925,Rock_-_Metal-Rage_Against_the_Machine_-_Gueril...,MIDI
93030,93030,Rock_-_Metal-Rage_Against_the_Machine_-_Killin...,MIDI
93133,93133,Rock_-_Metal-Rage_Against_the_Machine_-_Know_Y...,MIDI


In [10]:
s = "Classical_-_Romantic-Berlioz_-_Harold_In_Italy_Op_16"
s.split('_-_')[0]

'Classical'

In [11]:
full_categories = node_categories_c.copy()
full_categories['node_types'] = list(node_categories_c.keys())
full_categories['main_edge_types'] = main_edge_types  # Dictionary containing every string that may be found in our Dataframes
names_list_full = flatten_lol(full_categories.values())

len(names_list_full) == len(set(names_list_full))

True

In [12]:
encoder_f = Encoder(names_list_full)

# nodes_ten_f = encoder_f.encode_nodes(nodes_full)

# edges_ten_f = encoder_f.encode_edges(edges_full)

winsound.Beep(400, 700)

In [13]:
nodes_full.loc[nodes_full['node_type'] == 'MIDI', ['name']]

Unnamed: 0,name
0,Blues_-_Modern-Albert_King_-_Born_Under_A_Bad_...
1029,Blues_-_Modern-B_B_King_-_How_Blue_Can_You_Get
2248,Blues_-_Modern-B_B_King_-_Rock_Me_Baby
2686,Blues_-_Modern-B_B_King_-_The_Thrill_Is_Gone
3738,Blues_-_Modern-Buddy_Guy_-_Don't_Answer_the_Door
...,...
92865,Rock_-_Metal-Rage_Against_the_Machine_-_Bulls_...
92925,Rock_-_Metal-Rage_Against_the_Machine_-_Gueril...
93030,Rock_-_Metal-Rage_Against_the_Machine_-_Killin...
93133,Rock_-_Metal-Rage_Against_the_Machine_-_Know_Y...


In [14]:
midi_ten = encoder_f.encode_nodes2(nodes_full.loc[nodes_full['node_type'] == 'MIDI', ['name']])
midi_ten

tensor([[92833],
        [92881],
        [92693],
        [92904],
        [92868],
        [92687],
        [92768],
        [92689],
        [92878],
        [92761],
        [92830],
        [92770],
        [92837],
        [92698],
        [92802],
        [92697],
        [92883],
        [92806],
        [92757],
        [92732],
        [92726],
        [92779],
        [92842],
        [92702],
        [92788],
        [92928],
        [92867],
        [92771],
        [92713],
        [92839],
        [92845],
        [92745],
        [92816],
        [92870],
        [92852],
        [92828],
        [92776],
        [92786],
        [92755],
        [92819],
        [92840],
        [92799],
        [92901],
        [92763],
        [92921],
        [92721],
        [92930],
        [92715],
        [92703],
        [92696],
        [92794],
        [92708],
        [92710],
        [92730],
        [92926],
        [92729],
        [92720],
        [92827],
        [92835

In [15]:
for _, s in nodes_full.loc[nodes_full['node_type'] == 'MIDI', ['name']].items():
    pass

midi_class = [midi_type(st) for st in s.values]
midi_class

['Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Blues',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classical',
 'Classica

In [27]:
lb = LabelEncoder()
y =  lb.fit_transform(midi_class)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4], dtype=int64)

In [15]:
node_type = nodes_full.iloc[:, 1]

# Get the source and target indices from the edges tensor
edge_index = edges_ten_f[:, :2]

# Get the edge types from the edges tensor
edge_type = edges_full.iloc[:, 2]


In [16]:
full_hetero_graph = HeteroData(x=nodes_ten_f, node_type=node_type, edge_index=edge_index, edge_type=edge_type)

In [16]:
full_hetero_graph

HeteroData(
  x=[93553, 2],
  node_type=0              MIDI
1        note_group
2             pitch
3          duration
4          velocity
            ...    
93548      time_sig
93549      time_sig
93550      time_sig
93551      time_sig
93552      time_sig
Name: node_type, Length: 93553, dtype: object,
  edge_index=[786635, 2],
  edge_type=0         MIDI__has__note_group
1         MIDI__has__note_group
2         MIDI__has__note_group
3         MIDI__has__note_group
4         MIDI__has__note_group
                  ...          
786630       MIDI__has__program
786631       MIDI__has__program
786632       MIDI__has__program
786633         MIDI__has__tempo
786634       MIDI__in__time_sig
Name: edge_type, Length: 786635, dtype: object
)

In [28]:
hetero_test = HeteroData({'MIDI': {'x': midi_ten, 'y': y}})

In [32]:
hetero_test['MIDI'].y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4], dtype=int64)