In [2]:
import time
import scipy
import numpy as np
import pandas as pd
import winsound

import networkx as nx
import matplotlib

import torch
from torch_geometric.data import HeteroData

from HeteroDataFunctions import Encoder, add_types, complete_graph, flatten_lol, node_cat_dict

print(scipy.__version__)
print(matplotlib.__version__)
print(nx.__version__)

1.7.3
3.6.2
2.8.4


# 3 Songs Example

In [3]:
G = complete_graph(".\slac\embeddings\Test Edgelists")

loading edgelists...
- full_edgelist.edgelist
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 2703
Edges: 19366


In [4]:
nodes = pd.DataFrame((list(G.nodes)), columns=['name'])
edges = pd.DataFrame(np.array(list(G.edges)), columns=['source', 'target'])

In [5]:
node_categories = node_cat_dict(nodes)

node_categories.keys()

dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [6]:
node_categories['tempo']

['9', '11', '6']

In [7]:
main_edge_types = ["MIDI__has__tempo",
                   "MIDI__in__time_sig",
                   "MIDI__has__program",
                   "MIDI__has__note_group",
                   "note_group__has__velocity",
                   "note_group__has__duration",
                   "note_group__contains__pitch"]



In [8]:
nodes_f, edges_f = add_types(nodes, edges, node_categories)


In [9]:
edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

Unnamed: 0,source,target
0,-Albert_King_-_Born_Under_A_Bad_Sign,g1601074
1,-Albert_King_-_Born_Under_A_Bad_Sign,g1577049
2,-Albert_King_-_Born_Under_A_Bad_Sign,g1575127
3,-Albert_King_-_Born_Under_A_Bad_Sign,g-1872221027
4,-Albert_King_-_Born_Under_A_Bad_Sign,g795292196
...,...,...
18857,-B_B_King_-_Rock_Me_Baby,g1750992
18858,-B_B_King_-_Rock_Me_Baby,g415858330
18859,-B_B_King_-_Rock_Me_Baby,g1606844
18860,-B_B_King_-_Rock_Me_Baby,g1605881


## Graph Build

In [10]:
data = HeteroData()

data['MIDI'].x = node_categories['MIDI']
data['note_group'].x = node_categories['note_group']

data['MIDI', 'has', 'note_group'].edge_index = edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

In [11]:
data.metadata()

(['MIDI', 'note_group'], [('MIDI', 'has', 'note_group')])

In [12]:
data['MIDI']

{'x': ['-B_B_King_-_How_Blue_Can_You_Get', '-Albert_King_-_Born_Under_A_Bad_Sign', '-B_B_King_-_Rock_Me_Baby']}

In [13]:
node_edge_categories = node_categories.copy()
node_edge_categories['node_types'] = list(node_categories.keys())
node_edge_categories['main_edge_types'] = main_edge_types  # Dictionary containing every string that may be found in our Dataframes

In [14]:
names_list = flatten_lol(node_edge_categories.values())


In [15]:
len(names_list) == len(set(names_list))

True

In [16]:
nodes_f.iloc[0][0]

'-Albert_King_-_Born_Under_A_Bad_Sign'

In [17]:
encoder = Encoder(names_list)

In [18]:
nodes_ten = encoder.encode_nodes(nodes_f)

nodes_ten.numpy()

array([[2639, 2706],
       [   0, 2703],
       [   1, 2703],
       ...,
       [2573, 2704],
       [2588, 2704],
       [2601, 2704]])

In [19]:
edges_ten = encoder.encode_edges(edges_f)
edges_ten.numpy()

array([[2639,    0, 2714],
       [2639,    1, 2714],
       [2639,    2, 2714],
       ...,
       [2552, 2614, 2717],
       [2553, 2563, 2717],
       [2553, 2649, 2716]])

In [20]:
edges_ten[:, :2]

tensor([[2639,    0],
        [2639,    1],
        [2639,    2],
        ...,
        [2552, 2614],
        [2553, 2563],
        [2553, 2649]], dtype=torch.int32)

In [21]:
node_type = nodes_f.iloc[:, 1]

# Get the source and target indices from the edges tensor
edge_index = edges_ten[:, :2]

# Get the edge types from the edges tensor
edge_type = edges_f.iloc[:, 2]


In [22]:
hetero_graph = HeteroData(x=nodes_ten, node_type=node_type, edge_index=edge_index, edge_type=edge_type)

In [23]:
print(hetero_graph)

HeteroData(
  x=[2703, 2],
  node_type=0             MIDI
1       note_group
2       note_group
3       note_group
4       note_group
           ...    
2698       program
2699         tempo
2700         pitch
2701         pitch
2702         pitch
Name: node_type, Length: 2703, dtype: object,
  edge_index=[19366, 2],
  edge_type=0              MIDI__has__note_group
1              MIDI__has__note_group
2              MIDI__has__note_group
3              MIDI__has__note_group
4              MIDI__has__note_group
                    ...             
19361      note_group__has__duration
19362    note_group__contains__pitch
19363    note_group__contains__pitch
19364    note_group__contains__pitch
19365      note_group__has__duration
Name: edge_type, Length: 19366, dtype: object
)


# Complete Dataset

In [24]:
C = complete_graph(".\slac\embeddings\edgelist0")

loading edgelists...
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 84093
Edges: 702340


In [25]:
nodes_c = pd.DataFrame((list(C.nodes)), columns=['name'])
edges_c = pd.DataFrame(np.array(list(C.edges)), columns=['source', 'target'])

In [26]:
tic = time.perf_counter()
node_categories_c = node_cat_dict(nodes_c)
toc = time.perf_counter()
print(f"Run the script in {toc - tic:0.4f} secs")

Run the script in 0.1545 secs


In [27]:
node_categories_c.keys()

dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [None]:
%%script false --no-raise-error

tic = time.perf_counter()
nodes_cf, edges_cf = add_types(nodes_c, edges_c, node_categories_c)
toc = time.perf_counter()
print(f"Run the script in {(toc-tic)/60:0.2f} mins")
winsound.Beep(400, 700)

nodes_cf.to_csv('nodes_complete.csv')
edges_cf.to_csv('edges_complete.csv')

In [33]:
nodes_full = pd.read_csv('.\slac\Contents of Slac\\nodes_complete.csv')
edges_full = pd.read_csv('.\slac\Contents of Slac\edges_complete.csv')


In [37]:
full_categories = node_categories_c.copy()
full_categories['node_types'] = list(node_categories_c.keys())
full_categories['main_edge_types'] = main_edge_types  # Dictionary containing every string that may be found in our Dataframes
names_list_full = flatten_lol(full_categories.values())

len(names_list_full) == len(set(names_list_full))

True

In [39]:
tic = time.perf_counter()

encoder_f = Encoder(names_list_full)

nodes_ten_f = encoder_f.encode_nodes(nodes_full)

edges_ten_f = encoder_f.encode_edges(edges_full)

toc = time.perf_counter()
print(f"Run the script in {(toc-tic)/60:0.2f} mins")
winsound.Beep(400, 700)

Run the script in 2.01 mins


In [40]:
#TODO: Check what goes wrong
node_type = nodes_full.iloc[:, 1]

# Get the source and target indices from the edges tensor
edge_index = edges_ten_f[:, :2]

# Get the edge types from the edges tensor
edge_type = edges_full.iloc[:, 2]


In [41]:
full_hetero_graph = HeteroData(x=nodes_ten_f, node_type=node_type, edge_index=edge_index, edge_type=edge_type)

In [42]:
full_hetero_graph

HeteroData(
  x=[84093, 2],
  node_type=0        Blues_-_Modern-B_B_King_-_Rock_Me_Baby
1                                   g1365629521
2              http://purl.org/midi-ld/notes/55
3              http://purl.org/midi-ld/notes/67
4                                         dur:2
                          ...                  
84088                               timesig:9/8
84089                               timesig:2/2
84090                               timesig:8/4
84091                               timesig:1/8
84092                               timesig:6/4
Name: name, Length: 84093, dtype: object,
  edge_index=[702340, 2],
  edge_type=0                                 g1365629521
1                                    g1693331
2                                g-1059250695
3                                    g1694292
4                                g-1130800403
                         ...                 
702335    http://purl.org/midi-ld/programs/30
702336    http://purl.org/midi