In [1]:
import time
import scipy
import numpy as np
import pandas as pd
import winsound

import networkx as nx
import matplotlib

from sklearn.preprocessing import LabelEncoder
import torch

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import HGTConv, SAGEConv, GATConv, Linear, to_hetero
import torch_geometric.nn.functional as F

from HeteroDataFunctions import Encoder, add_types, complete_graph, flatten_lol, node_cat_dict, midi_type

print(scipy.__version__)
print(matplotlib.__version__)
print(nx.__version__)

1.7.3
3.6.2
2.8.4


In [2]:
# Complete Dataset
G = complete_graph(".\slac\embeddings\\all")

loading edgelists...
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 93553
Edges: 786635


In [3]:
nodes = pd.DataFrame((list(G.nodes)), columns=['name'])
edges = pd.DataFrame(np.array(list(G.edges)), columns=['source', 'target'])

In [4]:
node_categories = node_cat_dict(nodes)
node_categories.keys()

node_cat_dict took 0.18 secs to run


dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [5]:
%%script false --no- raise -error

nodes_df_complete, edges_df_complete = add_types(nodes, edges, node_categories)

winsound.Beep(400, 700)

nodes_df_complete.to_csv('nodes_complete.csv')
edges_df_complete.to_csv('edges_complete.csv')

Couldn't find program: 'false'


In [6]:
nodes_df_complete = pd.read_csv('.\slac\Contents of Slac\\nodes_complete.csv')
edges_df_complete = pd.read_csv('.\slac\Contents of Slac\edges_complete.csv')
print('Done')

Done


In [7]:
edge_types = ["MIDI__has__tempo",
                   "MIDI__in__time_sig",
                   "MIDI__has__program",
                   "MIDI__has__note_group",
                   "note_group__has__velocity",
                   "note_group__has__duration",
                   "note_group__contains__pitch"]

In [8]:
full_categories = node_categories.copy()
full_categories['node_types'] = list(node_categories.keys())
full_categories['edge_types'] = edge_types  # Dictionary containing every string that may be found in our Dataframes
names_list_full = flatten_lol(full_categories.values())

len(names_list_full) == len(set(names_list_full))

True

In [9]:
encoder = Encoder(names_list_full, n_labels=5)

encoder.decode_value(5)

'g1601074'

In [10]:
node_types = set(nodes_df_complete['node_type'])
node_types

{'MIDI',
 'duration',
 'note_group',
 'pitch',
 'program',
 'tempo',
 'time_sig',
 'velocity'}

In [11]:
input_node_dict = {node_type: {'x': encoder.
                    encode_nodes(nodes_df_complete.
                    loc[nodes_df_complete['node_type'] == node_type, ['name']])}
                    for node_type in node_types}

encode_nodes took 0.00 secs to run
encode_nodes took 0.00 secs to run
encode_nodes took 0.00 secs to run
encode_nodes took 0.00 secs to run
encode_nodes took 0.02 secs to run
encode_nodes took 3.22 secs to run
encode_nodes took 0.00 secs to run
encode_nodes took 0.01 secs to run


In [12]:
input_node_dict['note_group']

{'x': tensor([[5.0000e+00],
         [6.0000e+00],
         [7.0000e+00],
         ...,
         [9.2486e+04],
         [9.2487e+04],
         [9.2488e+04]])}

In [13]:
edge_types

['MIDI__has__tempo',
 'MIDI__in__time_sig',
 'MIDI__has__program',
 'MIDI__has__note_group',
 'note_group__has__velocity',
 'note_group__has__duration',
 'note_group__contains__pitch']

In [14]:
edges_df_complete.loc[edges_df_complete['edge_type'] == 'MIDI__has__tempo', ['source', 'target']]

Unnamed: 0,source,target
936,Blues_-_Modern-Albert_King_-_Born_Under_A_Bad_...,11
571698,Blues_-_Modern-B_B_King_-_How_Blue_Can_You_Get,6
658833,Blues_-_Modern-B_B_King_-_Rock_Me_Baby,9
662826,Blues_-_Modern-B_B_King_-_The_Thrill_Is_Gone,9
666395,Blues_-_Modern-Buddy_Guy_-_Don't_Answer_the_Door,5
...,...,...
786125,Rock_-_Metal-Rage_Against_the_Machine_-_Bulls_...,8
786237,Rock_-_Metal-Rage_Against_the_Machine_-_Gueril...,11
786347,Rock_-_Metal-Rage_Against_the_Machine_-_Killin...,12
786441,Rock_-_Metal-Rage_Against_the_Machine_-_Know_Y...,8


In [15]:
input_edge_dict = {edge_type: {'edge_index': encoder.encode_edges(edges_df_complete.loc[
                    edges_df_complete['edge_type'] == edge_type, ['source', 'target']])} for edge_type in edge_types}

encode_edges took 0.02 secs to run
encode_edges took 0.02 secs to run
encode_edges took 0.09 secs to run
encode_edges took 8.85 secs to run
encode_edges took 7.58 secs to run
encode_edges took 5.97 secs to run
encode_edges took 27.85 secs to run


In [16]:
for key in input_edge_dict.keys():
    dim1 = input_edge_dict[key]['edge_index'].shape[0]
    input_edge_dict[key]['edge_index'] = input_edge_dict[key]['edge_index'].reshape(-1, dim1).to(torch.int32)

In [17]:
input_edge_dict

{'MIDI__has__tempo': {'edge_index': tensor([[92898, 93541, 92740, 93557, 92921, 93542, 92729, 93542, 92929, 93540,
           92738, 93537, 92785, 93542, 92726, 93542, 92724, 93540, 92804, 93540,
           92938, 93535, 92728, 93557, 92732, 93536, 92890, 93538, 92754, 93552,
           92744, 93536, 92820, 93553, 92896, 93553, 92839, 93542, 92802, 93541,
           92821, 93553, 92878, 93550, 92872, 93535, 92857, 93551, 92789, 93536,
           92860, 93536, 92767, 93551, 92750, 93542, 92928, 93536, 92861, 93554,
           92704, 93554, 92827, 93554, 92911, 93544, 92903, 93552, 92791, 93536,
           92866, 93544, 92766, 93536, 92859, 93554, 92799, 93542, 92881, 93542,
           92783, 93551, 92855, 93539, 92776, 93554, 92733, 93536, 92755, 93554,
           92706, 93541, 92939, 93536, 92808, 93554, 92907, 93554, 92760, 93554,
           92764, 93554, 92917, 93538, 92902, 93538, 92877, 93554, 92924, 93538,
           92757, 93554, 92748, 93552, 92718, 93552, 92895, 93539, 92876, 9

In [18]:
# Extract the label of each Midi.
midi_val = nodes_df_complete.loc[nodes_df_complete['node_type'] == 'MIDI', ['name']].values
midi_class = [midi_type(s[0]) for s in midi_val]

lb = LabelEncoder()
y = torch.from_numpy(lb.fit_transform(midi_class)).to(torch.float32)

lb.classes_

array(['Blues', 'Classical', 'Jazz', 'Rap', 'Rock'], dtype='<U9')

In [19]:
input_node_dict['MIDI']['y'] = y

In [30]:
H = HeteroData(input_node_dict, **input_edge_dict)

In [31]:
H = T.ToUndirected()(H)
H = T.AddSelfLoops()(H)

In [32]:
H.edge_items()

[(('MIDI', 'has', 'tempo'),
  {'edge_index': tensor([[92898, 93541, 92740, 93557, 92921, 93542, 92729, 93542, 92929, 93540,
           92738, 93537, 92785, 93542, 92726, 93542, 92724, 93540, 92804, 93540,
           92938, 93535, 92728, 93557, 92732, 93536, 92890, 93538, 92754, 93552,
           92744, 93536, 92820, 93553, 92896, 93553, 92839, 93542, 92802, 93541,
           92821, 93553, 92878, 93550, 92872, 93535, 92857, 93551, 92789, 93536,
           92860, 93536, 92767, 93551, 92750, 93542, 92928, 93536, 92861, 93554,
           92704, 93554, 92827, 93554, 92911, 93544, 92903, 93552, 92791, 93536,
           92866, 93544, 92766, 93536, 92859, 93554, 92799, 93542, 92881, 93542,
           92783, 93551, 92855, 93539, 92776, 93554, 92733, 93536, 92755, 93554,
           92706, 93541, 92939, 93536, 92808, 93554, 92907, 93554, 92760, 93554,
           92764, 93554, 92917, 93538, 92902, 93538, 92877, 93554, 92924, 93538,
           92757, 93554, 92748, 93552, 92718, 93552, 92895, 93539,

# GNN

In [33]:
class HGT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads, num_layers):
        super().__init__()

        self.lin_dict = torch.nn.ModuleDict()
        for node_type in H.node_types:
            self.lin_dict[node_type] = Linear(-1, hidden_channels)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HGTConv(hidden_channels, hidden_channels, H.metadata(),
                           num_heads, group='sum')
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for node_type, x in x_dict.items():
            x_dict[node_type] = self.lin_dict[node_type](x).relu_()

        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)


        return self.lin(x_dict['MIDI'])


model = HGT(hidden_channels=64, out_channels=len(set(lb.classes_)),
            num_heads=2, num_layers=2)

In [34]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(H.x_dict, H.edge_index_dict)

ValueError: Found indices in 'edge_index' that are larger than 249 (got 93557). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 250) in your node feature matrix and try again.

In [None]:
optimizer_name = "Adam"
lr = 1e-1
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)


In [None]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(H.x_dict, H.edge_index_dict)
    mask = H['MIDI'].train_mask
    loss = F.cross_entropy(out['MIDI'][mask], H['MIDI'].y[mask])
    loss.backward()
    optimizer.step()
    return float(loss)

In [None]:
train()

 # Old Implementation

In [None]:
# nodes_ten_ = encoder.encode_nodes(nodes_df_complete)
# edges_ten_ = encoder.encode_edges(edges_df_complete)

# node_type_ = nodes_df_complete.iloc[:, 1]

# Get the source and target indices from the edges tensor
# edge_index = edges_ten_[:, :2]

## Get the edge types from the edges tensor
#edge_type_ = edges_df_complete.iloc[:, 2]

#full_hetero_graph = HeteroData(x=nodes_ten_, node_type=node_type_, edge_index=edge_index, edge_type=edge_type_)