In [1]:
import os
from typing import Optional
import timeit
import itertools

import scipy
import numpy as np
import pandas as pd

import networkx as nx
import matplotlib

from sklearn.preprocessing import OneHotEncoder

import torch
from torch_geometric.data import HeteroData


print(scipy.__version__)
print(matplotlib.__version__)
print(nx.__version__)

1.7.3
3.6.2
2.8.4


In [2]:
def complete_graph(input_path) -> nx.Graph:
    """
    Compile all edgelists in input_path directory into a nx.Graph instance.

    :param input_path: Directory containing the edgelists to compile.
    :return: Complete graph specified by the edgelists.
    """
    edgelists = [qf for qf in os.listdir(input_path)
                 if qf.endswith('.edgelist') and not qf.startswith('_')]
    g = None

    print('loading edgelists...')
    for eg in edgelists:
        print('- ' + eg)
        h = nx.read_edgelist(os.path.join(input_path, eg), nodetype=str, create_using=nx.DiGraph(), delimiter=' ')
        for edge in h.edges():
            h[edge[0]][edge[1]]['weight'] = 1

        g = h if g is None else nx.compose(g, h)

    g = g.to_undirected()

    print('Nodes: %d' % nx.number_of_nodes(g))
    print('Edges: %d' % nx.number_of_edges(g))
    return g


In [3]:
G = complete_graph(".\slac\embeddings\Test Edgelists")

loading edgelists...
- full_edgelist.edgelist
- notes.edgelist
- program.edgelist
- tempo.edgelist
- time.signature.edgelist
Nodes: 2703
Edges: 19366


In [4]:
nodes = pd.DataFrame((list(G.nodes)), columns=['name'])
edges = pd.DataFrame(np.array(list(G.edges)), columns=['source', 'target'])

In [5]:
note_groups = [n for n in nodes['name'] if n[0] == 'g' and n[1] in [str(i) for i in range(10)] + ['-'] ]

not_group_nodes = [n for n in nodes['name'] if n not in note_groups]

url = [n for n in not_group_nodes if n[:4] == 'http']
program_nodes = []
note_nodes = []
for u in url:
    if "programs" in u:
        program_nodes.append(u)
    elif "notes" in u:
        note_nodes.append(u)
    else:
        print(u)

name_nodes = [n for n in not_group_nodes if n[0] == '-']
dur_nodes = [n for n in not_group_nodes if n[:3] == 'dur']
vel_nodes = [n for n in not_group_nodes if n[:3] == 'vel']
time_nodes = [n for n in not_group_nodes if n[:4] == 'time']
tempo_nodes = [n for n in not_group_nodes if n not in set(dur_nodes).union(vel_nodes, time_nodes, name_nodes, url)]

tempo_nodes

['11', '6', '9']

In [6]:
node_categories = {"note_group": note_groups,
                    "pitch": note_nodes,
                    "program": program_nodes,
                    "MIDI": name_nodes,
                    "duration": dur_nodes,
                    "velocity": vel_nodes,
                    "time_sig": time_nodes,
                    "tempo": tempo_nodes
                   }

node_categories.keys()

dict_keys(['note_group', 'pitch', 'program', 'MIDI', 'duration', 'velocity', 'time_sig', 'tempo'])

In [7]:
main_edge_types = ["MIDI__has__tempo",
                   "MIDI__in__time_sig",
                   "MIDI__has__program",
                   "MIDI__has__note_group",
                   "note_group__has__velocity",
                   "note_group__has__duration",
                   "note_group__contains__pitch"]



In [8]:
def reverse_edge(df: pd.DataFrame, row: int, inplace: bool = False) -> Optional[pd.DataFrame]:
    """Reverse the source and target of a single edge(row) in the edge dataframe."""
    if inplace:
        df.iloc[row]['source'], df.iloc[row]['target'] = df.iloc[row]['target'], df.iloc[row]['source']
        return None
    elif not inplace:
        tmp = df.copy()
        tmp.iloc[row]['source'], tmp.iloc[row]['target'] = tmp.iloc[row]['target'], tmp.iloc[row]['source']
        return tmp


def format_edge_name(source: str, target: str) -> str:
    """Combine source and target names in the correct form."""
    edge_name = ""

    if source == "MIDI":
        if target == "tempo":
            edge_name = source + "__has__" + target
        elif target == "time_sig":
            edge_name = source + "__in__" + target
        elif target == "program":
            edge_name = source + "__has__" + target
        elif target == "note_group":
            edge_name = source + "__has__" + target
    elif source == "note_group":
        if target == "velocity":
            edge_name = source + "__has__" + target
        elif target == "duration":
            edge_name = source + "__has__" + target
        elif target == "pitch":
            edge_name = source + "__contains__" + target
    else:
        edge_name = source + "__?__" + target
        print("Not known edge detected: " + edge_name)
        return edge_name

    return edge_name


In [9]:
def add_node_type(nodes_df: pd.DataFrame, node_cat: dict) -> pd.DataFrame:
    """
    Return input node Dataframe with a new column named "node_type", which specifies the type of the node.

    :param nodes_df: Dataframe containing the original node Dataframe (without type column).
    :param node_cat: Dictionary with keys: node names, values: nodes of specified category.
    :return: Node Dataframe with the new "node_type" column.
    """
    node_type = []
    augmented_nodes_df = nodes_df.copy()
    for i in range(len(nodes_df.index)):
        for key in node_cat.keys():
            if nodes.iloc[i]['name'] in node_cat[key]:
                node_type.append(key)
    augmented_nodes_df['node_type'] = node_type
    
    return augmented_nodes_df


def add_edge_type(edges_df: pd.DataFrame, node_cat: dict) -> pd.DataFrame:
    """
    Return input edge Dataframe with a new column named "edge_type", which specifies the type of the edge.

    :param edges_df: Dataframe containing the original edge Dataframe (without type column).
    :param node_cat: Dictionary with keys: node names, values: nodes of specified category.
    :return: Edge Dataframe with the new "edge_type" column.
    """
    edge_type = []

    edge_name_source = ""
    edge_name_target = ""

    augmented_edges_df = edges_df.copy()

    for i in range(len(edges_df.index)):
        for name in node_cat.keys():
            if edges_df.iloc[i]['source'] in node_cat[name]:
                edge_name_source = name
                break
        for name in node_cat.keys():
            if edges_df.iloc[i]['target'] in node_cat[name]:
                edge_name_target = name
                break

        if (edge_name_source not in ("MIDI", "note_group")) or (edge_name_source == "note_group" and edge_name_target == "MIDI"):
            reverse_edge(augmented_edges_df, row=i, inplace=True)
            edge_name_source, edge_name_target = edge_name_target, edge_name_source

        edge_name = format_edge_name(edge_name_source, edge_name_target)
        edge_type.append(edge_name)

    augmented_edges_df['edge_type'] = edge_type
    return augmented_edges_df


In [10]:
def add_types(nodes_df: pd.DataFrame, edges_df: pd.DataFrame, node_cat: dict) -> (pd.DataFrame, pd.DataFrame):
    """Execute add_node_type and add_edge_type, and return a tuple of the new Dataframes."""
    return add_node_type(nodes_df, node_cat), add_edge_type(edges_df, node_cat)


In [11]:
nodes_f, edges_f = add_types(nodes, edges, node_categories)


In [20]:
edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

Unnamed: 0,source,target
0,-Albert_King_-_Born_Under_A_Bad_Sign,g1601074
1,-Albert_King_-_Born_Under_A_Bad_Sign,g1577049
2,-Albert_King_-_Born_Under_A_Bad_Sign,g1575127
3,-Albert_King_-_Born_Under_A_Bad_Sign,g-1872221027
4,-Albert_King_-_Born_Under_A_Bad_Sign,g795292196
...,...,...
18857,-B_B_King_-_Rock_Me_Baby,g1750992
18858,-B_B_King_-_Rock_Me_Baby,g415858330
18859,-B_B_King_-_Rock_Me_Baby,g1606844
18860,-B_B_King_-_Rock_Me_Baby,g1605881


In [36]:
edges_f.columns

Index(['source', 'target', 'edge_type'], dtype='object')

# Graph Build

In [21]:
data = HeteroData()

data['MIDI'].x = node_categories['MIDI']
data['note_group'].x = node_categories['note_group']

data['MIDI', 'has', 'note_group'].edge_index = edges_f.loc[edges_f['edge_type'] == 'MIDI__has__note_group', ['source', 'target']]

In [22]:
data.metadata()

(['MIDI', 'note_group'], [('MIDI', 'has', 'note_group')])

In [24]:
data['MIDI']

{'x': ['-Albert_King_-_Born_Under_A_Bad_Sign', '-B_B_King_-_How_Blue_Can_You_Get', '-B_B_King_-_Rock_Me_Baby']}

In [26]:
node_edge_categories = node_categories.copy()
node_edge_categories['node_types'] = list(node_categories.keys())
node_edge_categories['main_edge_types'] = main_edge_types  # Dictionary containing every string that may be found in our Dataframes

In [27]:
def flatten_lol(lol: list) -> list:
    """Flatten list of lists (lol)."""
    return list(itertools.chain( *list( lol ) ) )


names_list = flatten_lol(node_edge_categories.values())


In [28]:
len(names_list) == len(set(names_list))

True

In [52]:
nodes_f.iloc[0][0]

'-Albert_King_-_Born_Under_A_Bad_Sign'

In [79]:
#TODO: Make class for easier use
class Encoder:
    def __init__(self, str_list: list):
        self.mapping = {string: i for i, string in enumerate(str_list)}

    def encode_nodes(self, df: pd.DataFrame) -> torch.Tensor:
        out = torch.zeros([len(df.index),2], dtype=torch.int32)

        for i in range(len(df.index)):
            out[i,0], out[i,1] = self.mapping[df.iloc[i]['name']], self.mapping[df.iloc[i]['node_type']]
        return out

    def encode_edges(self, df: pd.DataFrame) -> torch.Tensor:
        out = torch.zeros([len(df.index),3], dtype=torch.int32)

        for i in range(len(df.index)):
            out[i,0], out[i,1], out[i,2] = self.mapping[df.iloc[i]['source']], self.mapping[df.iloc[i]['target']], self.mapping[df.iloc[i]['edge_type']]
        return out


    def decode_value(self, value: int) -> str:
        return list(self.mapping.keys())[list(self.mapping.values()).index(value)]

    def decode_df(self, ten: torch.Tensor):
        out = pd.DataFrame(index=range(ten.size(0)),columns=range(ten.size(1)))
        if ten.size(1) == 2:
            for i in range(len(out.index)):
                out.iloc[i][0], out.iloc[i][1]  = self.decode_value(ten[i][0].item()), self.decode_value(ten[i][1].item())
        elif ten.size(1) == 3:
            for i in range(len(out.index)):
                out.iloc[i][0], out.iloc[i][1], out.iloc[i][2]  = \
                    self.decode_value(ten[i][0].item()), self.decode_value(ten[i][1].item()), self.decode_value(ten[i][2].item())

        return out




In [61]:
encoder = Encoder(names_list)

In [73]:
nodes_ten = encoder.encode_nodes(nodes_f)
nodes_ten.numpy()

array([[2638, 2706],
       [   0, 2703],
       [   1, 2703],
       ...,
       [2622, 2704],
       [2623, 2704],
       [2624, 2704]])

In [102]:
# Convert the nodes dataframe to a PyTorch tensor
nodes_ten = torch.tensor(nodes_f.values)

# Convert the edges dataframe to a PyTorch tensor
edges_ten = torch.tensor(edges_f.values)

# Get the source and target indices from the edges tensor
edge_index = edges[:, :2]

# Get the edge types from the edges tensor
edge_type = edges[:, 2]


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

# Old Functions; Not used.

In [84]:
def add_edge_type2(edges_pd: pd.DataFrame, node_cat: dict) -> pd.DataFrame:
    """Not working correctly (making more types than wanted); Do not use"""
    edge_type = []

    edge_name_source = None
    edge_name_target = None

    augmented_edges_pd = edges_pd.copy()

    for i in range(len(edges.index)):
        for name in node_cat.keys():
            if edges.iloc[i]['source'] in node_cat[name]:
                edge_name_source = name + "__"
                break
        for name in node_cat.keys():
            if edges.iloc[i]['target'] in node_cat[name]:
                edge_name_target = name
                break

        edge_name = edge_name_source + edge_name_target
        edge_type.append(edge_name)

    augmented_edges_pd['edge_type'] = edge_type
    return augmented_edges_pd


def add_types2(nodes_df: pd.DataFrame, edges_df: pd.DataFrame, node_cat: dict) -> (pd.DataFrame, pd.DataFrame):
    """Too long to execute; do not use."""
    edge_type = []

    augmented_nodes_df = add_node_type(nodes_df, node_cat)  # Node dataframe with the categories column
    augmented_edges_df = edges_df.copy() # Edge dataframe with the categories column

    for i in range(len(edges_df.index)):
        source = edges_df.iloc[i]['source']
        edge_name_source = augmented_nodes_df.loc[augmented_nodes_df['name'] == source, ['node_type']].iloc[0]['node_type']

        target = edges_df.iloc[i]['target']
        edge_name_target = augmented_nodes_df.loc[augmented_nodes_df['name'] == target, ['node_type']].iloc[0]['node_type']

        if (edge_name_source not in ("MIDI", "note_group")) or (edge_name_source == "note_group" and edge_name_target == "MIDI"):
            reverse_edge(augmented_edges_df, row=i, inplace=True)
            edge_name_source, edge_name_target = edge_name_target, edge_name_source

        edge_name = format_edge_name(edge_name_source, edge_name_target)
        edge_type.append(edge_name)

    augmented_edges_df['edge_type'] = edge_type
    return augmented_nodes_df, augmented_edges_df


In [None]:
edges_test = add_edge_type2(edges, node_categories)

edges_test.loc[edges_test['edge_type'] == 'MIDI__program', ['source', 'target']]

In [89]:
print("Double iteration function: ", timeit.timeit(lambda: add_types(nodes, edges, node_categories), number=1))
print("Nodes and check function: ", timeit.timeit(lambda: add_types2(nodes, edges, node_categories), number=1))


Double iteration function:  3.470187100000203
Nodes and check function:  21.959933399999954
