# Generating the disruption network
## And calculating the disruption index

One of the most important steps in studying the effect of disruption in songs is actually calculating it based on the similarity matrix we obtained before. That way we use the same list of songs that was used to generate the similarity matrix and use to calculate the disruption score for each song in our dataset. That means that after calculating we can find who were the songs most disruptive in our dataset based on its metadata.

## The code below generates a songs disruption network using:
1. list of songs (in the same order as the similarity matrix)
2. A similarity matrix (ordered by release) so we know that the next song i + 1 is a song released after i
3. A similarity threshold that will determine if there is an edge between nodes

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import collections

from tqdm import tqdm
from networkx.drawing.nx_pylab import draw_networkx

In [2]:
def generate_network(list_of_songs, similarity_matrix, similarity_threshold=0.80):
    """ Generate a disruption network based on: 
    If a song has a similarity with another over the threshold than an edge is made to connect both of them.

    Args:
        1. list of songs (in the same order as the similarity matrix)
        2. A similarity matrix (ordered by release) so we know that the next song i + 1 is a song released after i
        3. A similarity threshold that will determine if there is an edge between nodes
    Returns:
        A network as a networkx.DiGraph Object
    """
    slice_index = 0
    G = nx.DiGraph()

    for i in tqdm(range(len(list_of_songs))):
        edge_count = 0
        
        G.add_node(i)
        
        for j in range(i + 1, len(list_of_songs)):
            # If there is a high similarity, create an edge between the nodes
            if similarity_matrix[i][j] > similarity_threshold:
                G.add_edge(j, i)
                edge_count += 1
        
        # If this node does not have a similarity with any other node, then remove the node
        if edge_count < 1:
            G.remove_node(i)

    return G

def get_disruption_index_for_nodes(list_of_songs, graph):
    """ Compute the actual disruption indexes for the graph based on the nodes(songs) and its
    connections (if its influenced or if it influenced another song) """
    disruption_info = {}

    for i in tqdm(range(len(list_of_songs))):    
        if graph.has_node(i):
            songs_after = range(i + 1, len(list_of_songs))
            song_influences = [edge[1] for edge in graph.edges(i) if edge[1] != i]

            ni = 0
            nj = 0
            nk = 0
            
            for song_after in songs_after:
                consolidating_influence = False
                if graph.has_edge(song_after , i):
                
                    for influence in song_influences:
                        if graph.has_edge(song_after, influence):
                            consolidating_influence = True
                            break
                
                    if consolidating_influence:
                        nj += 1
                    else:
                        ni += 1
                
                else:
                    for influence in song_influences:
                        if graph.has_edge(song_after, influence):
                            nk += 1
    
            disruption_info[list_of_songs.iloc[i]['id']] = [ni, nj, nk, float((ni-nj)) / float((ni+nj+nk))] if (ni + nj + nk) > 0 else [ni, nj, nk, 0]
    
    return disruption_info


## Loading our features, matrix and dataframe
The first step to build the network it is to load the files generated in prior steps

### `Information about the dataset:`
 - The filtered dataset refers removed the songs that their mp3 had no sound.

### `Information about the feature vectors:`
 -  Features can be both calculated using the MFCC or the concatenation of the features from the transfer learning convnet


### `Information about the similarity matrix:`

Similarity matrix can have more or less similarities between songs based on the gamma value
 - Higher means it is more strict
 - Lesser means that songs that are different will be deemed as similar

In [3]:
from pathlib import Path

DATASET_PATH = Path("./dataset")

def load_npy(file_path):
    print(f"Loading: {file_path} ...")
    return np.load(file_path)

def load_features_from_file(file_name, feature_type):
    if feature_type == "mfcc":
        return load_npy(DATASET_PATH / "input" / "feature_vectors" / "mfcc" / file_name)
    elif feature_type == "transfer_learning":
        return load_npy(DATASET_PATH / "input" / "feature_vectors" / "transfer_learning" / file_name)
    else:
        raise TypeError("This feature type is not supported")

def load_similarity_matrix(file_name, feature_type):
    if feature_type == "mfcc":
        return load_npy(DATASET_PATH / "input" / "similarity_matrices" / "mfcc" / file_name)
    elif feature_type == "transfer_learning":
        return load_npy(DATASET_PATH / "input" / "similarity_matrices" / "transfer_learning" / file_name)
    else:
        raise TypeError("This feature type is not supported")

def load_dataframe(dataframe_file):
    return pd.read_csv(DATASET_PATH / "input" / "csvs" / dataframe_file)

feat_type = "transfer_learning"
datset_size = 30000
gamma = 0.1

DF_PATH = f"sorted_song_info_{datset_size}.csv"
FEATS_PATH = f"{feat_type}_feature_vector_{datset_size}_samples.npy"
SIMILARITY_MATRIX_PATH = f"{feat_type}_{datset_size}_samples_{gamma}_gamma.npy"

dataframe = load_dataframe(DF_PATH)
features = load_features_from_file(FEATS_PATH, feat_type)
similarity_matrix = load_similarity_matrix(SIMILARITY_MATRIX_PATH, feat_type)

Loading: dataset/input/feature_vectors/transfer_learning/transfer_learning_feature_vector_30000_samples.npy ...
Loading: dataset/input/similarity_matrices/transfer_learning/transfer_learning_30000_samples_0.1_gamma.npy ...


Now we can call the functions defined above to generate the network and calculate the disruption index!

In [4]:
graph = generate_network(dataframe, similarity_matrix)

  6%|▌         | 1772/30000 [04:08<1:06:02,  7.12it/s]


KeyboardInterrupt: 

# Exporting the graph generated

Here we export the generated graph in a way we can analyse it on Gephi later

In [None]:
disruption_index = get_disruption_index_for_nodes(dataframe, graph)

100%|██████████| 20000/20000 [1:33:05<00:00,  3.58it/s] 


In [None]:
nx.write_gexf(graph, DATASET_PATH / "output" / "graphs" / f"{feat_type}_{len(disruption_index)}_{gamma}.gexf")

Some of 29000+ songs used to build the network had no connection with any other, so they didn't even enter the network:

```
# If this node does not have a similarity with any other node, then remove the node
if edge_count < 1:
    G.remove_node(i)
```

That is why we have only 26091 with a disruption index

In [None]:
len(disruption_index)

16976

# Exporting the disruption index

In [None]:
import pickle

In [None]:
# Store data (serialize)
with open(DATASET_PATH / "output" / "disruption_index" / f'{feat_type}_{len(disruption_index)}_{gamma}.pickle', 'wb') as handle:
    pickle.dump(disruption_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

### We should store as a dataframe too

In [None]:
with open(DATASET_PATH / "output" / "disruption_index" / f'{feat_type}_{len(disruption_index)}_{gamma}.pickle', 'rb') as handle:
    loaded_disruption_index = pickle.load(handle)


### Generating the dataframe with the disruption index

In [None]:
disruption_index_df = pd.DataFrame(loaded_disruption_index).T
disruption_index_df.reset_index(inplace=True)
disruption_index_df.columns = ['id', 'ni', 'nj', 'nk', 'disruption']
disruption_index_df.head()

Unnamed: 0,id,ni,nj,nk,disruption
0,3MEb9LZbB80nQ1a8,20.0,0.0,0.0,1.0
1,1Z7Pb158yANCZ7zN,1.0,0.0,0.0,1.0
2,AHJbjIlp98gVY3Pj,1.0,0.0,0.0,1.0
3,8rCzU7kVpoJ0Z37D,55.0,0.0,0.0,1.0
4,1EhoPstBUguE4Btf,1.0,0.0,0.0,1.0


Joining song info and song disruption datasets 

In [None]:
song_info_with_disruption = pd.merge(disruption_index_df, dataframe, on='id')
song_info_with_disruption.head()

Unnamed: 0.1,id,ni,nj,nk,disruption,Unnamed: 0,index,artist,song,album_name,...,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms,mapping_to_fv_index
0,3MEb9LZbB80nQ1a8,20.0,0.0,0.0,1.0,0,5986,Louis Armstrong,St. James Infirmary,The Complete Hot Five And Hot Seven Recordings...,...,29.0,1928,0.693,0.182,5.0,0.0,0.588,116.508,191867,71045
1,1Z7Pb158yANCZ7zN,1.0,0.0,0.0,1.0,1,2840,Billie Holiday,Georgia On My Mind,Lady Day: The Complete Billie Holiday On Colum...,...,24.0,1933,0.489,0.0571,7.0,0.0,0.273,80.172,198560,91346
2,AHJbjIlp98gVY3Pj,1.0,0.0,0.0,1.0,2,18091,Billie Holiday,Gloomy Sunday,Lady Day: The Complete Billie Holiday On Colum...,...,49.0,1933,0.484,0.0823,7.0,0.0,0.191,127.089,190800,94072
3,8rCzU7kVpoJ0Z37D,55.0,0.0,0.0,1.0,4,15581,Billie Holiday,A Fine Romance,Lady Day: The Complete Billie Holiday On Colum...,...,24.0,1933,0.596,0.182,5.0,1.0,0.711,123.961,171467,108063
4,1EhoPstBUguE4Btf,1.0,0.0,0.0,1.0,6,2259,Fred Astaire,The Way You Look Tonight,The Essential Fred Astaire,...,32.0,1935,0.453,0.159,2.0,1.0,0.18,75.682,188240,49408


In [None]:
song_info_with_disruption.to_csv(DATASET_PATH / "output" / "csv_with_disruption" / f"song_info_with_disruption_{len(song_info_with_disruption)}_feat_{feat_type}_gamma_{gamma}.csv", index=False)