In [None]:
import numpy as np
import pandas as pd
from operator import itemgetter
import scipy
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('ggplot')
from mpl_toolkits.axes_grid1 import make_axes_locatable
import os
import networkx as nx
import plotly
# Credentials are read from the environment so that no secret is stored in the
# notebook: set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel.
# NOTE(review): the API key that used to be hard-coded here has been exposed and should be revoked.
plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'], api_key=os.environ['PLOTLY_API_KEY'])
import plotly.plotly as py
import plotly.graph_objs as go

In [None]:
# Display the installed networkx version (the graph API used below differs between releases).
nx.__version__

<img src="FWBF_NTDS_Graphics.png" alt="Alt text that describes the graphic" title="FWBF_NTDS_Graphics" />

**Abstract**:  Music is something universal and has been known for connecting people for a long time. In an age when diversity in the music industry is at its peak, it seems that music has lost its universality and that people's musical tastes vary to a great extent. In this project, we seek to analyse whether music truly connects people and how one's musical taste is reflected in one's friendships. Using network analysis tools, we want to understand whether real friends also have a greater similarity in musical taste, and how this diffuses through their friendships. After this, we use classification techniques to suggest a tool that can be used by artists and music producers to estimate how popular their music track will be before they even release it. We then test our tool on the different clusters of users we defined to see how it performs in different groups. 

***
***

# Part I: The Spotify_Top_100 dataset

In [None]:
def standardize(x):
    """Center every column of ``x`` and scale it to unit standard deviation.

    Parameters
    ----------
    x : array-like
        Data whose first axis indexes the samples.

    Returns
    -------
    The centered data divided, column by column, by its standard deviation.
    A column with zero standard deviation is returned as all zeros instead of
    NaN (0/0), so that a constant feature cannot propagate NaN downstream.
    """
    centered_data = x - np.mean(x, axis=0)
    scale = np.std(centered_data, axis=0)
    # A constant column has std 0; dividing by 1 leaves its centered zeros untouched.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return centered_data / safe_scale

## When you're in need of love, they give you data and affection

The first thing we have to do is to load the data previously collected using the [Spotify API](https://developer.spotify.com/documentation/web-api/) and the [Spotipy](https://github.com/plamere/spotipy) python library. The data is stored in a separate `.csv` file per user, and has to be merged into a single dataframe to ease its use and analysis.

> Note that the data has been gratefully shared by friends of our team. We kept the playlist's names and user as anonymous as possible for the general public, but it needs to be quite transparent for us.

So, let's go:

In [None]:
# Get a list of available playlists (sorted, because os.listdir returns files in
# arbitrary order and the row order decides which duplicate is kept later on)
data_dir = os.path.join(os.getcwd(), 'user_data')
filenames = os.listdir(data_dir)
files_list = sorted(filename for filename in filenames if filename.endswith('csv'))

# Read every playlist and concatenate them once. Re-running the cell simply
# rebuilds `songs`, and pd.concat avoids the quadratic cost of growing a frame
# with DataFrame.append (removed in pandas 2.0).
songs = pd.concat(
    [pd.read_csv(os.path.join(data_dir, item), sep=';') for item in files_list],
    ignore_index=True,
)

# The user label is the playlist name without its last 10 characters
songs['User'] = [str(origin)[:-10] for origin in songs['Playlist Origin']]

# A track present in several playlists is kept once (first occurrence), with a fresh 0..n-1 index
songs_unique = (
    songs
    .drop_duplicates(subset=['Track Id'], keep='first')
    .reset_index(drop=True)
)

# Lookup table: the unique songs without the playlist bookkeeping and audio-feature
# columns (can be looked up using the index or the unique Spotify track ID)
lookup_table = songs_unique.drop(columns=[
    'Unnamed: 0', 'Playlist Origin', 'Playlist order', 'Track Duration MS',
    'Danceability', 'Energy', 'Key', 'Loudness', 'Mode',
    'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
    'Valence', 'Tempo', 'Valence.1',
    'Track_href', 'Time_signature', 'uri',
])
# Create a features array (will be used for building graphs)
# 'Valence.1' is dropped as well: it is removed from the lookup table above, so it
# exists in the data, and keeping it here would put valence in the feature matrix twice.
# NOTE(review): presumably 'Valence.1' is pandas' name for a duplicated 'Valence' CSV header - confirm.
features = songs_unique.drop([
                             'Unnamed: 0',
                             'Playlist Origin',
                             'Playlist order',
                             'Artist',
                             'Track Name',
                             'Album Name',
                             'Track Number',
                             'Track Id',
                             'Track Duration MS',
                             'Valence.1',
                             'Track_href',
                             'Time_signature',
                             'uri','Genres','User'], axis=1)
# Converting release dates to timestamps (need to have type float on all features)
import dateutil.parser
from datetime import datetime
# A fixed `default` makes partial dates (e.g. a year only) deterministic:
# without it dateutil fills the missing month/day from today's date.
timestamps = features['Album Release Date'].apply(
    lambda s: dateutil.parser.parse(s, default=datetime(1900, 1, 1)))
# Explicit conversion to a float epoch value (nanoseconds), instead of relying on
# DataFrame.update(..., raise_conflict=False), whose keyword no longer exists in recent pandas.
features['Album Release Date'] = pd.to_datetime(timestamps).astype('int64').astype(float)
features = features.rename(columns={'Album Release Date': 'Timestamp'})

# Putting it all in numpy array format
np_features = np.array(features).astype(float)
# Remove mean and divide by std. deviation on each feature
np_features = standardize(np_features)

In [None]:
# Persist the merged song table (duplicates included) as a ';'-separated CSV file
file_name = 'spotify_top_100DB/spotify_top_100DB.csv'
songs.to_csv(path_or_buf=file_name, sep=';')

## Descriptive analysis

Now that the data is ready to be analysed, let's look into it to see exactly what is inside and what features might help us make meaningful connections.
The dataframe columns are listed below and correspond to the fields returned by the Spotify API for a specific request.

- **Playlist Origin**: The playlist where the data comes from
- **Playlist order**: The rank of the given row on the original playlist
- **Artist**: The artist corresponding to the given song, track
- **Track Name**: The track name
- **Album Name**: The album name of the given song, track ... 
- **Album Release Date**: ... and its release date
- **Track Number**
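- **Track Popularity**: The popularity of the track as reported by Spotify, a value between 0 and 100, with 100 being the most popular.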
- **Track Id**
- **Track Duration MS**: The duration of the track in milliseconds.

- **Danceability**:Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

- **Energy**:Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

- **Key**: The key the track is in. Integers map to pitches using standard [Pitch Class notation](https://en.wikipedia.org/wiki/Pitch_class). E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on.

- **Loudness**: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing the relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.

- **Mode**: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.

- **Speechiness**: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

- **Acousticness**:A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.

- **Instrumentalness**: Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.

- **Liveness**: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.

- **Tempo**: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

- **Valence**: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

- **Track_href**: A link to the Web API endpoint providing full details of the track.

- **Time_signature**: An estimated overall time signature of a track. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure).

- **uri**: 	The Spotify URI for the track.

- **Genres**: A list of the genres the artist is associated with. For example: "Prog Rock" , "Post-Grunge". (If not yet classified, the array is empty.)

- **User**: The user associated with the playlist (no link with Spotify username in that case)


[Complete description](https://developer.spotify.com/documentation/web-api/reference/object-model/#audio-features-object)

In [None]:
# Count the distinct track names and artist names over all playlists
print('The data frame contains %d different songs and %d diffrent artists'
      % (len(pd.unique(songs['Track Name'])), len(pd.unique(songs['Artist']))))

In [None]:
# One row per distinct user label found in the song dataframe, in a single 'Name' column
users_df = pd.DataFrame({'Name': songs['User'].unique()})

In [None]:
#Extract a single user's data from the song dataframe.
def single_user_dataframe(name):
    """Return ``(user_name, user_dataframe)`` for one user.

    When ``name`` is ``'unknown'`` the list of users is printed and the row
    number typed by the reader on standard input selects the user from
    ``users_df``. The returned frame holds the rows of ``songs`` whose
    'User' column equals the selected name.
    """
    if name == 'unknown':
        print('Which user do you want? Enter corresponding number')
        print(users_df['Name'])
        choice = int(input())
        name = users_df.loc[choice, 'Name']
    return name, songs[songs['User'] == name]

In [None]:
#Rank the tracks of a dataframe by number of occurrences.
def compute_top_tracks(dataframe):
    """Count how often each (track, artist) pair appears in ``dataframe``.

    Returns a frame with columns 'Item' ("<Artist>: <Track Name>") and 'Count'
    (number of non-null 'Danceability' values of the pair), sorted by
    decreasing count and indexed from 0.
    """
    counts = (
        dataframe
        .groupby(['Track Name', 'Artist'])   # one row per unique (track, artist) pair
        .count()
        .add_suffix('_count')
        .reset_index()                       # bring the group keys back as columns
        .sort_values(by=['Danceability_count'], ascending=False)
    )
    counts['All_infos'] = counts['Artist'] + ': ' + counts['Track Name']
    ranking = counts[['All_infos', 'Danceability_count']].reset_index(drop=True)
    ranking.columns = ['Item', 'Count']
    return ranking

In [None]:
#Rank the artists of a dataframe by number of occurrences.
def compute_top_artists(dataframe):
    """Count how often each artist appears in ``dataframe``.

    Returns a frame with columns 'Item' (artist name) and 'Count' (number of
    non-null 'Danceability' values of the artist), sorted by decreasing count
    and indexed from 0.
    """
    counts = (
        dataframe
        .groupby(['Artist'])     # one row per unique artist
        .count()
        .add_suffix('_count')
        .reset_index()           # bring the artist name back as a column
        .sort_values(by=['Danceability_count'], ascending=False)
    )
    ranking = counts[['Artist', 'Danceability_count']].reset_index(drop=True)
    ranking.columns = ['Item', 'Count']
    return ranking

In [None]:
#Plot the top n items of a given dataframe.
def plot_tops(tops,nitems,title,y_axe):
    """Draw a horizontal bar chart of the first ``nitems`` rows of ``tops``.

    Parameters
    ----------
    tops : DataFrame with an 'Item' column (bar labels) and a 'Count' column
        (bar lengths), already sorted in the order to display.
    nitems : maximum number of rows to plot; fewer are drawn when ``tops`` is shorter.
    title : title of the figure.
    y_axe : label of the vertical axis.
    """
    # Never ask for more bars than there are rows.
    chosen_number = min(nitems, len(tops))
    fig, ax = plt.subplots()
    y_pos = np.arange(chosen_number)
    ax.barh(y_pos, tops['Count'].iloc[:chosen_number], align='center', ecolor='black')
    ax.set_yticks(y_pos)
    # Only the labels of the plotted rows: the number of labels must match the number of ticks.
    ax.set_yticklabels(tops['Item'].iloc[:chosen_number])
    ax.invert_yaxis()  # largest count at the top
    ax.set_xlabel('Number of occurences')
    ax.set_ylabel(y_axe)
    ax.set_title(title)
    plt.show()

In [None]:
#Transpose the dataframe to make it fit the plotly radar chart requirements.
def get_transposed(user_dataframe):
    """Return the mean scores of a user's playlist in long form.

    The result has one row per score and two columns: 'Score' (the score name)
    and 'Value' (its mean over the playlist). Every score except
    'Track Popularity' (row 0) is multiplied by 100.

    The score columns are selected before averaging, so the text columns of
    the song table never reach ``mean`` (recent pandas raises a TypeError on them).
    The two-column relabelling implies that ``user_dataframe`` holds a single playlist.
    """
    score_columns = ['Track Popularity','Danceability','Energy','Acousticness',
                     'Instrumentalness','Liveness','Speechiness','Valence']
    per_playlist = user_dataframe.groupby(['Playlist Origin'])[score_columns].mean()
    transposed = per_playlist.T.reset_index()
    transposed.columns = ['Score','Value']
    # Rows 1.. are the audio features; scale them by 100 and leave popularity (row 0) as is.
    transposed.loc[1:, 'Value'] = transposed.loc[1:, 'Value'] * 100
    return transposed

In [None]:
#Construct a radar plot of the percentages
def make_radar_chart(user_dataframe,user_name):
    """Plot the mean scores of one user as a filled plotly radar chart.

    The scores come from ``get_transposed(user_dataframe)``; the figure is sent
    to plotly under the file name ``<user_name>_single_analysis``.
    """
    scores = get_transposed(user_dataframe)
    trace = go.Scatterpolar(
        r=scores['Value'],
        theta=scores['Score'],
        fill='toself',
        name=user_name,
    )
    layout = go.Layout(
        polar={'radialaxis': {'visible': True}},
        showlegend=False,
    )
    py.iplot(go.Figure(data=[trace], layout=layout), filename=user_name + '_single_analysis')

In [None]:
#Print the user summary corresponding to the given user_name and dataframe
def user_summary(user_name,user_dataframe):
    """Print and plot a summary of one user's playlist.

    Prints the number of distinct songs and artists, plots the ten most
    frequent artists, prints the mean of each score and draws the radar chart.

    The means are taken over the numeric score columns only, and over all
    rows of ``user_dataframe``, so each printed value is a plain scalar.
    """
    print('In 2018, the playlist from %s contained %d different songs among %d different artists' % (user_name,len(user_dataframe['Track Name'].unique()),len(user_dataframe['Artist'].unique())))

    user_top_artists = compute_top_artists(user_dataframe)
    plot_tops(user_top_artists,10,user_name+' Top Artists','Name of the artist')

    # (printed label, dataframe column) in display order
    stats = [
        ('Popularity', 'Track Popularity'),
        ('Danceability', 'Danceability'),
        ('Energy', 'Energy'),
        ('Loudness', 'Loudness'),
        ('Speechiness', 'Speechiness'),
        ('Acousticness', 'Acousticness'),
        ('Instrumentalness', 'Instrumentalness'),
        ('Liveness', 'Liveness'),
        ('Tempo', 'Tempo'),
        ('Valence', 'Valence'),
    ]
    mean_features = user_dataframe[[column for _, column in stats]].mean()
    print(user_name,'in stats:')
    for label, column in stats:
        print('\t%s score is : %.2f' % (label, mean_features[column]))

    make_radar_chart(user_dataframe,user_name)
    

In [None]:
# Load some data from a specific user.
# Passing 'unknown' makes single_user_dataframe print the user list and wait on input()
# for a row number, so this cell blocks until the reader types a choice.
user_name,user_dataframe = single_user_dataframe('unknown')

In [None]:
# Load the data of three users whose labels are known in advance.
(JC, JC_dataframe), (AsN, AsN_dataframe), (AxN, AxN_dataframe) = (
    single_user_dataframe(initials) for initials in ('JC', 'AsN', 'AxN')
)

In [None]:
# Overlay the radar charts of the three users (using plotly)
transposed_JC = get_transposed(JC_dataframe)
transposed_AsN = get_transposed(AsN_dataframe)
transposed_AxN = get_transposed(AxN_dataframe)

# One filled polar trace per user, in the order JC, AsN, AxN
data = [
    go.Scatterpolar(r=scores['Value'], theta=scores['Score'], fill='toself', name=label)
    for label, scores in (
        (JC, transposed_JC),
        (AsN, transposed_AsN),
        (AxN, transposed_AxN),
    )
]
layout = go.Layout(polar={'radialaxis': {'visible': True}}, showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename = "Nilsson"+'_test')

In [None]:
# Select the rows of user 'MM' and show the full summary (counts, top artists, means, radar chart)
MM, MM_dataframe = single_user_dataframe(name='MM')
user_summary(user_name=MM, user_dataframe=MM_dataframe)

In [None]:
#Compute the mean and median of the quantitative variables of the dataframe.
# numeric_only=True restricts the aggregation to the numeric columns; without it
# recent pandas raises a TypeError on the text columns (artist, track name, ...).
mean_features_all_users = songs.groupby(['Playlist Origin']).mean(numeric_only=True).reset_index()
med_features_all_users = songs.groupby(['Playlist Origin']).median(numeric_only=True).reset_index()

#Sort the users based on the median of a given criterion.
parameter_tosort = 'Danceability'
sorted_by_med = med_features_all_users.sort_values(by=[parameter_tosort],ascending=False).reset_index()
print('Top 3 users using the parameter %s are:' %(parameter_tosort))
# Label slice 0:2 is inclusive on the freshly reset index, i.e. the first three rows.
print(sorted_by_med.loc[0:2,['Playlist Origin']])

In [None]:
# Rank tracks and artists over the whole dataframe, then plot the 20 most frequent of each
top_tracks = compute_top_tracks(songs)
top_artists = compute_top_artists(songs)

plot_tops(tops=top_tracks, nitems=20, title='2018 Top Tracks', y_axe='Name of the track')
plot_tops(tops=top_artists, nitems=20, title='2018 Top Artists', y_axe='Name of the artist')

## Song adjacency matrix construction

Here we define the importance of the features, and construct an adjacency matrix based on the relative weights of each feature. 

In [None]:
# Feature indices:
# Timestamp (0), Popularity (1), Danceability (2), Energy (3), Key (4), Loudness (5), Mode (6), Speechiness (7)
# Acousticness (8), Instrumentalness (9), Liveness (10), Valence (11), Tempo (12)

# Relative importance of each feature: 1 by default, set explicitly for a few of them
ponderation = np.ones(np_features.shape[1])
ponderation[[2, 4, 8, 9, 10]] = [3, 0, 2, 2, 2]

# The weight vector broadcasts over the rows, scaling each feature column
np_features_pondered = np_features * ponderation

In [None]:
# Display the weight assigned to each feature.
ponderation

Building an adjacency matrix based on the cosine similarity between tracks

In [None]:
# Cosine distance between all pairs of songs, turned into link weights
from scipy.spatial.distance import pdist, squareform

# similarity = 1 - distance; it is scaled by 1.5 and then clamped to [0, 1], so that
# negative similarities give no link and similarities above 2/3 saturate at weight 1
adjacency = np.clip(
    1.5 * (1 - squareform(pdist(np_features_pondered, metric='cosine'))),
    0.0,
    1.0,
)
# Remove self links
np.fill_diagonal(adjacency, 0.0)

In [None]:
# Show the song adjacency matrix as an image (darker = stronger link)
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Adjacency visualization')
im = ax.imshow(adjacency, cmap='YlOrBr')

> **NOTE**: Maybe what could be great would be to use the same concepts as in Milestone 3 (we have the work of the best group on github), on that dataset, spatial embedding and so on?

In [None]:
# Total number of entries of the adjacency matrix (rows x columns, diagonal included)
max_link_no = adjacency.size

We then show some statistics on our music adjacency graph to see how strong and frequent the connections are. 

In [None]:
# Weights of the links that actually exist. The statistics below must be taken on
# these values: the mean of the boolean mask `adjacency != 0` is only the fraction
# of non-zero entries, not the mean weight.
nonzero_links = adjacency[adjacency != 0]

print('Links with non-zero weight : %d per cent' % int(100*np.count_nonzero(adjacency!=0)/max_link_no))
print('Links with weight one : %d per cent' % int(100*np.count_nonzero(adjacency==1)/max_link_no))
print('Mean of non-zero links : %.3f' % np.mean(nonzero_links))
print('Standard deviation of non-zero links : %.3f' % np.std(nonzero_links))

In [None]:
# Sparsify the graph: in every row keep only the 13 largest weights, zero the others
significant_adjacency = adjacency.copy()
for row in significant_adjacency:
    # `row` is a view, so the assignment edits the matrix in place;
    # argsort is ascending, hence everything before the last 13 positions is dropped
    row[np.argsort(row)[:-13]] = 0.0
# Averaging with the transpose makes the matrix symmetric again
significant_adjacency = 0.5 * (significant_adjacency + significant_adjacency.T)

In [None]:
# Build the weighted song graph from the sparsified adjacency matrix.
# from_numpy_array is the supported constructor for a plain ndarray;
# from_numpy_matrix is deprecated and no longer exists in networkx 3.
G_songs_unique = nx.from_numpy_array(significant_adjacency)
G_songs_unique.name = 'Unique songs'
print(nx.info(G_songs_unique))

In [None]:
def graph_visualizer_from_npadj(Graph,title):
    """Draw ``Graph`` with a spring layout, save it to 'test.svg' and show it.

    Parameters
    ----------
    Graph : networkx graph to draw.
    title : title of the figure.
    """
    #Some global parameters to pass in attributes.
    options = {
        'node_color': 'black',
        'node_size': 50,
        'edge_color': 'grey',  # 'line_color' is not a networkx drawing keyword
        'linewidths': 0,
        'width': 0.1,
    }
    #Display the graph with desired layout.
    fig = plt.figure(figsize = (15, 15))
    nx.draw(Graph, pos=nx.spring_layout(Graph),**options)
    plt.title(title)
    # Save before showing: after plt.show() the current figure is empty,
    # so saving afterwards would write a blank file.
    fig.savefig("test.svg")
    plt.show()

In [None]:
# Draw the song graph (empty title).
graph_visualizer_from_npadj(G_songs_unique, '')

***
***

# Part II: The Social Network

## Make it last forever, friendship never ends

Now, we want to construct a network based on real friendship relations. To do so, an adjacency matrix $A$ was constructed offline and imported into the notebook. It contains information about the relationships between the Spotify users. It is a binary matrix that can be read as follows:

- If the users $i$ and $j$ know each other beyond having met only once, $A(i,j)$ is equal to one
- If they don't, $A(i,j)$ is equal to zero

In [None]:
# Import of the user_adjacency_file and processing to make it symmetric. (Only one triangle was filled by hand)
adjacency_df = pd.read_csv('user_adjacency.csv', sep=',').fillna(0)
adjacency_df = adjacency_df.set_index('Unnamed: 0')
adjacency_df.index.names = ['Users']
adjacency_complete = adjacency_df + adjacency_df.T
# Clear the diagonal (no self loop) on an explicit copy of the values and rebuild the
# frame: writing through `.values` only works when pandas happens to return a writable view.
symmetric_values = adjacency_complete.values.copy()
np.fill_diagonal(symmetric_values, 0)
adjacency_complete = pd.DataFrame(symmetric_values,
                                  index=adjacency_complete.index,
                                  columns=adjacency_complete.columns)

In [None]:
## FRIENDSHIP MATRIX DISPLAY ##
# Image of the user-to-user friendship matrix; the colour bar gives the value of each cell.
fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(adjacency_complete, cmap='YlOrBr')

# One tick per user on both axes, labelled with the user name
ax.set_xticks(np.arange(len(adjacency_complete.columns)))
ax.set_yticks(np.arange(len(adjacency_complete.index)))
ax.set_xticklabels(adjacency_complete.columns)
ax.set_yticklabels(adjacency_complete.index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
ax.set_title("Friendship Matrix")

# Colour bar in a narrow axis attached to the right of the image
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
cbar = fig.colorbar(im, cax=cax)

Above is the matrix visualization of the friendship relations. A brown square means that the two users are related; a light square means that they are not.

In [None]:
## GRAPH CONSTRUCTION ##
# Adjacency frame the graph is generated from (here: the symmetric friendship matrix)
to_graph_from = adjacency_complete

# Nodes are the row/column labels of the frame, edges its non-zero cells
F = nx.from_pandas_adjacency(df=to_graph_from)
F.name = 'Friends Graph'
print(nx.info(F))

Among the 23 friends, the average degree is 4.08, meaning that each user is connected to about 4 other people on average. However, as the matrix visualization above shows, the distribution of degrees is wide, with some users being very connected and others having only 1 connection. 

In [None]:
## GRAPH VISUALIZATION ##
#Some global parameters to pass in attributes.
options = {
    'node_color': 'cyan',
    'node_size': 500,
    'linewidths': 0,
    'width': 3
}

#Display the graph with desired layout and labels
plt.figure(figsize = (10, 10))
pos=nx.spring_layout(F) # positions for all nodes
# Each drawing function only receives the keywords it understands: splatting the whole
# dict into all three calls hands node options to the edge/label functions and vice versa.
nx.draw_networkx_nodes(F,pos,node_color=options['node_color'],node_size=options['node_size'],linewidths=options['linewidths'])# nodes
nx.draw_networkx_edges(F,pos,alpha=0.2,edge_color='grey',width=options['width'])# edges
nx.draw_networkx_labels(F,pos,font_size=10,font_family='sans-serif',font_color='k')# labels
plt.title('Friendship Network')
plt.show()

Here is a visual inspection of the friendship network. We can distinguish a center cluster that is surrounded by smaller groups often connected to the main one by one simple user. This reflects the data collection process given playlists were obtained by asking other friends of several members of our group. 

In [None]:
# Betweenness centrality of every user of the friendship graph
betweenness = nx.betweenness_centrality(F)

# Store the score on each node, under the 'betweenness' attribute
nx.set_node_attributes(F, betweenness, 'betweenness')
# (user, score) pairs, highest score first
sorted_betweenness = sorted(betweenness.items(), key=lambda pair: pair[1], reverse=True)

# Print the 5 users with the highest betweenness
for user, bw in sorted_betweenness[0:5]:
    print(user,'has betweeness: %.3f' %bw)

Betweenness centrality of a node v is the sum of the fraction of all-pairs shortest paths that pass through v:

$$c_B(v) =\sum_{s,t \in V} \frac{\sigma(s, t|v)}{\sigma(s, t)}$$

where $V$ is the set of nodes, $\sigma(s, t)$ is the number of shortest $(s, t)$-paths, and $\sigma(s, t|v)$ is the number of those paths passing through some node $v$ other than $s, t$. If $s = t$, $\sigma(s, t) = 1$, and if $v \in \{s, t\}$, $\sigma(s, t|v) = 0$. More in the [Networkx reference page](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.betweenness_centrality.html)

We used this relationship to distinguish the cohesive users, meaning the ones that you should go to if you want to meet the people of the network that you do not know.

In [None]:
#Display the above results directly on the graph
list_nodes =list(F.nodes())
list_nodes.reverse()

plt.figure(figsize = (10, 5))
pos = nx.spring_layout(F)
ec = nx.draw_networkx_edges(F, pos, alpha=0.1)
# Nodes are coloured by their 'betweenness' attribute (in the same order as nodelist).
# `with_labels` is not a draw_networkx_nodes keyword; labels are drawn separately below.
nc = nx.draw_networkx_nodes(F, pos, nodelist=list_nodes, node_color=[F.nodes[n]["betweenness"] for n in list_nodes], alpha=0.8, node_shape = '.',node_size = 2000, cmap=plt.cm.jet)
lc = nx.draw_networkx_labels(F,pos,font_size=10,font_family='sans-serif',font_color='w')# labels
plt.colorbar(nc)
plt.title('Betweeness of the users')
plt.axis('off')
# Save before showing: after plt.show() the current figure is empty and the file would be blank.
plt.savefig('betweeness.png')
plt.show()

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br>The color gradient represents the betweenness of a given node. In that graph, the result is not surprising as JC was the one who originally asked friends for data. LaB also has a high score for the very same reason. The other, bluer nodes are basically end nodes, where the data collection chain stopped.
</font>
</div>

# Part III: The Musical Network

We now want to see how well the real friendship network correlates with the one generated through musical affinities between users. For that, we will need to construct some basic networks to build our further analysis on. 

In [None]:
## NODE DATAFRAME CONSTRUCTION ##
# Each frame lists the distinct names of one kind of node, tagged with a type and a display colour.

# Playlist (user) nodes
playlists = pd.DataFrame(songs['Playlist Origin'].unique())
playlists['Type'], playlists['Color'] = 'Playlist', 'orange'

# Artist nodes
artists = pd.DataFrame(songs['Artist'].unique())
artists['Type'], artists['Color'] = 'Artists', 'black'

# Track nodes
tracks = pd.DataFrame(songs['Track Name'].unique())
tracks['Type'], tracks['Color'] = 'Titles', 'grey'

In [None]:
#Merge the users (playlists), artists and tracks into one single dataframe
# pd.concat replaces the chained DataFrame.append calls (append no longer exists in pandas 2).
nodes = pd.concat([playlists, artists, tracks]).drop_duplicates(keep='first').reset_index(drop=True)
nodes.columns=['Name','Type','Color']
# Index by node name so that nodes['Type'] / nodes['Color'] can be turned into name -> value dicts
nodes = nodes.set_index('Name')

In [None]:
def graph_visualizer(Graph,title):
    """Draw ``Graph`` with a spring layout, colouring each node by its 'Color' attribute.

    Parameters
    ----------
    Graph : networkx graph whose nodes all carry a 'Color' attribute.
    title : title of the figure.
    """
    #Some global parameters to pass in attributes.
    options = {
        # Graph.nodes[...] is the node-attribute accessor used elsewhere in this notebook;
        # the old Graph.node alias is gone from recent networkx releases.
        'node_color': [Graph.nodes[n]['Color'] for n in Graph.nodes()],
        'node_size': 50,
        'edge_color': 'grey',  # 'line_color' is not a networkx drawing keyword
        'linewidths': 0,
        'width': 0.1,
    }
    #Display the graph with desired layout.
    fig = plt.figure(figsize = (15, 15))
    nx.draw(Graph, pos=nx.spring_layout(Graph),**options)
    plt.title(title)
    plt.show()

### User / Artists graph

In [None]:
## GRAPH CONSTRUCTION ##
# Undirected graph with one edge per (playlist, artist) pair found in the song table
G = nx.from_pandas_edgelist(songs, source='Playlist Origin', target='Artist', create_using=nx.Graph())
G.name = 'User/Artists Graph'

# Attach the type and display colour of every node, looked up by node name
nx.set_node_attributes(G, values=nodes['Type'].to_dict(), name='Type')
nx.set_node_attributes(G, values=nodes['Color'].to_dict(), name='Color')

In [None]:
## GRAPH VISUALIZATION ##
# Draw the playlist/artist graph, nodes coloured by their 'Color' attribute.
graph_visualizer(G,'User/Artist Graph')

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br>Here, the orange nodes are the users, and the black ones represent the different artists. We can observe that the vast majority of the artists are present in a single playlist linked to a single user. There are nevertheless some artists that are shared by two or more users. This first insight will be developed later; for now, let's look at the Artist/Track distribution.
</font>
</div>

### Artist / Track Graph

In [None]:
## GRAPH CONSTRUCTION ##
# Graph with one edge per (artist, track name) pair found in the song table
G4 = nx.from_pandas_edgelist(songs, source='Artist', target='Track Name')
G4.name = 'Artists/Tracks Graph'

# Attach the type and display colour of every node, looked up by node name
nx.set_node_attributes(G4, values=nodes['Type'].to_dict(), name='Type')
nx.set_node_attributes(G4, values=nodes['Color'].to_dict(), name='Color')

In [None]:
## GRAPH VISUALIZATION ##
# Draw the artist/track graph, nodes coloured by their 'Color' attribute.
graph_visualizer(G4,'Songs/Artist Graph')

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br>Here, we can observe that the vast majority of the artists appear only once in the Spotify database. Some are present more often than that, as we have previously observed in the descriptive analysis of the dataset. Now, let's put everything together.
</font>
</div>

### User / Artists / Tracks Graph

In [None]:
## EDGE DATAFRAME CONSTRUCTION ##
# The edge dataframe to be passed to the graph constructor.
# rename() returns new frames, so the slices of `songs` are never relabelled in place.

# The links between users and artists
playlist_artists = songs[['Playlist Origin','Artist']].rename(
    columns={'Playlist Origin': 'Start', 'Artist': 'End'})

# The links between artists and tracks
artists_tracks = songs[['Artist','Track Name']].rename(
    columns={'Artist': 'Start', 'Track Name': 'End'})

#Merge everything into a single dataframe
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2)
edges_full = pd.concat([playlist_artists, artists_tracks]).drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
## GRAPH CONSTRUCTION ##
# Graph with one edge per row of the merged edge table (user-artist and artist-track links)
G2 = nx.from_pandas_edgelist(edges_full, source='Start', target='End')
G2.name = 'User/Artists/Tracks Graph'

# Node type and display colour, looked up by node name, to ease the visualization
nx.set_node_attributes(G2, values=nodes['Type'].to_dict(), name='Type')
nx.set_node_attributes(G2, values=nodes['Color'].to_dict(), name='Color')

In [None]:
## GRAPH VISUALIZATION ##
# Draw the complete user/artist/track graph, nodes coloured by their 'Color' attribute.
graph_visualizer(G2,'User/Artist/Tracks Graph')

In [None]:
#Export the graph into a .gexf file so that it can be visualized outside the notebook.
nx.write_gexf(G2,'User_Artist_Tracks.gexf')

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br> Here is a complete visualization of the dataset, where the users (orange) are linked to artists (black), which are in turn linked to songs (grey). This visualization highlights the fact that one tends to listen to more one track per given artist.
</font>
</div>

## Who are the closest friends ?

Based on the User/Artist graph, we will assess the musical affinities between users. This is done by counting the number of shortest paths of length two between users, each of which corresponds to an artist present in both playlists. When no shortest path of length two exists between two users, no link is reported. 

### The similarities dataframe

In [None]:
# Distinct playlist names, in order of first appearance in the song dataframe
playlists_list = songs['Playlist Origin'].unique()
# The user label is the playlist name without its last 10 characters
users = [playlist_name[:-10] for playlist_name in playlists_list]

In [None]:
## SIMILARITIES CALCULATIONS ##
# Initialization of the different parameters to assess
current = 0 #current user number to analyse.
n_playlists = len(playlists_list)
shortest_paths = np.zeros(shape=(n_playlists, n_playlists)) #Length of the shortest paths between users.
similarities = np.zeros(shape=(n_playlists, n_playlists)) #Number of shortest paths of length two between users.

# G is undirected, so every pair is computed once (k > i) and mirrored.
for i in range(n_playlists):
    for k in range(i + 1, n_playlists):
        source, target = playlists_list[i], playlists_list[k]
        try:
            shortest_path = nx.shortest_path_length(G, source=source, target=target)
        except nx.NetworkXNoPath:
            # Users in different connected components: no link, both cells stay at 0.
            continue
        shortest_paths[i, k] = shortest_paths[k, i] = shortest_path
        if shortest_path == 2:
            # Each shortest path of length two goes through one artist shared by both playlists.
            n_shared = sum(1 for _ in nx.all_shortest_paths(G, source=source, target=target))
            similarities[i, k] = similarities[k, i] = n_shared
    current = i+1

In [None]:
# Wrap the similarities array in a dataframe whose rows and columns are both
# labelled with the user initials, for readability.
similarities_df = pd.DataFrame(similarities, index=users, columns=users)

In [None]:
## SIMILARITIES DISPLAY ##
# Heat map of the similarity scores: according to the color bar, the higher the
# score, the closer the two playlists.
fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(similarities_df, cmap='YlOrBr')

# One tick per user on each axis, labelled with that user's initials.
n_rows, n_cols = similarities_df.shape
ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_rows))
ax.set_xticklabels(similarities_df.columns)
ax.set_yticklabels(similarities_df.index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
ax.set_title("Matching Matrix")

# The color bar gets its own narrow axes appended to the right of the heat map.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
cbar = fig.colorbar(im, cax=cax)

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br> The above visualization presents the number of "matches" between users. The darker the color, the more strongly two users are linked. For example, MC and MM are very strongly linked, with around 25 shared artists. In contrast, DL is only weakly related to most people, sharing no more than one or two artists with them.
</font>
</div>

### The Users Graph 

Now that we have a proper adjacency matrix, we can generate a weighted graph based on it. The stronger the link between two users, the stronger the weight of the edge will be. Let's take a look

In [None]:
# Optional filtering: similarity scores below the threshold are replaced by 0 so
# that low connectivities are not taken into account.
threshold = 2
similarities_df_filtered = similarities_df.where(similarities_df >= threshold, 0)

In [None]:
## GRAPH CONSTRUCTION ##
# The unfiltered similarities are used here; assign similarities_df_filtered instead
# to build the graph from the thresholded scores.
to_graph_from = similarities_df #the adjacency to generate the graph from.

# Build the users graph from the adjacency derived from the users/artists graph.
G3 = nx.from_pandas_adjacency(to_graph_from)
G3.name = 'Users Graph'
# Print the networkx summary of the graph.
print(nx.info(G3))

We can observe that the average degree is already higher in that graph. This is probably due to the fact that 100 songs is a number high enough to be somehow related to a single or more song in another playlist.

In [None]:
## GRAPH VISUALIZATION ##
# Global drawing parameters. This dictionary is kept unchanged because later cells
# unpack it as well.
options = {
    'node_color': 'white',
    'node_size': 500,
    'width': [d['weight'] for (u, v, d) in G3.edges(data=True) if d['weight'] > 0]
}

# Edges are drawn in reverse order. Widths and colors are both computed from this
# same list so that every edge gets the style matching its own weight.
list_edges = list(G3.edges())
list_edges.reverse()
edge_weights = [G3.edges[edge]['weight'] for edge in list_edges]

plt.figure(figsize = (10, 10))
pos = nx.spring_layout(G3) # positions for all nodes
nx.draw_networkx_nodes(G3, pos, node_color='white') # nodes
nx.draw_networkx_labels(G3, pos, font_size=10, font_family='sans-serif', font_color='k') # labels
# The edge order is given through `edgelist` and the colormap through `edge_cmap`;
# `nodelist` and `cmap` do not control these for edges.
nx.draw_networkx_edges(
    G3, pos,
    edgelist=list_edges,
    width=edge_weights,
    edge_color=edge_weights,
    edge_cmap=plt.cm.jet,
    alpha=0.5,
) # edges
plt.title('User affinities')
plt.show()

In [None]:
## DEGREE CALCULATIONS ##
# Degree of every user in the Users Graph, as a {user: degree} mapping.
degrees = dict(G3.degree())
# (user, degree) pairs ordered from the most to the least connected user.
sorted_degree = sorted(degrees.items(), key=lambda pair: pair[1], reverse=True)

# Print, for every user, the number of people they are linked to.
for user, degree in sorted_degree:
    print(user,'is linked to', degree, 'people')

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br> Due to the network structure and the correspondingly high degrees, there is visually an abundance of edges. However, the number of links between people varies greatly, from a given count (DL or LaB) to three times as many (MM or MC). This may be explained by the presence, in the playlists of certain users, of several niche artists with respect to trends and popularity figures. In the graph above, the thicker the line, the higher the edge weight, i.e. the more artists the two users share.
</font>
</div>

### Betweenness centrality

`TO DO: Get more info about the algorithms and what does it mean`

In [None]:
# Betweenness centrality of every user in the Users Graph, as a {user: value} mapping.
betweenness = nx.betweenness_centrality(G3)

# Store each value on its node under the 'betweenness' attribute.
nx.set_node_attributes(G3, betweenness, 'betweenness')
# (user, betweenness) pairs ordered from the highest to the lowest value.
sorted_betweenness = sorted(betweenness.items(), key=lambda pair: pair[1], reverse=True)

# Print the five users with the highest betweenness.
for user, bw in sorted_betweenness[:5]:
    print(user,'has betweeness: %.3f' %bw)

In [None]:
# Display the betweenness results directly on the graph.
list_nodes = list(G3.nodes())
list_nodes.reverse()
# Colors are listed in the same (reversed) order as the nodes handed to the drawing call.
node_betweenness = [G3.nodes[n]["betweenness"] for n in list_nodes]

plt.figure(figsize = (10, 5))
pos = nx.spring_layout(G3)
ec = nx.draw_networkx_edges(G3, pos, alpha=0.1)
# `with_labels` is not a parameter of draw_networkx_nodes; the labels are drawn by
# the dedicated draw_networkx_labels call below.
nc = nx.draw_networkx_nodes(
    G3, pos,
    nodelist=list_nodes,
    node_color=node_betweenness,
    alpha=0.8,
    node_shape='.',
    node_size=2000,
    cmap=plt.cm.jet,
)
lc = nx.draw_networkx_labels(G3, pos, font_size=10, font_family='sans-serif', font_color='w') # labels
plt.colorbar(nc)
plt.title('Betweeness of the users')
plt.axis('off')
plt.show()

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
    It is clear that in the social network, JC and LaB are the points of data collection, demonstrated by their higher betweenness than the rest of the graph. The relationships in the music friendship graph however transcend the social connections and are much more varied. LaB in particular is not a point in the music network that leads to others in the group as she is in the social network. 
</font>
</div>

### Users communities

Now we want to do a community analysis, or clustering on the graph. To do so, we use the Louvain method which is based on the optimisation as the algorithm progresses of a quantity called modularity. Modularity is defined as a value between -1 and 1 that measures the density of links inside communities compared to links between communities. For a weighted graph, modularity is defined as:

${\displaystyle Q={\frac {1}{2m}}\sum \limits _{ij}{\bigg [}A_{ij}-{\frac {k_{i}k_{j}}{2m}}{\bigg ]}\delta (c_{i},c_{j})}$

where

- $A_{ij}$ represents the edge weight between nodes $i$ and $j$

- $k_{i}$ and $k_{j}$ are the sums of the weights of the edges attached to nodes $i$ and $j$, respectively

- $2m$ is the sum of all of the edge weights in the graph

- $c_{i}$ and $c_{j}$ are the communities of the nodes

- $\delta$  is a simple delta function.

[Source](https://en.wikipedia.org/wiki/Louvain_Modularity)

In the Louvain method of community detection, small communities are first found by optimizing modularity locally on all nodes; then each small community is grouped into one node, and for each community it is tested whether joining it to a neighboring community yields a better clustering. The process is repeated until no further improvement is made. We will use this tool to detect possible communities in our network.

In [None]:
from community import community_louvain

In [None]:
# Partition the User Graph into communities with the Louvain method.
partition = community_louvain.best_partition(G3)
# Record each user's community id on its node under the "louvain" attribute.
nx.set_node_attributes(G3, partition, "louvain")

In [None]:
# Visualize the partitioning of the Users Graph.
# Nodes and colors are both taken from G3, in the same order, so that every node is
# colored with its own community id.
user_nodes = list(G3.nodes)
community_ids = [G3.nodes[n]["louvain"] for n in user_nodes]

plt.figure(figsize = (10, 5))
pos = nx.spring_layout(G3, k=0.2)
ec = nx.draw_networkx_edges(G3, pos, alpha=0.2)
nc = nx.draw_networkx_nodes(
    G3, pos,
    nodelist=user_nodes,
    node_color=community_ids,
    node_size=500,
    cmap=plt.cm.jet,
)
# Only label-related arguments are passed here; node and edge styling options do
# not apply to labels.
lc = nx.draw_networkx_labels(G3, pos, font_size=10, font_family='sans-serif', font_color='w') # labels
plt.title('Obtained partitioning')
plt.axis('off')
plt.show()

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br> From the figure above, we can see that three major groups are obtained. Knowing the different relationships between users, one can remark that the blue cluster is composed mainly of non-EPFL users, who have few links with the main connected friends from the real friendship adjacency. The green cluster contains the _core_ group of students who are, apart from some exceptions, close friends. The third cluster is composed of foreign students or friends who live in or come from places far away from Switzerland. Differences in music culture or age, or simply the fact that these users have only few relations with the major part of the dataset, can explain why they end up apart in one cluster. It is really interesting to see that, without knowing anything about real-life relations, music tastes can lead to clusters of this kind, which are somewhat close to what we observe in reality.
</font>
</div>

<div class="alert alert-block alert-info"><b>End :</b><br />
<br />
<br />
</div>

To compare the clustering of the music based and social based groups, we now run the louvain method on the social network as well. 

In [None]:
# Partition the graph F into communities with the Louvain method.
partition = community_louvain.best_partition(F)
# Record each node's community id under the "louvain" attribute.
nx.set_node_attributes(F, partition, "louvain")

In [None]:
# Visualize the partitioning of the graph F.
# Nodes and colors are listed in the same order so that each node gets its own community id.
social_nodes = list(F.nodes)
community_ids = [F.nodes[n]["louvain"] for n in social_nodes]

plt.figure(figsize = (10, 5))
pos = nx.spring_layout(F, k=0.2)
ec = nx.draw_networkx_edges(F, pos, alpha=0.2)
nc = nx.draw_networkx_nodes(
    F, pos,
    nodelist=social_nodes,
    node_color=community_ids,
    node_size=500,
    cmap=plt.cm.jet,
)
# Only label-related arguments are passed here; node and edge styling options do
# not apply to labels.
lc = nx.draw_networkx_labels(F, pos, font_size=10, font_family='sans-serif', font_color='w') # labels
plt.title('Obtained partitioning')
plt.axis('off')
plt.show()

In [None]:
# optimize_graph_edit_distance returns a generator of successively improved
# approximations; nothing is computed until it is iterated, so the first
# approximation is pulled explicitly and reported.
ged_upper_bound = next(nx.optimize_graph_edit_distance(F, G3))
print('Approximate graph edit distance (upper bound):', ged_upper_bound)
# Graph edit distance between the two graphs, displayed as the cell output.
nx.graph_edit_distance(F, G3)

## Song Popularity Prediction Using FMA Dataset

For this part of the project, we are going to focus on FMA dataset. In particular, we suggest an approach to predict track 'popularity'. Using this predictive model, artists and music producers will be able to estimate the success of their track before actually uploading it to the platform. It will also help them understand what plays an important role in a track's success.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier  
from IPython.display import display
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from scipy import sparse
import scipy.sparse.linalg
pd.options.display.max_columns = None
import warnings
warnings.filterwarnings('ignore')

In [None]:
##Loading and formatting tracks data

# Read the file without a header row, so the header rows arrive as ordinary data rows.
df=pd.read_csv('tracks.csv', header=None)
# Overwrite three cells of row 1 (the row used as column names below) with explicit names.
df.iloc[1,0]='track_id'
df.iloc[1,8]='album_listens'
df.iloc[1,47]='track_listens'
header = df.iloc[1]
# Keep the rows from position 3 onwards as data, then rename the positional columns
# with the values of row 1.
df=df[3:]
df=df.rename(columns = header)

In [None]:
##Dropping unnecessary features, fixing types

# Columns that are not needed for the rest of the analysis.
unused_columns = [
    'id', 'engineer', 'information', 'tags', 'bio', 'website', 'wikipedia_page',
    'split', 'genres', 'genres_all', 'subset', 'lyricist', 'publisher', 'composer',
    'associated_labels', 'active_year_end', 'producer', 'comments',
]
df = df.drop(unused_columns, axis=1)

# Columns converted to integers, one at a time.
integer_columns = ['track_id', 'tracks', 'bit_rate', 'duration', 'interest',
                   'track_listens', 'album_listens']
for column in integer_columns:
    df[[column]] = df[[column]].astype(int)

In [None]:
##Loading and formatting echonest dataset
# Read without a header row; error_bad_lines=False asks pandas not to raise on malformed lines.
echo=pd.read_csv('echonest.csv', engine = 'python',encoding='utf-8', error_bad_lines=False, header=None)
# Name the first column, use row 2 as the column names and keep the rows from position 4 onwards.
echo.iloc[2,0]='track_id'
header = echo.iloc[2]
echo=echo[4:]
echo=echo.rename(columns = header)
# Restrict the table to the track id and the selected descriptor columns.
echo=echo.loc[:,['track_id', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence',
       'artist_discovery', 'artist_familiarity', 'artist_hotttnesss','song_currency', 'song_hotttnesss']]
# Convert every column to float, then cast track_id back to int.
echo=echo.astype(float)
echo[['track_id']]=echo[['track_id']].astype(int)

In [None]:
##Merging two tables
# Inner join on track_id: only tracks present in both tables are kept.
new_df = pd.merge(df, echo, on='track_id', how='inner')
# Keep only the columns with a numeric dtype.
numeric_dtypes = ['int64', 'float64', 'int32']
df = new_df.select_dtypes(include=numeric_dtypes)

In [None]:
# Show up to 40 columns when a dataframe is displayed.
pd.set_option('display.max_columns', 40)

In [None]:
# Preview the first rows of the merged, numeric-only dataframe.
df.head()

In [None]:
##Calculating feature quantiles
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

##Removing some extreme outliers
# A row is kept only when it is below every one of the fixed limits.
within_limits = (
    (df.track_listens < 20000)
    & (df.duration < 1000)
    & (df.interest < 50000)
    & (df.song_currency < 0.01)
    & (df.album_listens < 80000)
)
df = df[within_limits]


Let's have a look on how the number of track listens is distributed. As it can be seen, tracks having more than 5000 listens are a rarity. Average number of the listens is not more than 1242.

In [None]:
# Histogram of the number of listens per track.
fig, ax = plt.subplots()
ax.hist(df.track_listens)
ax.set_title('Distribution of Track Listens')
ax.set_xlabel('Number of Listens')
ax.set_ylabel('Frequency')

In [None]:
# Summary statistics of the number of listens per track.
listens = df.track_listens
print('Average number of listens:', listens.mean())
print('Maximum number of listens:', listens.max())
print('Minimum number of listens:', listens.min())

Having the number of times a track was listened, we decided to design a classification problem with the objective of predicting a popularity score for a track. 
To that end, we decided to assign 'popularity' scores on a scale of 1 to 5 based on the quantile interval that a track's number of listens falls into. 

In [None]:
# Popularity label: index of the track_listens quantile bin (5 bins), shifted to start at 1.
target = pd.qcut(df.track_listens, 5, labels=False) + 1
# Features: every column except the label source, the identifier and album_listens.
excluded_columns = ['track_listens', 'track_id', 'album_listens']
data = df.drop(excluded_columns, axis=1)
colnames = data.columns

train_X, test_X, train_Y, test_Y = train_test_split(data, target, test_size=0.30, random_state=20)

##Standardizing the data
# The scaler is fitted on the training split only and then applied to both splits.
std_scale = preprocessing.StandardScaler()
train_X = std_scale.fit_transform(train_X)
test_X = std_scale.transform(test_X)

To explore feature importance, we run a Random Forest Classifier on our data. Note that we omit the variable 'album_listens' in our model, as we want the model to be useful for new tracks, for which the number of album listens will not be available.

In [None]:
# Fit a random forest on all features to rank them by importance.
# The seed is fixed so that the ranking is reproducible from one run to the next.
rf = RandomForestClassifier(random_state=20)
rf.fit(train_X, train_Y)
# One row per feature, sorted from the most to the least important.
feature_importances = pd.DataFrame(
    rf.feature_importances_,
    index=colnames,
    columns=['importance'],
).sort_values('importance', ascending=False)
feature_importances.head()

In [None]:
predictions = rf.predict(test_X)
# Absolute distance between the predicted and the true popularity class.
errors = np.abs(predictions - test_Y)
# The same distance expressed as a percentage of the true class value.
mape = 100 * (errors / test_Y)
# NOTE: `accuracy` is 100 minus the mean absolute percentage error computed on the
# class labels; it is not the fraction of correctly classified tracks.
accuracy = 100 - np.mean(mape)
# Display the score, as the other evaluation cells do.
accuracy

In [None]:
##Narrowing down the features
selected_features = ['interest', 'acousticness', 'artist_hotttnesss', 'artist_familiarity']
small_data = df[selected_features]
train_X, test_X, train_Y, test_Y = train_test_split(small_data, target, test_size=0.30, random_state=20)

##Standardizing the data
# The scaler is fitted on the training split only and then applied to both splits.
std_scale = preprocessing.StandardScaler()
train_X = std_scale.fit_transform(train_X)
test_X = std_scale.transform(test_X)

In [None]:
# Random forest on the reduced feature set.
# The seed is fixed so that the reported score is reproducible from one run to the next.
rf = RandomForestClassifier(random_state=20)
rf.fit(train_X, train_Y)
predictions = rf.predict(test_X)
# Absolute distance between the predicted and the true popularity class.
errors = np.abs(predictions - test_Y)
# The same distance expressed as a percentage of the true class value.
mape = 100 * (errors / test_Y)
# NOTE: `accuracy` is 100 minus the mean absolute percentage error computed on the
# class labels; it is not the fraction of correctly classified tracks.
accuracy = 100 - np.mean(mape)
accuracy

In [None]:
# Misclassification rate on the test split for every K value from 1 to 9.
error = []
for n_neighbors in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(train_X, train_Y)
    pred_i = knn.predict(test_X)
    error.append(np.mean(pred_i != test_Y))

In [None]:
# Error rate of the k-NN classifier as a function of K.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(1, 10), error, color='red', linestyle='dashed', marker='o',
        markerfacecolor='blue', markersize=10)
ax.set_title('Error Rate K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('Mean Error')

In [None]:

# k-NN classifier with 6 neighbours, fitted on the training split.
classifier = KNeighborsClassifier(n_neighbors=6).fit(train_X, train_Y)
y_pred = classifier.predict(test_X)

# Confusion matrix followed by the per-class classification report on the test split.
for report in (confusion_matrix(test_Y, y_pred), classification_report(test_Y, y_pred)):
    print(report)

In [None]:
# Absolute distance between the predicted and the true popularity class.
errors = np.absolute(y_pred - test_Y)
# The same distance expressed as a percentage of the true class value.
mape = (errors / test_Y) * 100
# 100 minus the mean of these percentage errors.
accuracy = 100 - np.mean(mape)
accuracy

Although KNN has decent performance, it did not manage to perform better than the Random Forest Classifier.

### Visualization of the FMA Data 

In [None]:
# Keep a 10% random sample of the reduced feature table and of the labels.
# Both calls use the same fraction and the same random_state.
# NOTE(review): this cell overwrites its own inputs, so running it a second time
# samples 10% of the already reduced data.
small_data= small_data.sample(frac=0.1, random_state=1)
target = target.sample(frac=0.1, random_state=1)

In [None]:
# Build a weighted adjacency matrix between the sampled tracks from their feature vectors.
from scipy.spatial.distance import pdist, squareform

test = small_data
# Pairwise Euclidean distances between the rows of small_data, as a square matrix.
distances = squareform(pdist(test, metric='euclidean'))
# The mean pairwise distance is used as the kernel width.
kernel_width = distances.mean()
# Gaussian kernel exp(-d^2 / sigma^2). Distance and width must be squared (`**2`);
# doubling both (`*2`) cancels out and leaves a plain exp(-d / sigma).
adjacency = np.exp(-distances**2 / kernel_width**2)

# Remove self links
np.fill_diagonal(adjacency, 0.0)

In [None]:
# Histogram of all the adjacency weights, with a logarithmic count axis.
fig, ax = plt.subplots()
_ = ax.hist(adjacency.ravel(), bins=200)
_ = ax.set_yscale('log')

In [None]:
# Cast the adjacency matrix to float32. The result must be assigned back to
# `adjacency`; a misspelled target name would leave the matrix unchanged.
adjacency = adjacency.astype(np.float32)

In [None]:
# Store the adjacency matrix as a float32 CSR sparse matrix.
adjacency_sparse = sparse.csr_matrix(adjacency, dtype=np.float32)

In [None]:
# Check the dtype of the sparse adjacency matrix.
adjacency_sparse.dtype

In [None]:
# From the adjacency to graph, visualize clusters
P = nx.from_scipy_sparse_matrix(adjacency_sparse)
P.name = 'FMA_Graph'
print(nx.info(P))

In [None]:
from pygsp import plotting
import numpy as np
from scipy import sparse, spatial
import pandas as pd
import matplotlib.pyplot as plt
from pygsp import graphs, filters, plotting
import scipy as sp


In [None]:
# Build a pygsp graph from the sparse adjacency matrix.
# NOTE(review): this rebinds `G`, which an earlier cell uses as the networkx graph
# passed to the shortest-path computations; re-running that cell after this one
# would operate on the pygsp graph instead.
G = graphs.Graph(adjacency_sparse)

In [None]:
# Compute node coordinates with the 'spring' layout and plot the graph.
G.set_coordinates(kind='spring')
G.plot()

In [None]:
# Eigendecomposition of the dense graph Laplacian G.L: eigenvalues in eig_val, eigenvectors in U.
eig_val, U = sp.linalg.eigh(G.L.toarray())

In [None]:
# Number of eigenvalues obtained.
eig_val.shape

In [None]:
# Histogram of the Laplacian eigenvalues.
_ = plt.hist(eig_val, bins=200)

## CNN Prediction

The following code was adapted from the usage notebook in the GitHub repository of "Convolutional Neural Networks on Graphs with Fast Localized Spectral Filtering". The adjacency matrix built from the reduced features of the FMA dataset was used.
https://github.com/mdeff/cnn_graph


In [None]:
from lib import models, graph, coarsening, utils

In [None]:
# Coarsen the adjacency matrix over 3 levels, without self connections.
# NOTE(review): this rebinds `graphs`, which an earlier cell imports from pygsp;
# cells calling graphs.Graph(...) need that import to be re-run after this cell.
graphs, perm = coarsening.coarsen(adjacency_sparse, levels=3, self_connections=False)

In [None]:
# Print the dtype of each element returned by the coarsening step.
for coarsened_adjacency in graphs:
    print(coarsened_adjacency.dtype)

In [None]:
# Cast the features and the labels to float32.
small_data= small_data.astype(np.float32)
target= target.astype(np.float32)

In [None]:
# Turn the labels into a plain integer numpy array.
target = target.values.astype(int)

In [None]:
# Hold out 40% of the sampled data; the next cell splits this held-out part further.
train_X, test_X, train_Y,test_Y = train_test_split(small_data, target, test_size=0.40, random_state=20)

In [None]:
# Split the held-out part in half: one half for validation, the other for testing.
val_X, test_X, val_Y,test_Y = train_test_split(test_X, test_Y, test_size=0.50, random_state=20)

In [None]:
# Replace each feature split by its underlying values array.
train_X = train_X.values
val_X = val_X.values
test_X = test_X.values

In [None]:
# Number of training samples, reused when setting the learning-rate decay steps.
n_train = len(train_X)
# Display the size of all three splits at once; a bare expression that is not the
# last line of a cell is evaluated but never shown.
n_train, len(val_X), len(test_X)

In [None]:
# Apply the permutation returned by the coarsening step to each data split.
# NOTE(review): `perm` comes from coarsening a graph whose nodes are the sampled
# tracks (the adjacency is built from pairwise distances between rows of small_data),
# while these matrices hold one row per track and one column per feature. Confirm
# against the lib documentation which axis perm_data expects the graph nodes on.
train_X = coarsening.perm_data(train_X, perm)
val_X = coarsening.perm_data(val_X, perm)
test_X = coarsening.perm_data(test_X, perm)

In [None]:
# Cast the three permuted splits to float32.
train_X, val_X, test_X = [
    split.astype(np.float32) for split in (train_X, val_X, test_X)
]

In [None]:
# One normalized Laplacian per element returned by the coarsening step, then plot their spectra.
L = [graph.laplacian(coarsened, normalized=True) for coarsened in graphs]
graph.plot_spectrum(L)

In [None]:
# Hyper-parameters handed to the graph CNN model.
params = dict()
params['dir_name']       = 'demo'
params['num_epochs']     = 40
params['batch_size']     = 100
params['eval_frequency'] = 200

# Building blocks.
params['filter']         = 'chebyshev5'
params['brelu']          = 'b1relu'
params['pool']           = 'apool1'

# Number of classes.
# The largest label is looked up across all three splits: a small test split is not
# guaranteed to contain the highest class, which would undersize the output layer.
C = int(max(train_Y.max(), val_Y.max(), test_Y.max())) + 1

# Architecture.
params['F']              = [32, 64]  # Number of graph convolutional filters.
params['K']              = [20, 20]  # Polynomial orders.
params['p']              = [4, 2]    # Pooling sizes.
params['M']              = [512, C]  # Output dimensionality of fully connected layers.

# Optimization.
params['regularization'] = 5e-4
params['dropout']        = 1
params['learning_rate']  = 1e-5
params['decay_rate']     = 0.95
params['momentum']       = 0.9
params['decay_steps']    = n_train / params['batch_size']

In [None]:
# Build the graph CNN from the Laplacians and the hyper-parameters, then train it
# using the validation split for evaluation.
# NOTE(review): `accuracy` is rebound here; earlier cells use this name for the
# random forest and k-NN scores.
model = models.cgcnn(L, **params)
accuracy, loss, t_step = model.fit(train_X, train_Y, val_X, val_Y)

In [None]:
# Validation accuracy (left axis, blue) and training loss (right axis, green).
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.plot(accuracy, color='b', marker='.', linestyle='-')
ax1.set_ylabel('validation accuracy', color='b')

# Second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(loss, color='g', marker='.', linestyle='-')
ax2.set_ylabel('training loss', color='g')
plt.show()

In [None]:
# Evaluate the trained model on the test split and print the first element of the result.
res = model.evaluate(test_X, test_Y)
print(res[0])

<div class="alert alert-block alert-warning">
<font color='#B8860B'>
<b>Interpretation</b>
</font>
<font color='black'>
<br> We do not get exceptionally good results using the CNN of the Spectral Filtering. We have used a reduced dataset due to the computationally intense nature of this function and the creation of the adjacency matrix. 
With more computational power, more features could be added, and we could increase the number of samples used. 
At present the CNN is running on only 4 features (`interest`, `acousticness`, `artist_hotttnesss` and `artist_familiarity`), and 10% of the possible samples.
</font>
</div>