In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the raw track table. Later cells rely on columns such as
# 'spotify_id', 'genres', 'name', 'isrc' and 'popularity'.
df = pd.read_csv('1980.csv')

In [None]:
# List the distinct values found in the 'genres' column.
df['genres'].unique()

Check for duplicate values

In [None]:
# Shape before duplicate rows are dropped.
df.shape

In [None]:
# Remove rows that are identical across every column
# (rebinding the name rather than mutating the frame in place).
df = df.drop_duplicates()

In [None]:
# Shape after duplicate rows were dropped.
df.shape

There were a lot of duplicate rows.

Check for null values

In [None]:
# Per-column flag: True where the column contains at least one missing value.
df.isna().any()

There are null values in the Chart Power and isrc columns. The null values for Chart Power are valid, because not every song was in the charts. Let's have a look at the rows where isrc is null:

In [None]:
# Rows whose 'isrc' value is missing.
df.loc[df['isrc'].isna()]

In [None]:
# Metadata columns, kept together with the spotify_id key.
df_categorical = df.loc[
    :,
    [
        'spotify_id', 'genres', 'name', 'artists', 'album',
        'release_date', 'release_date_precision', 'uri', 'isrc',
    ],
]

In [None]:
# Preview the metadata columns and their shape before grouping by spotify_id.
display(df_categorical.head())
df_categorical.shape

In [None]:
# Collapse to one row per spotify_id; every other column becomes the list
# of values seen for that id.
df_categorical = (
    df_categorical
    .groupby('spotify_id')
    .agg(list)
    .reset_index()
)

In [None]:
# Preview the grouped result: each metadata column now holds a list per spotify_id.
display(df_categorical.head())
df_categorical.shape

In [None]:
# Remove the metadata columns; spotify_id and all remaining columns are kept.
df_numerical = df.drop(
    labels=[
        'genres', 'name', 'artists', 'album',
        'release_date', 'release_date_precision', 'uri', 'isrc',
    ],
    axis='columns',
)

In [None]:
# Rows that differed only in the removed metadata columns are now duplicates.
df_numerical = df_numerical.drop_duplicates()

In [None]:
# Preview the remaining columns after de-duplication.
df_numerical.head()

In [None]:
# Shape before averaging per spotify_id.
df_numerical.shape

In [None]:
# Average every remaining column per spotify_id (the id becomes the index).
df_numerical = df_numerical.groupby('spotify_id').agg('mean')

In [None]:
# Shape after averaging per spotify_id (one row per id).
df_numerical.shape

In [None]:
# Turn the spotify_id index back into a regular column so it can be used as a merge key.
df_numerical = df_numerical.reset_index()

In [None]:
# Attach the per-id averaged columns to the list-valued metadata.
df_categorical = pd.merge(
    df_categorical,
    df_numerical,
    on='spotify_id',
    how='left',
)

In [None]:
# Combined table: list-valued metadata plus the per-id averaged columns.
df_categorical

Create a DataFrame which contains only the relevant features for the recommender system

In [None]:
# Keep spotify_id plus the feature columns; drop metadata, chart_power and popularity.
df_recommender = df.drop(
    labels=[
        'artists', 'genres', 'album', 'release_date', 'release_date_precision',
        'chart_power', 'uri', 'popularity', 'name', 'isrc',
    ],
    axis='columns',
)

Drop duplicate values..

In [None]:
# Shape before duplicate rows are dropped.
df_recommender.shape

In [None]:
# Rows that differed only in the removed columns are now duplicates.
df_recommender = df_recommender.drop_duplicates()

In [None]:
# Shape after duplicate rows were dropped.
df_recommender.shape

There were a lot of duplicates.

In [None]:
# Preview the feature columns that remain.
df_recommender.head()

Since all features except the spotify_id are numerical and the spotify_id should be a unique identifier, try grouping by the id to reduce the DataFrame even further.

In [None]:
# Average the features per spotify_id; the id becomes the index.
df_recommender = df_recommender.groupby('spotify_id').mean()

In [None]:
# Preview the per-id means (indexed by spotify_id) and check the resulting shape.
display(df_recommender.head())
df_recommender.shape

The shape stayed the same, so there were no duplicate ids.

Scale the data, so every feature has the same influence.

In [None]:
# Min-max scale every feature column, then wrap the resulting array back into a
# DataFrame that carries the original column names and spotify_id index.
scaler = MinMaxScaler()
df_recommender_scaled = pd.DataFrame(
    scaler.fit_transform(df_recommender),
    index=df_recommender.index,
    columns=df_recommender.columns,
)

In [None]:
# Preview the min-max scaled features.
df_recommender_scaled.head()

In [None]:
# Move spotify_id from the index back into a column on both frames
# so it can serve as a merge key.
df_recommender_scaled = df_recommender_scaled.reset_index()
df_recommender = df_recommender.reset_index()

Convert every genre to a feature. If a song belongs to a genre, the feature should contain the value 1, otherwise 0.

In [None]:
# One row per spotify_id, one column per genre.
# pd.crosstab counts rows, so a (spotify_id, genre) pair that occurs on several
# rows of df would yield a value greater than 1. Clip to 1 so each genre feature
# is a strict indicator (1 = track has the genre, 0 = it does not) and no genre
# is over-weighted in the similarity computation.
ct = (
    pd.crosstab(df['spotify_id'], df['genres'])
    .clip(upper=1)
    .reset_index()
)

In [None]:
# Preview the track-by-genre table and its shape.
display(ct.head())
ct.shape

Merge both DataFrames together to create the Recommender System.

In [None]:
# Join the genre columns with the scaled features on spotify_id.
ct_merged = pd.merge(
    ct,
    df_recommender_scaled,
    how='left',
    on='spotify_id',
)

In [None]:
# Use spotify_id as the index so only feature columns remain as data.
ct_merged = ct_merged.set_index('spotify_id')

In [None]:
# Preview the final feature matrix (genre columns plus scaled features),
# indexed by spotify_id, and check its shape.
display(ct_merged.head())
ct_merged.shape

Overall there are 108 features used for the Recommendation system.

Compute the cosine similarity between each pair of songs. A value close to 1 means that the two songs are nearly identical; a value close to 0 means that they are completely different.

In [None]:
# Pairwise cosine similarity between every pair of rows (tracks) in ct_merged.
cosine_similarity_array = cosine_similarity(ct_merged)

In [None]:
# Shape of the similarity result.
cosine_similarity_array.shape

In [None]:
# Label both axes of the similarity matrix with the spotify_id index of ct_merged.
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_array,
    columns=ct_merged.index,
    index=ct_merged.index,
)

In [None]:
# Show the labelled similarity matrix.
cosine_similarity_df

In [None]:
# The ten rows with the highest 'popularity' value.
df.sort_values(by='popularity', ascending=False).iloc[:10]

In [None]:
def recommend_tracks(track: str, n: int = 10):
    '''
    Display the tracks that are most similar to the provided track.

    Every distinct spotify_id whose ``name`` in ``df`` equals ``track`` is
    looked up in ``cosine_similarity_df``. For each of them the ``n``
    highest-scoring *other* tracks are joined with the metadata in
    ``df_categorical`` and rendered with ``display``.

    Parameter
    ---------
    track: str
        Exact track name as stored in the ``name`` column of ``df``.
    n: int
        Number of recommendations to show per matching track (default 10).

    Returns
    -------
    None
        Results are displayed; nothing is returned. A short message is
        printed when the name is unknown or an id has no similarity scores.
    '''
    # df can contain several rows for the same spotify_id, so deduplicate the
    # ids to avoid rendering the same table more than once.
    track_ids = df.loc[df['name'] == track, 'spotify_id'].unique()
    if len(track_ids) == 0:
        print(f'No track named {track!r} was found.')
        return

    for track_id in track_ids:
        # An id may be absent from the similarity matrix; skip it instead of
        # aborting the whole loop with a KeyError.
        if track_id not in cosine_similarity_df.columns:
            print(f'No similarity scores available for {track_id!r}.')
            continue

        recommender_df = (
            cosine_similarity_df[track_id]
            # The query track always matches itself with the top score;
            # it is not a useful recommendation.
            .drop(labels=track_id, errors='ignore')
            .rename('Score')
            .nlargest(n)
            .reset_index()
            .merge(df_categorical, how='left', on='spotify_id')
        )
        display(recommender_df)


In [None]:
# Example: show recommendations for a track identified by its exact name.
recommend_tracks(track='Back In Black')