# Spotify Recommendation Engine

https://www.kaggle.com/leonardopena/top50spotify2019


In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [3]:
df = pd.read_csv("top50.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
0,1,Se�orita,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,79
1,2,China,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,92
2,3,boyfriend (with Social House),Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
3,4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
4,5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,94


In [5]:
class SpotifyRecommendation(object):
    def __init__(self):
        self.df = pd.read_csv("top50.csv",encoding = "ISO-8859-1")
        self.numerical_df = None
        self.results = dict()
        
        
    def columns_renaming(self):
        self.df.columns = [i.lower().replace(".", "") for i in self.df]
        
    def cleaning_str_col(self):
        for i in ['trackname', 'artistname', 'genre']:
            self.df[i] = self.df[i].str.lower()
            
    def removed_unwanted_col(self):
        self.df.drop(columns=['unnamed: 0'],inplace=True,axis=1)
    
    def normalized_algo(self,col):
        max_d = self.df[col].max()
        min_d = self.df[col].min()
        return (self.df[col] - min_d)/(max_d - min_d)

    def normalizing_col(self):
        df_numerical = self.df.select_dtypes(include=['int64'])
        self.numerical_df =df_numerical
        for col in df_numerical.columns:
            self.df[col] = self.normalized_algo(col)
            
    def cluster_creation(self):
        km = KMeans(n_clusters=15)
        pred = km.fit_predict(self.numerical_df)
        self.df['cluster_val'] = pred
        self.df['cluster_val'] = self.normalized_algo('cluster_val')
        
    def fit(self):
        
        total_songs = list(self.df['trackname'].unique())
        for song_name in total_songs:
            distances = []
            song_res = self.df[(self.df["trackname"]== song_name)].head(1).values[0]
            rem_data = self.df[self.df['trackname']!= song_name]
            for r_song in rem_data.values:
                dist = 0
                for idx, col in enumerate(rem_data.columns):
                    #indeces of non-numerical columns(id,Release date,name,artists)
                    if not col in ["trackname","artistname","genre"]:
                        #calculating the manhettan distances for each numerical feature
                        dist = dist + np.absolute(float(song_res[idx]) - float(r_song[idx]))
                        
                distances.append(dist)
            rem_data['distance'] = distances
            rem_data = rem_data.sort_values('distance')
            temp = rem_data.to_dict(orient='records')
            self.results[song_name] = temp

    def predict(self,song_name,top=5):
        return self.results[song_name][0:5]
        

        
        

In [6]:
s = SpotifyRecommendation()

In [7]:
s.columns_renaming()

In [8]:
s.cleaning_str_col()

In [9]:
s.removed_unwanted_col()

In [10]:
s.normalizing_col()

In [11]:
s.cluster_creation()

In [12]:
s.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rem_data['distance'] = distances
