In [1]:
import pandas as pd
# Import Pandas to use DataFrames
import swifter
# To efficiently apply any function to a Pandas Data Frame or Series object in the quickest available method
import numpy as np
# Import Numpy which will hel in mathematical calculations 
import matplotlib.pyplot as plt
import seaborn as sns
# To Visualize Data
import os
# To Access system files/options
import glob
# To return all file paths that match a specific pattern
from tqdm import tqdm
# To Display progress when using loops



import statsmodels
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy import stats
from sklearn import preprocessing
from sklearn.cluster import KMeans


pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 2)
# Tune some options in pandas 


%matplotlib inline
# To make any plot appear in jupyter notebook

DATASETS_PATH = "C:/Users/Amir/technolab"
df_main = pd.read_csv(os.path.join(DATASETS_PATH, 'feature_engineering_dataset.csv'))
df_add = df_main.copy()

happy_yes = df_add['valence'].copy()
happy_yes[df_add['valence']>=0.5] = 1
happy_yes[df_add['valence']<0.5] = 0
happy_yes.name = "happy_yes"
happy_yes

happy_no = df_add['valence'].copy()
happy_no[df_add['valence'] >= 0.5] = 0
happy_no[df_add['valence'] < 0.5] = 1
happy_no.name = "happy_no"
happy_no

speech_yes = df_add['speechiness'].copy()
speech_yes.name = "speech_yes"
speech_yes[(df_add['speechiness']>=0.66)] = 1
speech_yes[(df_add['speechiness']<0.66)] = 0
speech_yes

speech_no = df_add['speechiness'].copy()
speech_no.name = "speech_no"
speech_no[(df_add['speechiness']>=0.66)] = 0
speech_no[(df_add['speechiness']<0.66)] = 1
speech_no

collaborative_yes = df_add['collaborative'].copy()
collaborative_yes.name = "collaborative_yes"
collaborative_yes[(df_add['collaborative'] == 1)] = 1
collaborative_yes[(df_add['collaborative'] == 0)] = 0
collaborative_yes

collaborative_no = df_add['collaborative'].copy()
collaborative_no.name = "collaborative_no"
collaborative_no[(df_add['collaborative'] == 1)] = 0
collaborative_no[(df_add['collaborative'] == 0)] = 1
collaborative_no

mode_yes = df_add['mode'].copy()
mode_yes.name = "mode_yes"
mode_yes[(df_add['mode']>=0.66)] = 1
mode_yes[(df_add['mode']<0.66)] = 0
mode_yes

mode_no = df_add['mode'].copy()
mode_no.name = "mode_no"
mode_no[(df_add['mode'] == 1)] = 0
mode_no[(df_add['mode'] == 0)] = 1
mode_no

# We have to drop original boolean columns
df_add = df_add.drop(columns = ["collaborative", "mode"], axis = 1)
# Add all the newly created columns by ONE HOT ENCODING technique to our dataframe
df_add = pd.concat([df_add, mode_no, mode_yes, collaborative_no, collaborative_yes, speech_no, speech_yes, happy_no, happy_yes], axis=1)
df_add

df_add.info()

playlist = df_add[["pid", "name", "description", "playlist_duration_ms", "num_edits", "num_followers", "num_tracks", "num_albums", "num_artists", "modified_at", "collaborative_yes", "collaborative_no"]]
playlist = playlist.drop(playlist[playlist.duplicated()].index)

# Reset index for DataFrame
playlist = playlist.reset_index(drop=True)
playlist

track = df_add[["track_name","artist_name", "album_name", "duration_ms", "danceability", "energy", "key", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "mode_no", "mode_yes", "speech_no", "speech_yes", "happy_no", "happy_yes"]]

# We will get popularity feature by the weights of some features which we engineered from our pre-feature engineering notebook
# In our pre-feature engineering notebook, we did EDA and noticed that some features have more variability than the others
# That is the reason of the multiplied weights to get our popularity feature  
track["popularity"] = (0.2*df_add["playlist_followers_track"])+(2*df_add["playlist_followers_artist"])+(1.5*df_add["playlist_followers_album"])

# Drop duplicates
track = track.drop(track[track.duplicated()].index)

# Display first 5 rows
track.head()

track.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266363 entries, 0 to 266362
Data columns (total 44 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   pid                        266363 non-null  int64  
 1   name                       266363 non-null  object 
 2   description                5333 non-null    object 
 3   modified_at                266363 non-null  int64  
 4   num_artists                266363 non-null  int64  
 5   num_albums                 266363 non-null  int64  
 6   num_tracks                 266363 non-null  int64  
 7   num_followers              266363 non-null  int64  
 8   num_edits                  266363 non-null  int64  
 9   playlist_duration_ms       266363 non-null  int64  
 10  pos                        266363 non-null  int64  
 11  artist_name                266363 non-null  object 
 12  track_uri                  266363 non-null  object 
 13  artist_uri                 26

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  track["popularity"] = (0.2*df_add["playlist_followers_track"])+(2*df_add["playlist_followers_artist"])+(1.5*df_add["playlist_followers_album"])


93123

In [None]:
features = [track.columns[0], track.columns[1], track.columns[2]]
temp = track.copy()
for feature in tqdm(features):
    for i in range(track.shape[0]):
        temp[feature].iloc[i] = str.lower((track[feature].iloc[i]).replace(" ",""))

track["metadata"] = temp[temp.columns[0]] + " " + temp[temp.columns[1]] + " " + temp[temp.columns[2]]
track

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[feature].iloc[i] = str.lower((track[feature].iloc[i]).replace(" ",""))


In [None]:
# Create CountVectorizer object to transform text into vector
track_vectorizer = CountVectorizer()

# Fit the vectorizer on "metadata" field of song_library DataFrame
track_vectorizer.fit(track['metadata'])

In [None]:
tracks = track.sort_values("popularity", ascending = False)[:5000]
tracks

In [None]:
tracks.describe().T

In [None]:
# This csv file contains the all the tracks which we will use in our content based recommendation system
track.to_csv(os.path.join(DATASETS_PATH, 'content_recommend_dataset.csv'), index=False)

In [None]:
def song_recommender(song_name, n = 5):
    try:
        # Numeric columns (audio features) in track DataFrame
        num_cols = ['duration_ms', 'danceability', 'energy', 'key','loudness','speechiness','acousticness','instrumentalness','liveness', 'valence', 'tempo', 'mode_no', 'mode_yes', 'speech_no', 'speech_yes', 'happy_no', 'happy_yes', 'popularity']

        # Create vector from "metadata" field (text data) for given song
        text_vec1 = track_vectorizer.transform(tracks[tracks['track_name']==str(song_name)]['metadata']).toarray()

        # Create vector from numerical columns for given song
        num_vec1 = tracks[tracks['track_name']==str(song_name)][num_cols].to_numpy()

        # Initialise empty list to store similarity scores
        sim_scores=  []

        # For every song/track in song library, determine cosine similarity with given song
        for index, row in tqdm(tracks.iterrows()):
            name = row['track_name']

            # Create vector from "metadata" field for other songs
            text_vec2 = track_vectorizer.transform(tracks[tracks['track_name']==name]['metadata']).toarray()

            # Create vector from numerical columns for other songs
            num_vec2 = tracks[tracks['track_name']==name][num_cols].to_numpy()

            # Calculate cosine similarity using text vectors
            text_sim = cosine_similarity(text_vec1, text_vec2)[0][0]

            # Calculate cosine similarity using numerical vectors
            num_sim = cosine_similarity(num_vec1, num_vec2)[0][0]

            # Take average of both similarity scores and add to list of similarity scores
            sim = (text_sim + num_sim)/2
            sim_scores.append(sim)
        
        # Add new column containing similarity scores to song_library DataFrame
        tracks['similarity'] = sim_scores

        # Sort DataFrame based on "similarity" column
        tracks.sort_values(by=['similarity', 'popularity'], ascending=[False, False], inplace=True)

        # Create DataFrame "recommended_songs" containing 5 songs that are most similar to the given song and return this DataFrame
        recommended_songs = tracks[['track_name', 'artist_name', 'album_name']][2:(2+n)]
        return recommended_songs
    except:
        # If given song is not found in song library then display message
        print('{} not found in songs library.'.format(song_name))