# Main notebook for the algorithm development and enhancement

In [1]:
# Required packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

# allowing all the columns to be visualized
pd.options.display.max_columns = None

# rendering all graphics straight out of the notebook
%matplotlib inline

In [2]:
f1 = pd.read_json("data/SpotifyData.json")
f1.head(2)

Unnamed: 0,artist_name,track_name,track_id,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,24kGoldn,Mood (feat. iann dior),3tjFYV6RSFtuktYl3ZtYcq,99,0.7,0.722,7,-3.558,0,0.0369,0.221,0.0,0.272,0.756,90.989,140526,4
1,Ariana Grande,34+35,6Im9k8u9iIzKMrmV7BWtlF,97,0.83,0.585,0,-6.476,1,0.094,0.237,0.0,0.248,0.485,109.978,173711,4


In [3]:
df_BL = pd.read_csv("data/brendan-spotify.csv")
df_NL = pd.read_csv("data/nick-spotify.csv")
df_BT = pd.read_csv("data/btam-spotify.csv")
df_DD = pd.read_csv("data/danica-spotify.csv")
df_TLC = pd.read_csv("data/toma-spotify.csv")

frames = [df_BL, df_NL, df_BT, df_DD, df_TLC]

personal_df = pd.concat(frames).reset_index(drop=True) 
personal_df.head(2)

Unnamed: 0,Song,Artist,Genre,Year,added,bpm,energy,dance,dB,liveliness,valence,duration,acousticness,speechiness,popularity
0,Sooner Or Later,Aaron Carter,dance pop,2018,2021‑01‑06,112,73,74,-7,7,44,213,12,4,49
1,This is the Place,Tom Grennan,indie anthem-folk,2020,2021‑01‑05,142,62,65,-5,9,46,186,32,3,68


In [4]:
# Cleaning and prepping data

f1 = f1.drop(columns=['track_id', 'key', 'mode', 'time_signature', 'duration_ms','instrumentalness'])
print(f1.head(2))

# Rename certain columns
personal_df = personal_df.rename(columns={"Song": "track_name", 
                            "Artist": "artist_name", 
                            "dance": "danceability", 
                            "dB": "loudness",
                            "bpm": "tempo",
                            "liveliness":"liveness",
                            })

# Rearrange to match order of master set
personal_df = personal_df[['artist_name', 
                           'track_name',
                           'popularity',
                           'danceability',
                           'energy',
                           'speechiness',
                           'acousticness',
                           'liveness',
                           'valence',
                           'tempo',]]
# print("\n\n",personal_df.head(2))


     artist_name              track_name  popularity  danceability  energy  \
0       24kGoldn  Mood (feat. iann dior)          99          0.70   0.722   
1  Ariana Grande                   34+35          97          0.83   0.585   

   loudness  speechiness  acousticness  liveness  valence    tempo  
0    -3.558       0.0369         0.221     0.272    0.756   90.989  
1    -6.476       0.0940         0.237     0.248    0.485  109.978  


In [6]:
# Concatenate the datasets
df = pd.concat([f1, personal_df])
df = df.drop_duplicates(subset=['track_name','artist_name'], keep='first', inplace=False, ignore_index=False )

# shape of the dataset
print("Data Frame Dimensions:")
print("Lines:\t\t{}".format(df.shape[0]))
print("Columns:\t{}".format(df.shape[1]))

# Verify no missing data
print(df.info())

Data Frame Dimensions:
Lines:		22662
Columns:	11
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22662 entries, 0 to 3693
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   artist_name   22662 non-null  object 
 1   track_name    22662 non-null  object 
 2   popularity    22662 non-null  int64  
 3   danceability  22662 non-null  float64
 4   energy        22662 non-null  float64
 5   loudness      20745 non-null  float64
 6   speechiness   22662 non-null  float64
 7   acousticness  22662 non-null  float64
 8   liveness      22662 non-null  float64
 9   valence       22662 non-null  float64
 10  tempo         22662 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 2.1+ MB
None


In [7]:
# statistical distribution of categorical values
df.describe(include="O")

Unnamed: 0,artist_name,track_name
count,22662,22662
unique,6865,20840
top,Drake,Home
freq,153,13


In [8]:
# checking most recurrent artists
n = 10
print("Most Popular Artists")
df['artist_name'].value_counts()[:n]

Most Popular Artists


Drake                153
Roy Orbison          150
TandMProductionCo    145
Taylor Swift         128
BTS                  110
Robin Trower          96
The Weeknd            95
David Bowie           94
One Direction         92
Mac Miller            87
Name: artist_name, dtype: int64

In [None]:
def createGroupTopPlaylist(frames, playlistLen, weightParams):
    """
    frames: list of the df containing individual song libraries
    playlistLen: Length of the desired playlist output
    moodProfile: List [energy, dance, liveliness, valence, popularity] providing the mood desired
    """
    if(type(playlistLen) != (int)):
        return "playlist length not type int"
    
    
    
    # Concatenate all the frames (ignored for our test case)    
#     df = df.drop_duplicates(subset=['track_name','artist_name'], keep='first', inplace=False, ignore_index=False )
    
    
    
    # Calculates song difference attribute for each song
    df = df.assign(delta=lambda x: 
                       abs(x['energy'] - moodProfile[0]) * weightParams[0]
                       + abs(x['dance'] - moodProfile[1]) * weightParams[1]
                       + abs(x['valence'] - moodProfile[2]) * weightParams[2]
                       + abs(x['popularity']- moodProfile[3]) * weightParams[3]
                  )

In [12]:
btam= pd.read_json('data/beatrice_top_tracks-Copy1.json')
btam

Unnamed: 0,name,popularity,type,id
0,Berenstein,48,track,0VXI7SOcGQQ1SLjyJaaIEa
1,Monster (Shawn Mendes & Justin Bieber),88,track,2Z8yfpFX0ZMavHkcIeHiO1
2,Passenger,50,track,6wkVmLTBedKaAoeSj4xP6J
3,Coaster,74,track,39KG4kom3enSx4GTThuDGt
4,Easily,1,track,3TpXajg1nKzG3ngc9tBwrD
5,Tonight,43,track,2NfLgvpXpGKAcOAQwMPzez
6,Viola,39,track,7hHLxpZfhFdiM4AkFk74Id
7,This Side of Paradise,56,track,3sgf2a905hjZ81tXYytY6i
8,I Found You,45,track,50J6lp4IShQGX8PC3cP8GY
9,SLOW DANCING IN THE DARK,83,track,0rKtyWc8bvkriBthvHKY8d
