In [1]:
import pandas as pd
import numpy as np

# ranking and similarity algorithms 


In [2]:
df = pd.read_csv("dataset.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
null_columns = df.columns[df.isnull().any()]
print(null_columns)

Index(['artists', 'album_name', 'track_name'], dtype='object')


In [4]:
has_duplicates = df.duplicated().any()
print(f"Does the DataFrame have duplicates? {has_duplicates}")

Does the DataFrame have duplicates? False


In [5]:
#  checking for duplicates and na values
df.drop_duplicates()
df.dropna()

# now 0 duplicates and null vals
print(df.duplicated().sum())
print(df.isnull().sum())


print(df.shape)

0
Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64
(114000, 21)


In [6]:
# select features for useful data

# print(df.columns)

df.describe()

selected_feautures = ['danceability', 'energy', 'valence', 'tempo', 'instrumentalness', 'acousticness', 'speechiness', 'popularity']




danceability → Higher values mean the song is more danceable.
energy → Measures intensity. High energy = exciting, low energy = calm.
valence → Measures happiness. High valence = positive/happy mood, low valence = sad/serious.
tempo → Faster tempos often mean energetic/hype, slower tempos = chill/sad.
instrumentalness → Higher values mean fewer lyrics; often used for ambient or study music.
acousticness → Higher values mean more acoustic elements (often calmer, folk-like).
speechiness → High values indicate more spoken words (e.g., rap, podcasts).
popularity → Not directly a mood feature but can help weight recommendations.

In [7]:

# Mood	Danceability	Energy	Valence	Tempo	Instrumentalness	Acousticness	Speechiness
# Happy	High	Medium/High	High	Medium/High	Low	Low	Low
# Chill	Medium	Low	Medium	Low	Medium/High	High	Low
# Sad	Low	Low	Low	Low	Medium	High	Low
# Hype	High	High	Medium/High	High	Low	Low	Medium
# Focus	Low/Medium	Low	Medium	Low/Medium	High	Medium	Low

In [8]:
# need to make a column that dictates mood

# point system 1 - 3 3 high 2 medium 1 low 
def scoring(x):
    score = 0
    # Danceability
    if x['danceability'] > 0.7:
        score += 3
    elif 0.4 <= x['danceability'] <= 0.7:
        score += 2
    else:
        score += 1
    
    # Energy
    if x['energy'] > 0.8:
        score += 3
    elif 0.5 <= x['energy'] <= 0.8:
        score += 2
    else:
        score += 1
    
    # Valence
    if x['valence'] > 0.7:
        score += 3
    elif 0.4 <= x['valence'] <= 0.7:
        score += 2
    else:
        score += 1

    # Tempo
    if x['tempo'] > 120:
        score += 3
    elif 80 <= x['tempo'] <= 120:
        score += 2
    else:
        score += 1
    
    # Instrumentalness
    if x['instrumentalness'] < 0.2:
        score += 3
    elif 0.2 <= x['instrumentalness'] <= 0.5:
        score += 2
    else:
        score += 1
    
    # Acousticness
    if x['acousticness'] < 0.3:
        score += 3
    elif 0.3 <= x['acousticness'] <= 0.6:
        score += 2
    else:
        score += 1
    
    # Speechiness
    if x['speechiness'] < 0.3:
        score += 3
    elif 0.3 <= x['speechiness'] <= 0.6:
        score += 2
    else:
        score += 1
    
    # Popularity
    if x['popularity'] > 70:
        score += 3
    elif 40 <= x['popularity'] <= 70:
        score += 2
    else:
        score += 1
    
    return score

    

    



In [9]:
# score logic now 

score = df[selected_feautures].apply(scoring, axis=1)
print(score)




0         20
1         14
2         16
3         16
4         17
          ..
113995    12
113996    11
113997    17
113998    19
113999    15
Length: 114000, dtype: int64


In [10]:
# will now give you the mood based on emotions
def get_mood(x):
    mood = ""
    if 18 <= x <= 21:
        mood = "Happy"
    elif 15 <= x <= 18:
        mood = "Chill"
    elif 8 <= x <= 12:
        mood = "Sad"
    elif 22 <= x <= 24:
        mood = "Hype"
    elif 13 <= x <= 14:
        mood = "Focus"
    else:
        mood = "Unknown"
    return mood


In [11]:
# create both features one on score and one on mood

df['Score'] = df[selected_feautures].apply(scoring, axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,Score
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,20
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,14
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,16
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,16
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,17


In [14]:
df['mood'] = df['Score'].apply(get_mood)

df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,Score,mood
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,20,Happy
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,14,Focus
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,16,Chill
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,16,Chill
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,17,Chill


In [23]:
#create a filter mechanism

moods = ["Happy", "Sad", "Chill", "Hype", "Focus"]
user_mood = input("Please select what mood you are in, must type as shown: Happy, Sad, Chill, Hype, Focus")

while user_mood not in moods:
    print("not in the list of moods provided")
    user_mood = input("Please select what mood you are in, must type as shown: Happy, Sad, Chill, Hype, Focus")



In [24]:
filtered_df = df[df["mood"] == user_mood]

filtered_df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,Score,mood
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,20,Happy
5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,0.481,...,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic,18,Happy
7,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,80,242946,False,0.703,0.444,...,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic,21,Happy
8,8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,74,189613,False,0.625,0.414,...,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic,20,Happy
10,10,4mzP5mHkRvGxdhdGdAH7EJ,Zack Tabudlo,Episode,Give Me Your Forever,74,244800,False,0.627,0.363,...,0.0291,0.279,0.0,0.0928,0.301,99.905,4,acoustic,18,Happy


In [None]:
# sorts the data with the songs with the highest emotion in that category

df_sorted = filtered_df.sort_values(by='Score', ascending=False)

df_sorted.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,Score,mood
57058,57058,0BxE4FqsDD1Ot4YuBXwAPp,Arctic Monkeys,Favourite Worst Nightmare,505,88,253586,False,0.526,0.866,...,0.0568,0.00287,7.8e-05,0.0945,0.248,140.266,4,indie,21,Happy
56622,56622,1i8s3cuXeFCUj5Jz3x2rgw,Sanah Moidutty;Yazin Nizar,High Way,Kommallo,41,258471,False,0.813,0.524,...,0.0472,0.301,0.000335,0.26,0.787,124.034,4,indie-pop,21,Happy
56548,56548,2rUzh19do9xKVsfWCa92x7,Fitz and The Tantrums;Dave Audé,10's Dance Classics,HandClap - Dave Audé Remix,0,295932,False,0.725,0.918,...,0.0352,0.000148,0.00127,0.0642,0.503,128.016,4,indie-pop,21,Happy
56556,56556,4RGWHfQeJftd5XrP8JUgFj,Vansire,Metamodernity,Metamodernity,70,162663,False,0.823,0.544,...,0.0358,0.389,0.165,0.108,0.826,121.968,4,indie-pop,21,Happy
83289,83289,2DnINK9YPAgEpT9hP5cWqc,Armin van Buuren,ASOT 1090 - A State Of Trance Episode 1090,"A State Of Trance (ASOT 1090) - Track Recap, P...",38,31900,False,0.503,0.85,...,0.28,0.0187,0.0,0.369,0.894,137.354,4,progressive-house,21,Happy


In [31]:
# now will filter by genre of the music the user likes
genres = [
    'acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 
    'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british', 
    'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 
    'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 
    'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 
    'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 
    'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 
    'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 
    'hip-hop', 'honky-tonk', 'house', 'idm', 'indian', 'indie-pop', 'indie', 
    'industrial', 'iranian', 'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 
    'k-pop', 'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal', 
    'metalcore', 'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode', 
    'party', 'piano', 'pop-film', 'pop', 'power-pop', 'progressive-house', 
    'psych-rock', 'punk-rock', 'punk', 'r-n-b', 'reggae', 'reggaeton', 
    'rock-n-roll', 'rock', 'rockabilly', 'romance', 'sad', 'salsa', 'samba', 
    'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep', 
    'songwriter', 'soul', 'spanish', 'study', 'swedish', 'synth-pop', 
    'tango', 'techno', 'trance', 'trip-hop', 'turkish', 'world-music'
]

# Function to get user input for genres and handle edge cases

def get_user_genres():
    print("Available genres:")
    print(", ".join(genres))
    print("You can select multiple genres separated by commas.")
    print("Type 'exit' to quit genre selection.")
    while True:
        user_input = input("Please select the genres of music you like: ")
        if user_input.lower() == 'exit':
            print("Exiting genre selection.")
            return []
        # Split input into a list, strip whitespace, and convert to lowercase
        user_genres = [genre.strip().lower() for genre in user_input.split(",")]
        # Check if all selected genres are valid
        invalid_genres = [genre for genre in user_genres if genre not in genres]
        if invalid_genres:
            print(f"Invalid genres: {', '.join(invalid_genres)}")
            print("Please select only from the available genres.")
        else:
            return user_genres
        

get_user_genres()



Available genres:
acoustic, afrobeat, alt-rock, alternative, ambient, anime, black-metal, bluegrass, blues, brazil, breakbeat, british, cantopop, chicago-house, children, chill, classical, club, comedy, country, dance, dancehall, death-metal, deep-house, detroit-techno, disco, disney, drum-and-bass, dub, dubstep, edm, electro, electronic, emo, folk, forro, french, funk, garage, german, gospel, goth, grindcore, groove, grunge, guitar, happy, hard-rock, hardcore, hardstyle, heavy-metal, hip-hop, honky-tonk, house, idm, indian, indie-pop, indie, industrial, iranian, j-dance, j-idol, j-pop, j-rock, jazz, k-pop, kids, latin, latino, malay, mandopop, metal, metalcore, minimal-techno, mpb, new-age, opera, pagode, party, piano, pop-film, pop, power-pop, progressive-house, psych-rock, punk-rock, punk, r-n-b, reggae, reggaeton, rock-n-roll, rock, rockabilly, romance, sad, salsa, samba, sertanejo, show-tunes, singer-songwriter, ska, sleep, songwriter, soul, spanish, study, swedish, synth-pop, tan

['alt-rock']

In [None]:
# implement a filter based on the music the people enjoy. From there we can soon start mixing music