In [55]:
#Importing libraries:

import pandas as pd
import random
from fuzzywuzzy import fuzz
import string
import re
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
import config
import spotipy
import pandas as pd
import json
from spotipy.oauth2 import SpotifyClientCredentials
from IPython.display import IFrame

# Create the Spotipy client using the client-credentials auth manager.
# The client ID and secret are read from the local `config` module.
_client_credentials = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(auth_manager=_client_credentials)

In [56]:
#Loading the datasets
# Chart tables: world and latin top songs.
url1 = "world_top_songs.csv"
top_world = pd.read_csv(url1)
url2 = "latin_top_songs.csv"
top_latin = pd.read_csv(url2)

# Table the clustering model is fitted on, and the companion song table that
# receives the resulting 'cluster' labels.
path1= 'afmusic_df.csv'
afmusic_df = pd.read_csv(path1)
path2= 'music_compl_df.csv'
music_com_df = pd.read_csv(path2)

In [57]:
#Cleaning the user's input
def clean_input(user_input):
    """Normalise free-text song input.

    Surrounding whitespace is trimmed, the text is lower-cased, and every
    character that is neither a word character nor whitespace is removed.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        The normalised string.
    """
    trimmed_lower = user_input.strip().lower()
    # Anything outside \w (word characters) and \s (whitespace) is a symbol.
    return re.sub(r'[^\w\s]', '', trimmed_lower)

# Function to find the best match
def find_best_match(user_input, column_values, scorer=None):
    """Return the candidate most similar to ``user_input`` and its score.

    The comparison is case-insensitive: both the query and each candidate are
    lower-cased before scoring, because the default scorer
    (``fuzz.partial_ratio``) does not normalise case itself. The candidate is
    returned in its original spelling so it can still be looked up in the
    source column.

    Args:
        user_input: Text to look for.
        column_values: Iterable of candidate values. Entries that are not
            strings (for example NaN from an empty cell) are skipped.
        scorer: Optional callable ``scorer(query, candidate) -> number``.
            Defaults to ``fuzz.partial_ratio``.

    Returns:
        A ``(best_match, highest_similarity)`` tuple. ``best_match`` is
        ``None`` and the score is ``0`` when no candidate scores above zero.
        On ties the earliest candidate wins.
    """
    if scorer is None:
        scorer = fuzz.partial_ratio

    query = user_input.lower()
    best_match = None
    highest_similarity = 0

    for value in column_values:
        if not isinstance(value, str):
            # Missing cells cannot be scored as text.
            continue
        similarity = scorer(query, value.lower())
        # Strict '>' keeps the first candidate when several share the top score.
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = value

    return best_match, highest_similarity

def play_song(track_id):
    """Build an embedded Spotify player for a single track.

    Args:
        track_id: Spotify track ID (string) appended to the embed URL.

    Returns:
        An ``IFrame`` pointing at the track's Spotify embed page, sized
        320x80.
    """
    embed_url = "https://open.spotify.com/embed/track/" + track_id
    # NOTE(review): IPython's IFrame appears to append extra keyword arguments
    # to the URL as query parameters rather than emitting them as HTML
    # attributes -- confirm against the installed IPython version.
    player_options = {
        "frameborder": "0",
        "allowtransparency": "true",
        "allow": "encrypted-media",
    }
    return IFrame(src=embed_url, width="320", height="80", **player_options)

def ask_and_clean_user_input():
    """Prompt for a song title and return it normalised by ``clean_input``.

    The prompt result is used directly: wrapping ``input`` in ``print`` would
    hand ``None`` to ``clean_input``. The typed text is therefore no longer
    echoed back, matching the prompt in ``recommend_song``.

    Returns:
        The cleaned (trimmed, lower-cased, symbol-free) text the user typed.
    """
    user_input = input('Write one of your favorite songs: ')
    return clean_input(user_input)


def display_song(cleaned_input):
    """Search Spotify for ``cleaned_input`` and show a player for the top hit.

    Args:
        cleaned_input: Search text (a song title).

    Returns:
        The result of ``display(...)`` for the embedded player, or ``None``
        when the search returns no tracks (a message is printed instead).
    """
    results = sp.search(q=cleaned_input, limit=5, market="GB")
    items = results["tracks"]["items"]
    if not items:
        # Without this guard an unknown title would fail with a bare IndexError.
        print(f"Sorry, we could not find '{cleaned_input}' on Spotify.")
        return None

    track_id = items[0]["id"]
    print('Here you can listen to your song: ')
    return display(play_song(track_id))

def recc_hot_song(intended_user_input, songs=None):
    """Congratulate the user and recommend a different top song at random.

    The song the user entered is excluded from the draw (compared
    case-insensitively, ignoring surrounding whitespace) so the recommendation
    is never the same title. If that leaves nothing to choose from, the full
    list is used.

    Args:
        intended_user_input: Title of the song the user asked about.
        songs: Optional iterable of candidate titles. Defaults to the
            ``'song'`` column of ``top_world``.

    Returns:
        The recommended song title (also printed).
    """
    print(f"Congrats {intended_user_input} is one of the top world songs! We recommend you to listen to: ")
    if songs is None:
        hot_song_column_values = top_world['song'].tolist()
    else:
        hot_song_column_values = list(songs)

    own_title = str(intended_user_input).strip().lower()
    other_songs = [
        title for title in hot_song_column_values
        if str(title).strip().lower() != own_title
    ]
    # Fall back to the full list when the user's song was the only entry.
    random_hot_song = random.choice(other_songs or hot_song_column_values)
    print(random_hot_song)
    return random_hot_song



In [58]:
#ML algorithm: K-Means clustering

# Number of K-Means clusters shared by the labelling and recommendation steps.
# NOTE(review): the two steps previously fitted separate models with 16 and 15
# clusters, so their labels did not refer to the same clustering. 16 (the value
# used for labelling) is kept here -- confirm which value was actually tuned.
N_CLUSTERS = 16
KMEANS_RANDOM_STATE = 1234


def _fit_cluster_model(n_clusters=N_CLUSTERS):
    """Fit the scaler and the K-Means model on ``afmusic_df``.

    Returns:
        A ``(scaler, kmeans, scaled_features)`` tuple: the fitted
        ``StandardScaler``, the fitted ``KMeans`` model, and the scaled
        training features as a DataFrame with the original column names.
    """
    features = pd.DataFrame(afmusic_df)
    scaler = StandardScaler()
    scaler.fit(features)
    scaled_features = pd.DataFrame(scaler.transform(features), columns=features.columns)
    kmeans = KMeans(n_clusters=n_clusters, random_state=KMEANS_RANDOM_STATE)
    kmeans.fit(scaled_features)
    return scaler, kmeans, scaled_features


def music_df_clusters_ML():
    """Label every row of ``music_com_df`` with its K-Means cluster.

    The model is fitted on ``afmusic_df``; the predicted labels are written to
    a ``'cluster'`` column of the module-level ``music_com_df`` (so the two
    tables must have the same number of rows).

    Returns:
        ``music_com_df`` with the ``'cluster'`` column set.
    """
    _, kmeans, scaled_features = _fit_cluster_model()
    music_com_df['cluster'] = kmeans.predict(scaled_features)
    return music_com_df


def get_user_track_id(cleaned_input):
    """Return the Spotify ID of the top search hit for ``cleaned_input``.

    Raises:
        IndexError: If the search returns no tracks.
    """
    results = sp.search(q=cleaned_input, limit=5, market="GB")
    items = results["tracks"]["items"]
    if not items:
        # Same exception type as indexing an empty list, but with a message.
        raise IndexError(f"No Spotify track found for '{cleaned_input}'")
    return items[0]["id"]


def ML_recc_song(user_track_id, music_com_df):
    """Recommend a random song from the same cluster as the user's track.

    The user's audio features are scaled and assigned to a cluster with a
    model fitted by the same routine (same data, cluster count and random
    state) that ``music_df_clusters_ML`` uses to label the catalogue, so the
    predicted label is comparable with the ``'cluster'`` column.

    Args:
        user_track_id: Spotify ID of the user's song.
        music_com_df: Song table with ``'cluster'`` and ``'id'`` columns, as
            returned by ``music_df_clusters_ML``.

    Returns:
        The result of ``display(...)`` for the recommended track's player, or
        ``None`` when no recommendation can be made (a message is printed).
    """
    scaler, kmeans, _ = _fit_cluster_model()

    af_users_song = sp.audio_features(user_track_id)
    if not af_users_song or af_users_song[0] is None:
        print("Sorry, Spotify has no audio features for that song, so I cannot recommend a similar one.")
        return None

    # Drop the descriptive fields so only the numeric audio features remain.
    af_users_song_df = pd.DataFrame(af_users_song).drop(
        columns=['type', 'id', 'uri', 'track_href', 'analysis_url']
    )

    user_song_scaled = scaler.transform(af_users_song_df)
    user_song_scaled_df = pd.DataFrame(user_song_scaled, columns=af_users_song_df.columns)
    cluster_song2recommend = kmeans.predict(user_song_scaled_df)[0]

    # Candidates are the catalogue songs that share the user's cluster.
    song_cluster_df = music_com_df[music_com_df['cluster'] == cluster_song2recommend]
    cluster_column_values = song_cluster_df['id'].tolist()
    if not cluster_column_values:
        print("Sorry, I could not find a similar song to recommend.")
        return None

    # Prefer a track other than the one the user entered, when one exists.
    other_tracks = [track for track in cluster_column_values if track != user_track_id]
    id_random_cluster_song = random.choice(other_tracks or cluster_column_values)
    print(f"Hey the song is not in the top 100, but I can recommend you a song that I'm pretty sure you will like:")
    return display(play_song(id_random_cluster_song))
    



In [59]:
# Minimum fuzzy-match score (0-100) for offering a top-song match to the user.
MIN_MATCH_SCORE = 70


def _is_affirmative(answer):
    """Return True when ``answer`` is one of the accepted spellings of "yes"."""
    return answer.strip().lower() in ('yes', 'yeah', 'yep', 'ye', 'y')


def _recommend_similar_song(cleaned_input):
    """Recommend a song from the same K-Means cluster as ``cleaned_input``."""
    try:
        user_track_id = get_user_track_id(cleaned_input)
    except IndexError:
        # An empty Spotify search surfaces as IndexError from the lookup.
        print(f"Sorry, we could not find '{cleaned_input}' on Spotify, so we cannot recommend a similar song.")
        return
    # Label the catalogue once and pass the labelled table straight through.
    ML_recc_song(user_track_id, music_df_clusters_ML())


def recommend_song():
    """Interactive recommender: ask for a song and suggest another one.

    Flow:
      1. Ask for a song, clean it, and show a player for it.
      2. If it fuzzy-matches a top world song (score >= ``MIN_MATCH_SCORE``),
         ask the user to confirm; on "yes" recommend and play a random top
         song.
      3. If the user rejects the match, offer one retry. The retry is checked
         for an exact (lower-cased) title match against the top songs.
      4. Otherwise recommend a song from the same K-Means cluster.

    TODO: loop until the user confirms the song they meant.
    """
    user_input = input('Write one of your favorite songs: ')
    clean_user_input = clean_input(user_input)
    best_match, similarity = find_best_match(clean_user_input, top_world['song'].values)
    display_song(clean_user_input)

    # Below the threshold there is no plausible top-song match.
    if similarity < MIN_MATCH_SCORE:
        _recommend_similar_song(clean_user_input)
        return

    response = input(f"We found a possible match for top hot songs: '{best_match}'. Is this what you mean? (yes/no): ")
    if _is_affirmative(response):
        # Play the recommended title (the value returned by recc_hot_song).
        display_song(recc_hot_song(best_match))
        return

    attempt = input('Sorry, we could not understand your song, do you want to try again? (yes/no)')
    if not _is_affirmative(attempt):
        print('Ok, see you next time')
        return

    user_input = input('Write one of your favorite songs: ')
    clean_user_input = clean_input(user_input)
    display_song(clean_user_input)
    # Check again whether this song is one of the top songs.
    if clean_user_input in top_world["song"].str.lower().values:
        display_song(recc_hot_song(clean_user_input))
    else:
        _recommend_similar_song(clean_user_input)
            

In [60]:
# Start the interactive recommender; it prompts for a song title via input().
recommend_song()

Write one of your favorite songs:  flo


Here you can listen to your song: 


We found a possible match for top hot songs: 'Murder On The Dancefloor'. Is this what you mean? (yes/no):  no
Sorry, we could not understand your song, do you want to try again? (yes/no) yes
Write one of your favorite songs:  la bamba


Here you can listen to your song: 


Hey the song is not in the top 100, but I can recommend you a song that I'm pretty sure you will like:
