In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.stats import pearsonr
from scipy.spatial.distance import jaccard
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Load the preprocessed song dataset from a CSV path relative to the working
# directory. Later cells read the "name", "lyrics", "artist" and "decade"
# columns plus the numeric columns listed in num_cols.
df = pd.read_csv('data/dataset_processed.csv')

In [19]:
# Drop songs without lyrics (the TF-IDF / Jaccard features need text) and
# renumber the rows 0..n-1. The feature matrices built below are aligned with
# df by row *position*, while recommend() looks songs up through the index, so
# the index labels must match the positions after rows have been removed.
df = df.dropna(subset=["lyrics"]).reset_index(drop=True)

In [20]:
# Numeric columns that make up the "numeric" feature space.
num_cols = [
    "valence",
    "danceability",
    "energy",
    "acousticness",
    "instrumentalness",
    "liveness",
    "tempo",
    "loudness",
    "mode",
    "key",
]

# Min-max scale each column so differently-ranged columns (e.g. tempo vs.
# valence) are comparable, then store the result as a sparse matrix so it can
# be stacked with the sparse text / categorical features.
scaler = MinMaxScaler()
num_features = scaler.fit_transform(df.loc[:, num_cols])
num_sparse = csr_matrix(num_features)

In [21]:
# Vectorize the lyrics with TF-IDF: English stop words are removed and the
# vocabulary is capped at the 8000 most frequent terms.
tfidf = TfidfVectorizer(
    max_features=8000,
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(df["lyrics"])

In [22]:
# One-hot encode the categorical columns (artist and decade) into a sparse
# matrix with one row per song; categories unseen at fit time are ignored on
# later transforms instead of raising.
cat_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
)
cat_features = cat_encoder.fit_transform(df[["artist", "decade"]])

In [35]:
def get_top_k(sim_vector, k=5):
    """Return the positions of the k highest similarities below 0.9999.

    Entries with similarity >= 0.9999 are skipped; presumably this removes the
    query song itself (and exact duplicates of it) from the ranking.

    Parameters
    ----------
    sim_vector : array-like of float
        One similarity score per candidate row.
    k : int, default 5
        Maximum number of positions to return.

    Returns
    -------
    list of int
        Positions into ``sim_vector`` ordered from most to least similar.
        Shorter than ``k`` only when fewer than ``k`` candidates are below the
        threshold.
    """
    if k <= 0:
        return []
    scores = np.asarray(sim_vector)
    ranked = np.argsort(scores)[::-1]
    # Filter BEFORE truncating: slicing the top k+1 first would return fewer
    # than k results whenever several entries sit at/above the threshold.
    # NaN scores compare False here and are therefore dropped as well.
    eligible = ranked[scores[ranked] < 0.9999]
    return eligible[:k].tolist()

In [36]:
def recommend(song_name, metric="cosine", data_type="numeric", k=5):
    """Recommend the k songs most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Song title, matched case-insensitively against ``df["name"]``. When
        several rows match, the first one is used as the query.
    metric : {"cosine", "euclidean", "manhattan", "pearson", "jaccard"}
        Similarity measure. Euclidean and Manhattan distances are mapped to a
        similarity with ``1 / (1 + distance)``. NOTE: "jaccard" always works
        on a binary bag-of-words of the lyrics and ignores ``data_type``.
    data_type : {"numeric", "text", "categorical", "mixed"}
        Feature space: scaled numeric columns, TF-IDF of the lyrics, one-hot
        artist/decade, or all three stacked side by side.
    k : int, default 5
        Number of recommendations requested.

    Returns
    -------
    pandas.DataFrame or None
        Columns "name", "artist", "decade" and "similarity" for the
        recommended songs, or None (after printing a message) when the song
        is not found.

    Raises
    ------
    ValueError
        If ``data_type`` or ``metric`` is not one of the supported values.
    """
    # The feature matrices are aligned with df by row *position*, so resolve
    # the query to a positional offset. Using the index label instead picks
    # the wrong matrix row whenever df's index is not exactly 0..n-1 (e.g.
    # after rows were dropped).
    is_match = (df["name"].str.lower() == song_name.lower()).to_numpy(
        dtype=bool, na_value=False
    )
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        print(f"Canción '{song_name}' no encontrada.")
        return
    song_pos = int(match_positions[0])

    if data_type == "numeric":
        data = num_sparse
    elif data_type == "text":
        data = tfidf_matrix
    elif data_type == "categorical":
        data = cat_features
    elif data_type == "mixed":
        # Request CSR explicitly: depending on the scipy version hstack may
        # return a COO matrix, which does not support row indexing.
        data = hstack([num_sparse, tfidf_matrix, cat_features], format="csr")
    else:
        raise ValueError("Tipo de datos inválido: 'numeric', 'text', 'categorical' o 'mixed'")

    x = data[song_pos]

    if metric == "cosine":
        sim = cosine_similarity(x, data).flatten()
    elif metric == "euclidean":
        sim = 1 / (1 + euclidean_distances(x, data).flatten())
    elif metric == "manhattan":
        sim = 1 / (1 + manhattan_distances(x, data).flatten())
    elif metric == "pearson":
        x_dense = x.toarray().flatten()
        sim = np.array([pearsonr(x_dense, y.toarray().flatten())[0] for y in data])
        # pearsonr yields NaN for constant vectors; treat those as similarity 0.
        sim = np.nan_to_num(sim)
    elif metric == "jaccard":
        # Binary word-presence vectors built from the lyrics on every call.
        cv = CountVectorizer(binary=True, max_features=8000)
        bin_matrix = cv.fit_transform(df["lyrics"])
        x_bin = bin_matrix[song_pos].toarray().ravel() > 0
        sim = []
        for i in range(bin_matrix.shape[0]):
            y_bin = bin_matrix[i].toarray().ravel() > 0
            # scipy's jaccard is a distance; convert it to a similarity.
            sim.append(1 - jaccard(x_bin, y_bin))
        sim = np.array(sim)
    else:
        raise ValueError("Métrica inválida.")

    indices = get_top_k(sim, k)
    recs = df.iloc[indices][["name", "artist", "decade"]].copy()
    recs["similarity"] = sim[indices]

    matched_name = df["name"].iloc[song_pos]
    print(f"\nRecomendaciones para: {matched_name} ({metric}, {data_type})")
    return recs

In [37]:
# Query song title reused by every recommend() call below.
song = "Wonderwall"


In [None]:
# Cosine similarity over the min-max scaled numeric columns.
recommend(song, metric="cosine", data_type="numeric")


Recomendaciones para: Wonderwall (cosine, numeric)


Unnamed: 0,name,artist,decade,similarity
9256,Lonely Children,Foreigner,1970,0.99824
18125,Placeholder,The Story So Far,2010,0.99799
17943,Lie To Me,12 Stones,2000,0.997884
1584,My Friends Over You,New Found Glory,2000,0.997738
14488,Falling in Love Again,Joyce Manor,2010,0.997545


In [41]:
# Euclidean distance over the numeric columns, reported as 1 / (1 + distance).
recommend(song, metric="euclidean", data_type="numeric")



Recomendaciones para: Wonderwall (euclidean, numeric)


Unnamed: 0,name,artist,decade,similarity
9256,Lonely Children,Foreigner,1970,0.897445
17943,Lie To Me,12 Stones,2000,0.890763
11419,Pain and Pleasure,Judas Priest,1980,0.875085
7593,Too Many Puppies,Primus,1990,0.873275
19918,Kids in America,The Muffs,2010,0.872584


In [42]:
# Pearson correlation between the query's numeric feature row and every other row.
recommend(song, metric="pearson", data_type="numeric")


Recomendaciones para: Wonderwall (pearson, numeric)


Unnamed: 0,name,artist,decade,similarity
9256,Lonely Children,Foreigner,1970,0.995308
1584,My Friends Over You,New Found Glory,2000,0.995008
17943,Lie To Me,12 Stones,2000,0.994692
18125,Placeholder,The Story So Far,2010,0.994654
14488,Falling in Love Again,Joyce Manor,2010,0.994046


In [43]:
# Cosine similarity over the TF-IDF representation of the lyrics.
recommend(song, metric="cosine", data_type="text")


Recomendaciones para: Wonderwall (cosine, text)


Unnamed: 0,name,artist,decade,similarity
3199,Jesus Saves,Slayer,1980,0.381213
6176,You Make It Real,James Morrison,2000,0.318198
4321,Fireproof,One Direction,2010,0.301804
17635,Crimson Ghost,Misfits,1990,0.274993
14871,Love Me Like You Mean It,Percy Sledge,1960,0.267984


In [44]:
# Jaccard similarity over binary word-presence vectors of the lyrics
# (this metric builds its own bag-of-words inside recommend()).
recommend(song, metric="jaccard", data_type="text")


Recomendaciones para: Wonderwall (jaccard, text)


Unnamed: 0,name,artist,decade,similarity
5448,Feel A Whole Lot Better,Tom Petty,1980,0.272727
3863,Hit the Floor,Linkin Park,2000,0.26875
14833,I'll Feel a Whole Lot Better,The Byrds,1960,0.267857
8735,Kinfolks,Sam Hunt,2020,0.263514
801,You Can Do Magic,America,1980,0.261194


In [45]:
# Cosine similarity over the numeric, TF-IDF and one-hot features stacked side by side.
recommend(song, metric="cosine", data_type="mixed")


Recomendaciones para: Wonderwall (cosine, mixed)


Unnamed: 0,name,artist,decade,similarity
1270,Don't Look Back In Anger,Oasis,1990,0.849128
7772,Hey Now!,Oasis,1990,0.839889
9936,The Masterplan,Oasis,1990,0.83056
1367,Tubthumping,Chumbawamba,1990,0.710935
15590,Why Do You Want Him?,Green Day,1990,0.705764
