In [93]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.stats import pearsonr
from scipy.spatial.distance import jaccard
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [94]:
# Load the song dataset from a path relative to the notebook's working directory.
# Later cells read the columns "name", "artist", "decade", "lyrics" and the
# numeric audio columns listed in `num_cols`.
df = pd.read_csv('data/dataset_processed.csv')

In [95]:
# Discard songs without lyrics (the text vectorizers cannot handle NaN), then
# renumber the rows 0..n-1. Every feature matrix built below is row-aligned
# with `df` by *position*, so the index labels must equal the row positions;
# without the reset, labels after the first dropped row point at the wrong
# matrix row (or past its end).
df = df.dropna(subset=["lyrics"]).reset_index(drop=True)

In [96]:
# Numeric columns that make up the "numeric" feature space.
num_cols = [
    "valence",
    "danceability",
    "energy",
    "acousticness",
    "instrumentalness",
    "liveness",
    "tempo",
    "loudness",
    "mode",
    "key",
]

# Min-max scale each column so that wide-ranged features (e.g. tempo, loudness)
# do not dominate the distance computations, then keep a CSR copy so the
# numeric block can be stacked with the sparse text/categorical blocks.
scaler = MinMaxScaler().fit(df[num_cols])
num_features = scaler.transform(df[num_cols])
num_sparse = csr_matrix(num_features)

In [97]:
# Vectorize the lyrics with TF-IDF, dropping English stop words and keeping
# only the 8000 most frequent terms of the corpus.
tfidf = TfidfVectorizer(
    max_features=8000,
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(df["lyrics"])

In [98]:
# One-hot encode the categorical columns into a sparse matrix; categories not
# seen during fitting are encoded as all-zero rows instead of raising.
cat_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
)
cat_features = cat_encoder.fit_transform(df[["artist", "decade"]])

In [99]:
def get_top_k(sim_vector, k=10, threshold=0.9999):
    """Return the positions of the ``k`` highest scores below ``threshold``.

    Scores greater than or equal to ``threshold`` are treated as the query
    item itself (or an exact duplicate of it) and are skipped.

    Parameters
    ----------
    sim_vector : array-like of float
        One similarity score per candidate item.
    k : int, default 10
        Maximum number of positions to return.
    threshold : float, default 0.9999
        Scores at or above this value are excluded from the result.

    Returns
    -------
    list
        Up to ``k`` integer positions into ``sim_vector``, ordered from the
        most to the least similar.
    """
    scores = np.asarray(sim_vector)
    ranked = np.argsort(scores)[::-1]
    # Filter *before* truncating: looking only at the k+1 best candidates and
    # filtering afterwards returns fewer than k results whenever more than one
    # item scores at or above the threshold (e.g. duplicated songs).
    eligible = ranked[scores[ranked] < threshold]
    return list(eligible[:k])

In [100]:
def recommend(song_name, metric="cosine", data_type="numeric", k=10):
    """Recommend the ``k`` songs most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Title to look up in ``df["name"]`` (case-insensitive). When several
        rows share the title, the first one is used as the query.
    metric : {"cosine", "euclidean", "manhattan", "pearson", "jaccard"}
        Similarity measure. Euclidean and Manhattan distances are mapped to
        similarities with ``1 / (1 + distance)``. ``"jaccard"`` always works
        on a binary bag-of-words of the lyrics and ignores the feature space
        selected by ``data_type``.
    data_type : {"numeric", "text", "categorical", "mixed"}
        Feature space: scaled numeric columns, TF-IDF of the lyrics, one-hot
        artist/decade, or the three stacked side by side.
    k : int, default 10
        Number of recommendations requested.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``name``, ``artist``, ``decade`` and ``similarity`` for the
        selected songs, or ``None`` (after printing a message) when the song
        is not found.

    Raises
    ------
    ValueError
        If ``data_type`` or ``metric`` is not one of the supported values.
    """
    # Locate the query by *position*, not by index label: the feature matrices
    # and ``df.iloc`` are positional, and the labels of ``df`` stop matching
    # positions as soon as rows have been dropped without resetting the index.
    title_matches = df["name"].str.lower().eq(song_name.lower())
    match_positions = np.flatnonzero(title_matches.fillna(False).to_numpy(dtype=bool))
    if match_positions.size == 0:
        print(f"Canción '{song_name}' no encontrada.")
        return
    song_pos = int(match_positions[0])

    if data_type == "numeric":
        data = num_sparse
    elif data_type == "text":
        data = tfidf_matrix
    elif data_type == "categorical":
        data = cat_features
    elif data_type == "mixed":
        # Request CSR explicitly so the stacked matrix supports row indexing
        # regardless of the format hstack would pick by default.
        data = hstack([num_sparse, tfidf_matrix, cat_features], format="csr")
    else:
        raise ValueError("Tipo de datos inválido: 'numeric', 'text', 'categorical' o 'mixed'")

    x = data[song_pos]

    if metric == "cosine":
        sim = cosine_similarity(x, data).flatten()
    elif metric == "euclidean":
        sim = 1 / (1 + euclidean_distances(x, data).flatten())
    elif metric == "manhattan":
        sim = 1 / (1 + manhattan_distances(x, data).flatten())
    elif metric == "pearson":
        # Row-by-row correlation; rows for which the coefficient is undefined
        # (NaN) are given a similarity of 0.
        x_dense = x.toarray().flatten()
        sim = np.array([pearsonr(x_dense, y.toarray().flatten())[0] for y in data])
        sim = np.nan_to_num(sim)
    elif metric == "jaccard":
        cv = CountVectorizer(binary=True, max_features=8000)
        bin_matrix = cv.fit_transform(df["lyrics"])
        x_bin = bin_matrix[song_pos]
        # Entries are 0/1, so a sparse product counts the shared words and the
        # row sums give the vocabulary size of each song; this avoids
        # densifying every row inside a Python loop.
        shared = (bin_matrix @ x_bin.T).toarray().ravel()
        sizes = np.asarray(bin_matrix.sum(axis=1)).ravel()
        union = sizes + x_bin.sum() - shared
        # Two empty word sets are treated as identical (similarity 1).
        sim = np.ones(union.shape, dtype=float)
        np.divide(shared, union, out=sim, where=union > 0)
    else:
        raise ValueError("Métrica inválida.")

    indices = get_top_k(sim, k)
    recs = df.iloc[indices][["name", "artist", "decade"]].copy()
    recs["similarity"] = sim[indices]

    query_title = df["name"].iloc[song_pos]
    print(f"\nRecomendaciones para: {query_title} ({metric}, {data_type})")
    return recs

In [101]:
# Title of the query song passed to recommend() in the cells below.
song = "Just the Way You Are"

In [102]:
# Compare the similarity measures on the numeric feature space.
for numeric_metric in ("cosine", "euclidean", "pearson"):
    print(recommend(song, metric=numeric_metric, data_type="numeric"))


Recomendaciones para: Just the Way You Are (cosine, numeric)
                                               name                 artist  \
19446                    When the Heartache Is Over            Tina Turner   
2150                             Fire and the Flood              Vance Joy   
3629                                      Bartender             (Hed) P.E.   
18062                            Such Great Heights  Streetlight Manifesto   
4214   Freak The Freak Out (feat. Victoria Justice)        Victorious Cast   
8471                            Deal with the Devil               Pop Evil   
6237                               According To You               Orianthi   
6112                                  Bread & Water           Ryan Bingham   
20058                                       Kiss Me          Casey Donahew   
12605                                  Off She Goes               Bad Suns   

       decade  similarity  
19446    1990    0.998663  
2150     2010    0.9979

In [103]:
# Compare cosine (TF-IDF) and Jaccard (binary bag-of-words) on the lyrics.
for text_metric in ("cosine", "jaccard"):
    print(recommend(song, metric=text_metric, data_type="text"))


Recomendaciones para: Just the Way You Are (cosine, text)
                                name           artist  decade  similarity
960                       So Amazing  Luther Vandross    1980    0.447924
8514                           Hello             OMFG    2010    0.414921
10020                Love Of My Life   Brian McKnight    2000    0.340674
17097                    All the Way     Judas Priest    1980    0.318356
19728                Detlef Schrempf   Band of Horses    2000    0.295776
17717  Slum Beautiful (feat. Cee-Lo)          OutKast    2000    0.295714
2162         Scars To Your Beautiful     Alessia Cara    2010    0.286229
1333                  You're the One              SWV    1990    0.274925
13908               You've Got A Way     Shania Twain    1990    0.268412
8420            Everything About You    One Direction    2010    0.267022

Recomendaciones para: Just the Way You Are (jaccard, text)
                                                    name          a

In [104]:
# Cosine similarity over the stacked numeric + text + categorical features.
mixed_recs = recommend(song, metric="cosine", data_type="mixed")
print(mixed_recs)


Recomendaciones para: Just the Way You Are (cosine, mixed)
                       name      artist  decade  similarity
8415                Natalie  Bruno Mars    2010    0.861249
2034   Locked out of Heaven  Bruno Mars    2010    0.851461
1956           Runaway Baby  Bruno Mars    2010    0.835371
14405  Money Make Her Smile  Bruno Mars    2010    0.826693
6332              If I Knew  Bruno Mars    2010    0.794093
14417              Treasure  Bruno Mars    2010    0.755790
8557                Finesse  Bruno Mars    2010    0.754191
2027    When I Was Your Man  Bruno Mars    2010    0.742695
10665                Summer  Marshmello    2010    0.742660
4244            Young Girls  Bruno Mars    2010    0.742517
