In [1]:
import hdf5_getters as GETTERS
import h5py
import glob

In [2]:
# Locate a single song file
filename = glob.glob(
    r'C:\Users\Mario\Documents\1.Estudios\1.UNIR\TFM\Datasets\millionsongsubset\MillionSongSubset\A\A\A\TRAAAAW128F429D538.h5'
)[0]

# Open the file; try/finally guarantees the handle is closed even if a getter fails
h5 = GETTERS.open_h5_file_read(filename)
try:
    # The text getters return bytes (they would print as b'...'), so decode them
    # before formatting, the same way the feature-extraction cell does.
    artist_name = GETTERS.get_artist_name(h5).decode('utf-8')
    song_title = GETTERS.get_title(h5).decode('utf-8')
    year = GETTERS.get_year(h5)
    duration = GETTERS.get_duration(h5)
finally:
    h5.close()

print(
    f"Artist: {artist_name}, Song: {song_title}, Year: {year}, Duration: {duration:.2f}s")

Artist: b'Casual', Song: b"I Didn't Mean To", Year: 0, Duration: 218.93s


#### 1. Crear la sesión con SPARK

In [14]:
# Create (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

# To check the installed PySpark version:
# import pyspark
# print(pyspark.__version__)

spark = (
    SparkSession.builder
    .appName("MSD Recommender")
    .master("spark://localhost:7077")
    .config("spark.driver.host", "192.168.1.130")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "3")
    .getOrCreate()
)

In [13]:
# Stop the Spark session.
# NOTE(review): the cells that follow still reference `spark`; if this cell is run in
# top-to-bottom order the session is stopped before they execute. Presumably it is
# meant to be run manually when the work session is over — TODO confirm.
spark.stop()

In [15]:
df_test = spark.range(10)  # small smoke-test DataFrame with 10 rows
print(df_test.count())  # should print 10

10


#### Analizar el dataset

In [16]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

import numpy as np
import hdf5_getters as GETTERS
import h5py
import glob
import pandas as pd

# Keep a handle to the SparkContext that backs the active SparkSession
sc = spark.sparkContext

In [17]:
# Root folder of the dataset and every .h5 file found anywhere beneath it
dataset_path = "C:/Users/Mario/Documents/1.Estudios/1.UNIR/TFM/Datasets/millionsongsubset/MillionSongSubset"
h5_pattern = f"{dataset_path}/**/*.h5"
file_paths = glob.glob(h5_pattern, recursive=True)


In [24]:
# Accumulators: one feature dict per song, and one metadata dict per song
# (NOTE: the metadata list was renamed to metadata_list)
data = list()
metadata_list = list()


def extract_features(path):
    """Read one song .h5 file and return its numeric features and its metadata.

    Args:
        path: Path of the HDF5 song file, passed to GETTERS.open_h5_file_read.

    Returns:
        A ``(features, meta)`` tuple of dicts on success:
          * ``features`` holds tempo, loudness, duration, key, mode and
            time_signature as floats, the song_id as str, and the per-column
            mean / standard deviation of the segment timbre matrix under
            ``timbre_mean_0..11`` / ``timbre_std_0..11`` (all zeros when the
            file has no timbre segments).
          * ``meta`` holds song_id, title and artist as str.
        ``(None, None)`` if anything fails; the error is printed, not raised.
    """
    try:
        h5 = GETTERS.open_h5_file_read(path)
        try:
            segments_timbre = GETTERS.get_segments_timbre(h5)
            if segments_timbre is None or len(segments_timbre) == 0:
                timbre_mean = [0.0] * 12
                timbre_std = [0.0] * 12
            else:
                # Collapse the per-segment rows into one mean/std per timbre column
                timbre_mean = np.mean(segments_timbre, axis=0).tolist()
                timbre_std = np.std(segments_timbre, axis=0).tolist()

            # Read once and reuse for both the feature row and the metadata row
            song_id = GETTERS.get_song_id(h5).decode('utf-8')

            features = {
                'tempo': float(GETTERS.get_tempo(h5)),
                'loudness': float(GETTERS.get_loudness(h5)),
                'duration': float(GETTERS.get_duration(h5)),
                'key': float(GETTERS.get_key(h5)),
                'mode': float(GETTERS.get_mode(h5)),
                'time_signature': float(GETTERS.get_time_signature(h5)),
                'song_id': song_id
            }

            for i in range(12):
                features[f'timbre_mean_{i}'] = timbre_mean[i]
                features[f'timbre_std_{i}'] = timbre_std[i]

            meta = {
                'song_id': song_id,
                'title': GETTERS.get_title(h5).decode('utf-8'),
                'artist': GETTERS.get_artist_name(h5).decode('utf-8')
            }

            return features, meta
        finally:
            # Always release the file handle, including when a getter or decode
            # fails; a failure in close() itself is still caught by the outer except.
            h5.close()

    except Exception as e:
        print(f"Error procesando el fichero: {path} --> {e}")
        return None, None


# Process the files one by one, then keep every non-empty feature / metadata record
extracted = [extract_features(path) for path in file_paths]
data.extend(features for features, _ in extracted if features is not None)
metadata_list.extend(meta for _, meta in extracted if meta is not None)

# Turn both lists into pandas DataFrames
df_pd = pd.DataFrame(data)
df_meta = pd.DataFrame(metadata_list)

# Persist the metadata next to the notebook
df_meta.to_csv("songs_metadata.csv", index=False)

# Hand the feature table over to PySpark
df = spark.createDataFrame(df_pd)

# Preview the first rows
df.show(5)

+-------+--------+---------+---+----+--------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------------------+
|  tempo|loudness| duration|key|mode|time_signature|           song_id|     timbre_mean_0|      timbre_std_0|     timbre_mean_1|      timbre_std_1|     timbre_mean_2|      timbre_std_2|      timbre_mean_3|      timbre_std_3|      timbre_mean_4|      timbre_std_4|     timbre_mean_5|      timbre_std_5|      timbre_mean_6|      timbre_std_6|       timbre_mean_7|      timbre_std_7|       timbre_mean_8|      timbre_std_8|     timbre_mean_9|      timbre_std_9|     

#### Entrenar el Modelo

In [20]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

# Model inputs: the six scalar descriptors, then the 12 timbre means, then the 12 timbre stds
scalar_cols = ['tempo', 'loudness', 'duration', 'key', 'mode', 'time_signature']
timbre_mean_cols = [f'timbre_mean_{i}' for i in range(12)]
timbre_std_cols = [f'timbre_std_{i}' for i in range(12)]
feature_cols = scalar_cols + timbre_mean_cols + timbre_std_cols

# Pack the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec")
df_assembled = assembler.transform(df)

# Fit the scaler on the assembled data and apply it
scaler = StandardScaler(inputCol="features_vec", outputCol="scaled_features")
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

# Cluster the scaled vectors into 20 groups; the fixed seed is passed to KMeans
kmeans = KMeans(
    featuresCol='scaled_features',
    predictionCol='cluster_id',
    k=20,
    seed=42,
)
model = kmeans.fit(df_scaled)


#### Recomendación 

In [22]:
import numpy as np
import pandas as pd
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeansModel

# Prerequisites defined in earlier cells:
# model, scaler_model, assembler, df_scaled

# 1. Simulate the new song
new_song = {
    'tempo': 120.0,
    'loudness': -5.0,
    'duration': 210.0,
    'key': 2.0,
    'mode': 1.0,
    'time_signature': 4.0,
    'song_id': 'NEW_SONG_ID'
}

# Simulate the timbre features. A seeded generator is used (instead of the global,
# unseeded np.random state) so the simulated song is the same on every run.
rng = np.random.default_rng(42)
for i in range(12):
    new_song[f'timbre_mean_{i}'] = float(rng.uniform(-50, 50))
    new_song[f'timbre_std_{i}'] = float(rng.uniform(10, 50))

# 2. Convert to a Spark DataFrame
new_song_df_pd = pd.DataFrame([new_song])
new_song_df = spark.createDataFrame(new_song_df_pd)

# 3. Assemble and scale with the already-fitted assembler / scaler
new_song_vec = assembler.transform(new_song_df)
new_song_scaled = scaler_model.transform(new_song_vec)

# 4. Predict the cluster (the DataFrame holds exactly one row)
new_song_pred = model.transform(new_song_scaled)
predicted_cluster = new_song_pred.select("cluster_id").first()["cluster_id"]

print(f"La nueva canción ha sido asignada al cluster: {predicted_cluster}")

# 5. Look up songs from the original dataset that fall in the same cluster

# Cluster assignments were not stored on the original DataFrame, so compute them here
df_with_cluster = model.transform(df_scaled)

# Keep only the rows in the predicted cluster
similar_songs = df_with_cluster.filter(
    df_with_cluster.cluster_id == predicted_cluster)

# Show a few recommendations (3 here).
# NOTE(review): limit() is applied without an explicit ordering, so which 3 rows
# come back is presumably not guaranteed to be stable between runs — confirm.
similar_songs.select("song_id").limit(3).show()

La nueva canción ha sido asignada al cluster: 13
+------------------+
|           song_id|
+------------------+
|SOIJXXM12A8C1416D6|
|SOTUBVH12AB018D2EC|
|SOFGQAP12AB0185946|
+------------------+



In [26]:
# Load the song metadata saved earlier
metadata_df = pd.read_csv("songs_metadata.csv")

# Collect the recommended song_ids produced by the model (3 of them)
recommended_rows = similar_songs.select("song_id").limit(3).collect()
recommended_song_ids = [row['song_id'] for row in recommended_rows]

# Keep only the metadata rows of the recommended songs (title and artist live there)
is_recommended = metadata_df['song_id'].isin(recommended_song_ids)
recommended_songs = metadata_df[is_recommended]

# Show the result
print(recommended_songs[['song_id', 'title', 'artist']])

                song_id             title          artist
69   SOIJXXM12A8C1416D6  Rosemary Recalls   Bruce Rowland
85   SOTUBVH12AB018D2EC  Indian Love Call    Slim Whitman
175  SOFGQAP12AB0185946    Dearly Beloved  Norrie Paramor
