In [4]:
!pip install kaggle
from google.colab import files
files.upload()  # Upload kaggle.json

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification
!unzip gtzan-dataset-music-genre-classification.zip



ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install librosa scikit-learn tensorflow

In [6]:
import os
import numpy as np
import librosa

# Update the data directory based on your extraction output.
data_dir = "Data/genres_original"  # Note the "Data/" prefix now.
genres = ["blues", "classical", "country", "disco", "hiphop",
          "jazz", "metal", "pop", "reggae", "rock"]

def extract_features(file_path):
    """
    Extract three audio features:
      - Tempo (BPM)
      - Average RMS Energy
      - Average Spectral Centroid
    """
    try:
        y, sr = librosa.load(file_path, duration=30)  # load first 30 seconds
        tempo = librosa.beat.tempo(y=y, sr=sr)[0]
        rms = np.mean(librosa.feature.rms(y=y))
        spec_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        return np.array([tempo, rms, spec_centroid])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return np.array([0, 0, 0])

# Process a subset for demonstration: 5 files per genre
song_features = []
song_ids = []
song_genres = []
song_file_paths = []
song_id = 0
max_files_per_genre = 5  # adjust as needed

for genre in genres:
    genre_path = os.path.join(data_dir, genre)
    if not os.path.exists(genre_path):
        print(f"Folder not found: {genre_path}")
        continue
    files_in_genre = [f for f in os.listdir(genre_path) if f.endswith(".wav")]
    files_in_genre = files_in_genre[:max_files_per_genre]
    for file in files_in_genre:
        file_path = os.path.join(genre_path, file)
        features = extract_features(file_path)
        song_features.append(features)
        song_ids.append(song_id)
        song_genres.append(genre)
        song_file_paths.append(file_path)
        song_id += 1

song_features = np.array(song_features)
print(f"Extracted features for {song_features.shape[0]} songs.")

	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(y=y, sr=sr)[0]


Extracted features for 50 songs.


In [7]:
import numpy as np

num_users = 100  # Simulated total number of users
num_songs = song_id  # Number of songs processed above
num_interactions = 2000  # Total simulated interactions

# Simulate user and song interactions
users_interactions = np.random.randint(0, num_users, size=(num_interactions, 1))
songs_interactions = np.random.randint(0, num_songs, size=(num_interactions, 1))
ratings = np.random.rand(num_interactions, 1)  # Simulated ratings (0 to 1)

# Map each interaction to its audio features (shape: [num_interactions, 3])
audio_inputs = np.array([song_features[song_idx] for song_idx in songs_interactions.flatten()])

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Custom RMSE metric
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Define inputs
user_input = Input(shape=(1,), name='user_input')
song_input = Input(shape=(1,), name='song_input')
audio_input = Input(shape=(3,), name='audio_features')

# Collaborative Filtering Pathway (User & Song embeddings)
user_embedding = Embedding(input_dim=num_users, output_dim=50, name='user_embedding')(user_input)
song_embedding = Embedding(input_dim=num_songs, output_dim=50, name='song_embedding')(song_input)

user_vec = Flatten()(user_embedding)
song_vec = Flatten()(song_embedding)

interaction = Concatenate()([user_vec, song_vec])
collaborative = Dense(64, activation='relu')(interaction)
collaborative = Dropout(0.3)(collaborative)

# Content-Based Pathway (Audio features)
content = Dense(32, activation='relu')(audio_input)
content = Dropout(0.3)(content)

# Merge both pathways
merged = Concatenate()([collaborative, content])
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.3)(merged)
merged = Dense(32, activation='relu')(merged)

# Output Layer: Predicting a continuous rating
output = Dense(1, activation='linear', name='output')(merged)

# Build and compile the model
model = Model(inputs=[user_input, song_input, audio_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=[rmse, 'mae'])
model.summary()

In [9]:
history = model.fit([users_interactions, songs_interactions, audio_inputs], ratings,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 25653.6699 - mae: 118.1565 - rmse: 144.5610 - val_loss: 6.0890 - val_mae: 1.9343 - val_rmse: 2.4468
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2953.3518 - mae: 40.9912 - rmse: 53.4916 - val_loss: 103.1462 - val_mae: 9.2898 - val_rmse: 10.1415
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1563.6328 - mae: 30.0310 - rmse: 39.1161 - val_loss: 53.9674 - val_mae: 6.5476 - val_rmse: 7.3284
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1024.3210 - mae: 24.8855 - rmse: 31.7249 - val_loss: 36.0138 - val_mae: 5.2138 - val_rmse: 5.9812
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 759.9844 - mae: 21.0962 - rmse: 27.2881 - val_loss: 14.7542 - val_mae: 3.1609 - val_rmse: 3.8184
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━