# Import data and library

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-spotify-hit-predictor-dataset/LICENSE
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-70s.csv
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-10s.csv
/kaggle/input/the-spotify-hit-predictor-dataset/README.txt
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-60s.csv
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-80s.csv
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-00s.csv
/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-90s.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error, precision_score

# Liste des fichiers de données pour différentes décennies
decades = ["dataset-of-00s.csv", "dataset-of-10s.csv", "dataset-of-60s.csv", "dataset-of-70s.csv", "dataset-of-80s.csv", "dataset-of-90s.csv"]

# Charger les données de toutes les décennies et les combiner en un seul DataFrame
all_data = pd.concat([pd.read_csv("/kaggle/input/the-spotify-hit-predictor-dataset/" + decade_file) for decade_file in decades])

# Diviser les données en caractéristiques (X) et cible (y)
X = all_data.drop(columns=["track", "artist", "uri", "target", "chorus_hit", "sections"])
y = all_data["target"]
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Normaliser les caractéristiques (mettre à l'échelle les valeurs entre 0 et 1)
X_train = (X_train - X_train.min()) / (X_train.max() - X_train.min())
X_test = (X_test - X_test.min()) / (X_test.max() - X_test.min())

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print(X_train)




X_train shape: (32884, 13)
X_test shape: (8222, 13)
y_train shape: (32884,)
y_test shape: (8222,)
      danceability    energy       key  loudness  mode  speechiness  \
2741      0.388664  0.282820  0.818182  0.637017   1.0     0.030126   
441       0.470648  0.743936  0.363636  0.764439   0.0     0.070084   
4208      0.422065  0.495873  0.181818  0.843576   1.0     0.031904   
2518      0.375506  0.999000  0.181818  0.861124   1.0     0.208159   
4340      0.609312  0.784946  0.636364  0.813706   1.0     0.060460   
...            ...       ...       ...       ...   ...          ...   
1941      0.940283  0.338834  0.090909  0.724041   1.0     0.388075   
3833      0.631579  0.579895  0.181818  0.761062   1.0     0.029603   
5192      0.638664  0.754938  0.636364  0.793385   1.0     0.142259   
6300      0.564777  0.817954  0.727273  0.852558   1.0     0.042155   
4325      0.557692  0.571893  0.545455  0.792668   1.0     0.034310   

      acousticness  instrumentalness  liveness   

# Define and train the model

In [3]:
# Créer un modèle CNN avec les paramètres actuels
model = keras.Sequential([
    layers.Reshape((X.shape[1], 1), input_shape=(X.shape[1],)),  # Couche d'entrée
    layers.Conv1D(128, kernel_size=3, activation='relu'),  # Couche de convolution
    layers.MaxPooling1D(pool_size=2),  # Couche de pooling
    layers.Conv1D(32, kernel_size=3, activation='relu'),  # Couche de convolution
    layers.MaxPooling1D(pool_size=2),  # Couche de pooling
    layers.Flatten(),  # Couche de mise à plat
    layers.Dense(32, activation='relu'),  # Couche dense
    layers.Dense(1, activation='sigmoid')  # Couche de sortie entre 0 et 1
])

# Compiler le modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Ajouter Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Entraîner le modèle avec Early Stopping
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<keras.callbacks.History at 0x7ee950261480>

# Test the model

In [4]:
# Évaluer le modèle sur les données de test
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy = {test_accuracy}")

# Prédire les étiquettes
predictions = model.predict(X_test)
threshold = 0.5  # Vous pouvez ajuster ce seuil au besoin

# Classification binaire basée sur le seuil
predicted_labels = (predictions > threshold).astype(int)

# Calculer le F1-score
f1 = f1_score(y_test, predicted_labels)
print(f"F1-Score = {f1}")

# Calculer la matrice de confusion
conf_matrix = confusion_matrix(y_test, predicted_labels)
print("Confusion Matrix = ")
print(conf_matrix)

# Calculer le Mean Squared Error (MSE)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error (MSE) = {mse}")

# Calculer la précision
precision = precision_score(y_test, predicted_labels)
print(f"Precision = {precision} ")

Test Accuracy = 0.7608854174613953
F1-Score = 0.7779534673593856
Confusion Matrix = 
[[2812 1251]
 [ 715 3444]]
Mean Squared Error (MSE) = 0.16205317386084728
Precision = 0.7335463258785943 


# Test with 1 song

In [5]:
print(f"X_test[:1] = {X_test[:1]}")

print("Generate a prediction")
prediction = model.predict(X_test[:1])
print("prediction shape:", prediction)

X_test[:1] =       danceability    energy       key  loudness  mode  speechiness  \
3690      0.149608  0.046626  0.727273  0.323468   0.0     0.022936   

      acousticness  instrumentalness  liveness   valence     tempo  \
3690      0.995984          0.861862  0.075862  0.128153  0.494759   

      duration_ms  time_signature  
3690     0.017733             0.5  
Generate a prediction
prediction shape: [[0.00350052]]


In [6]:
# Despasito from https://developer.spotify.com/documentation/web-api/reference/get-audio-features

data = {
  "danceability": 0.655,
  "energy": 0.797,
  "key": 2,
  "loudness": 4.787,
  "mode": 1,
  "speechiness": 0.153,
  "acousticness": 0.198,
  "instrumentalness": 0,
  "liveness": 0.067,
  "valence": 0.839,
  "tempo": 177.928,
  "type": "audio_features",
  "id": "6habFhsOp2NvshLv26DqMb",
  "uri": "spotify:track:6habFhsOp2NvshLv26DqMb",
  "track_href": "https://api.spotify.com/v1/tracks/6habFhsOp2NvshLv26DqMb",
  "analysis_url": "https://api.spotify.com/v1/audio-analysis/6habFhsOp2NvshLv26DqMb",
  "duration_ms": 229360,
  "time_signature": 4
}

df = pd.DataFrame(data, index=[0])

print(df)

X = all_data[:1]
print(f"X = {X} \n")
X = X.drop(columns=["track", "artist", "uri", "target"])
print(f"X = {X} \n")


despasito = df.drop(columns=["type", "id", "uri", "track_href", "analysis_url"])
print(f"despasito = {despasito}")



print("Generate a prediction")
prediction = model.predict(despasito)
print("prediction:", prediction)

if prediction > 0.5:
    print("It's a hit !")

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.655   0.797    2     4.787     1        0.153         0.198   

   instrumentalness  liveness  valence    tempo            type  \
0                 0     0.067    0.839  177.928  audio_features   

                       id                                   uri  \
0  6habFhsOp2NvshLv26DqMb  spotify:track:6habFhsOp2NvshLv26DqMb   

                                          track_href  \
0  https://api.spotify.com/v1/tracks/6habFhsOp2Nv...   

                                        analysis_url  duration_ms  \
0  https://api.spotify.com/v1/audio-analysis/6hab...       229360   

   time_signature  
0               4  
X =        track             artist                                   uri  \
0  Lucky Man  Montgomery Gentry  spotify:track:4GiXBCUF7H6YfNQsnBRIzl   

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.578   0.471    4     -7.27     1       0.0289    