In [66]:
import csv
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import umap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [67]:
# Location of the serialized model. It can be overridden through the
# MELODAI_MODEL_PATH environment variable so the notebook is not tied to one
# machine's directory layout; the previously hard-coded location stays the default.
model_path = os.environ.get(
    "MELODAI_MODEL_PATH",
    "/home/matthieu/UbuntuData/PycharmProjects/MelodAI/models/banger_random_forest_model.pkl",
)

In [68]:
# Load the trained model from the file at `model_path`.
# joblib.load unpickles the file, so `model_path` should only point to a trusted artifact.
# NOTE(review): `joblib` is already imported in the first cell; this extra import
# only adds the bare name `load`.
from joblib import load

model = load(model_path)

In [69]:
from src.data.make_data_for_prediction import prepare_data_df

In [70]:
def parse_track_data(track_data: str):
    """Parse one CSV-formatted track record into a dictionary of typed fields.

    The record must follow this column order: a leading row index (ignored),
    then track_id, artists, album_name, track_name, popularity, duration_ms,
    explicit, danceability, energy, key, loudness, mode, speechiness,
    acousticness, instrumentalness, liveness, valence, tempo, time_signature
    and track_genre. Any fields after the 21st are ignored.

    The line is split with the ``csv`` module instead of ``str.split(',')`` so
    that quoted fields containing commas (for example an album called
    ``"Greatest Hits, Vol. 1"``) stay in one piece and a trailing line
    terminator is not attached to the last field.

    Args:
        track_data: One record as a comma-separated string.

    Returns:
        dict: Field name -> value. ``popularity``, ``duration_ms``, ``key``,
        ``mode`` and ``time_signature`` are ints, ``explicit`` is a bool (True
        only for the exact text ``'True'``), the audio features are floats and
        the remaining fields are strings.

    Raises:
        IndexError: If the record has fewer than 21 fields.
        ValueError: If a numeric field cannot be converted.
    """
    expected_fields = 21

    # csv.reader honours double-quote quoting; the [] default covers an input
    # that produces no row at all.
    data = next(csv.reader([track_data]), [])

    # IndexError is what the positional lookups below would raise anyway; raising
    # it up front gives a message that says what is actually wrong.
    if len(data) < expected_fields:
        raise IndexError(
            f"expected at least {expected_fields} comma-separated fields, got {len(data)}"
        )

    # Index 0 is the dataset row number and is deliberately skipped.
    input_data = {
        "track_id": data[1],
        "artists": data[2],
        "album_name": data[3],
        "track_name": data[4],
        "popularity": int(data[5]),
        "duration_ms": int(data[6]),
        "explicit": data[7] == 'True',
        "danceability": float(data[8]),
        "energy": float(data[9]),
        "key": int(data[10]),
        "loudness": float(data[11]),
        "mode": int(data[12]),
        "speechiness": float(data[13]),
        "acousticness": float(data[14]),
        "instrumentalness": float(data[15]),
        "liveness": float(data[16]),
        "valence": float(data[17]),
        "tempo": float(data[18]),
        "time_signature": int(data[19]),
        "track_genre": data[20]
    }

    return input_data


In [71]:
def parse_multiple_tracks_data(tracks_data):
    """Parse several CSV-formatted track records into a list of dictionaries.

    Each record is handed to ``parse_track_data`` so the field layout and the
    type conversions are defined in exactly one place.

    Args:
        tracks_data: An iterable of record strings (one track per item). A
            single string is also accepted and is split into lines first;
            iterating it directly would yield individual characters.

    Returns:
        list[dict]: One dictionary per record, in input order.

    Raises:
        IndexError: If a record has too few fields.
        ValueError: If a numeric field of a record cannot be converted.
    """
    if isinstance(tracks_data, str):
        tracks_data = tracks_data.splitlines()

    return [parse_track_data(track_data) for track_data in tracks_data]


In [84]:
# Sample records in raw CSV form. Field order: row index, track_id, artists,
# album_name, track_name, popularity, duration_ms, explicit, danceability,
# energy, key, loudness, mode, speechiness, acousticness, instrumentalness,
# liveness, valence, tempo, time_signature, track_genre.
tracks_data = [
    "0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1.01e-06,0.358,0.715,87.917,4,acoustic",
    "1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,5.56e-06,0.101,0.267,77.489,4,acoustic",
    "2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic",
    "13,0X9MxHR1rTkEHDjp95F2OO,Anna Hamilton,Bad Liar,Bad Liar,62,248448,False,0.691,0.234,3,-6.441,1,0.0285,0.777,0.0,0.12,0.209,87.103,4,acoustic",
    "10,4mzP5mHkRvGxdhdGdAH7EJ,Zack Tabudlo,Episode,Give Me Your Forever,74,244800,False,0.627,0.363,8,-8.127,1,0.0291,0.279,0.0,0.0928,0.301,99.905,4,acoustic",
    "9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic",
    "42,0e5PAxSyZ5DWWVqKANHETz,Brandi Carlile;Lucius,Country Car Hits,You and Me on the Rock,0,230098,False,0.568,0.686,1,-6.635,1,0.033,0.15,1.81e-06,0.0881,0.725,172.075,4,acoustic",
    "4014,5VYBergVrUDcb8QyEg70cF,Cigarettes After Sex,Affection,Keep on Loving You,65,233010,False,0.456,0.364,2,-9.336,0,0.0252,0.533,0.96,0.111,0.116,73.457,4,ambient",
    "2416,40lKptao1hxVqA7fd1OOGp,Chris Tomlin,And If Our God Is For Us...,Our God,63,285160,False,0.509,0.778,11,-3.608,1,0.0295,0.00998,0.0,0.19,0.216,105.066,4,alt-rock",
    "1362,4n6CwGRWj7dRtDqmQCX4Pz,Jorge Drexler,Silencio,Silencio,21,207986,False,0.887,0.568,4,-10.146,0,0.194,0.263,0.00285,0.0951,0.127,127.923,4,afrobeat",
]

# Single-record batch: just the first sample, still wrapped in a list.
track_data = tracks_data[:1]

# Turn the selected record(s) into a list of typed dictionaries.
input_data = parse_multiple_tracks_data(track_data)

In [85]:
# Display the parsed records to check the field values and their types.
input_data

[{'track_id': '5SuOikwiRyPMVoIQDJUgSV',
  'artists': 'Gen Hoshino',
  'album_name': 'Comedy',
  'track_name': 'Comedy',
  'popularity': 73,
  'duration_ms': 230666,
  'explicit': False,
  'danceability': 0.676,
  'energy': 0.461,
  'key': 1,
  'loudness': -6.746,
  'mode': 0,
  'speechiness': 0.143,
  'acousticness': 0.0322,
  'instrumentalness': 1.01e-06,
  'liveness': 0.358,
  'valence': 0.715,
  'tempo': 87.917,
  'time_signature': 4,
  'track_genre': 'acoustic'}]

In [86]:
# Build a DataFrame from the parsed track dictionaries:
# one row per track, one column per dictionary key.
X = pd.DataFrame(input_data)

In [87]:
# Display the raw feature frame before any encoding or column removal.
X

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic


In [88]:
# Identifier / free-text columns that are set aside instead of being fed to the pipeline
unexploitable_columns = ['track_id', 'artists', 'album_name', 'track_name', 'track_genre']

# Integer-encode 'track_genre' into a new 'track_genre_encoded' column.
# NOTE(review): the encoder is fitted on the rows being predicted, so the codes
# depend only on the genres present in this frame -- confirm that is intended.
label_encoder = LabelEncoder()
X = X.assign(track_genre_encoded=label_encoder.fit_transform(X['track_genre']))

# Keep the set-aside columns in their own frame
trash = X.loc[:, unexploitable_columns]

# Continue with the remaining columns only
X = X.drop(columns=unexploitable_columns)

In [89]:
# NOTE(review): the scaler and the UMAP reducer below are fitted on the very rows
# being predicted. With a single input row every standardized feature is 0 and the
# UMAP components displayed below are all 0.0, so the model receives constant
# input. Re-using the transformers fitted at training time (and calling
# .transform()) is presumably what is needed -- TODO confirm where they are stored.

# Every column except the encoded genre is standardized; selecting by name keeps
# the column labels tied to the data that is actually scaled.
feature_columns = X.columns.drop('track_genre_encoded')

scaler = StandardScaler()
scaled_features = scaler.fit_transform(X[feature_columns])

# Rebuild a frame from the scaled array, keeping X's index so later joins line up
df_std_scaler = pd.DataFrame(scaled_features, columns=feature_columns, index=X.index)

# Add the encoded genre column back
df_std_scaler['track_genre_encoded'] = X['track_genre_encoded']

# Drop rows with null values
df_std_scaler = df_std_scaler.dropna()

# UMAP for dimensionality reduction; a fixed random_state makes re-runs reproducible.
# NOTE(review): df_std_scaler still contains 'track_genre_encoded' at this point, so
# the encoded genre feeds into the embedding -- confirm this mirrors training.
umap_reducer = umap.UMAP(n_components=6, random_state=42)
umap_embedding = umap_reducer.fit_transform(df_std_scaler)

# Carry over the index of the rows that survived dropna(). With a fresh RangeIndex
# the concat below would pair embeddings with the wrong genre as soon as a row had
# been dropped.
df_umap = pd.DataFrame(
    umap_embedding,
    columns=['UMAP1', 'UMAP2', 'UMAP3', 'UMAP4', 'UMAP5', 'UMAP6'],
    index=df_std_scaler.index,
)

# Attach the encoded genre column (aligned on the index)
df_umap = pd.concat([df_umap, X['track_genre_encoded']], axis=1)

# Rows removed before UMAP have no components here and are dropped again
df_umap = df_umap.dropna()

In [90]:
# Display the UMAP components together with the encoded genre column.
df_umap

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,track_genre_encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [91]:
# Feed only the UMAP components to the model; the encoded genre column is left out
features = df_umap.drop(columns=["track_genre_encoded"])
prediction = model.predict(features)

In [92]:
# Display the model output (one value per input row).
prediction

array([97.])