In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [2]:
PATH_DATA = 'all.csv'

In [3]:
sp = pd.read_csv(PATH_DATA, sep = '\t')

In [4]:
sp.shape

(671, 12)

In [5]:
sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         671 non-null    object
 1   artist        671 non-null    object
 2   release       671 non-null    object
 3   bpm           671 non-null    int64 
 4   energy        671 non-null    int64 
 5   danceability  671 non-null    int64 
 6   loud          671 non-null    int64 
 7   valence       671 non-null    int64 
 8   length        671 non-null    object
 9   acoustic      671 non-null    int64 
 10  popularity    671 non-null    int64 
 11  style         671 non-null    object
dtypes: int64(7), object(5)
memory usage: 63.0+ KB


In [6]:
sp.head()

Unnamed: 0,title,artist,release,bpm,energy,danceability,loud,valence,length,acoustic,popularity,style
0,Don't Let Me Down,The Chainsmokers,5/02/16,160,87,53,-5,42,03:28,16,84,POP
1,In the Name of Love,Martin Garrix,29/07/16,134,52,50,-6,17,03:16,11,83,POP
2,FRIENDS,Marshmello,9/02/18,95,88,63,-2,53,03:23,21,90,POP
3,The Middle,Zedd,23/01/18,107,65,75,-3,43,03:05,21,54,POP
4,Never Forget You,Zara Larsson,10/09/15,146,73,58,-6,28,03:33,0,22,POP


In [7]:
def length_to_sec(length):
    """Convert a clock-style duration string into a total number of seconds.

    Accepts ``"MM:SS"`` (the format seen in the ``length`` column), and also
    ``"H:MM:SS"`` for tracks longer than an hour or a bare ``"SS"`` value.
    Surrounding whitespace is ignored.

    Parameters
    ----------
    length : str
        Duration made of colon-separated integer fields, e.g. ``"03:28"``.

    Returns
    -------
    int
        Duration in seconds (``"03:28"`` -> ``208``).

    Raises
    ------
    ValueError
        If a field is not an integer or there are more than three fields.
    """
    fields = length.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"unsupported duration format: {length!r}")

    # Fold the fields left to right in base 60: hours -> minutes -> seconds.
    total = 0
    for field in fields:
        total = total * 60 + int(field)
    return total

In [8]:
sp['length seconds'] = sp['length'].apply(length_to_sec)

In [9]:
sp.head()

Unnamed: 0,title,artist,release,bpm,energy,danceability,loud,valence,length,acoustic,popularity,style,length seconds
0,Don't Let Me Down,The Chainsmokers,5/02/16,160,87,53,-5,42,03:28,16,84,POP,208
1,In the Name of Love,Martin Garrix,29/07/16,134,52,50,-6,17,03:16,11,83,POP,196
2,FRIENDS,Marshmello,9/02/18,95,88,63,-2,53,03:23,21,90,POP,203
3,The Middle,Zedd,23/01/18,107,65,75,-3,43,03:05,21,54,POP,185
4,Never Forget You,Zara Larsson,10/09/15,146,73,58,-6,28,03:33,0,22,POP,213


In [10]:
# Correlate only the numeric columns: text columns such as `title` or `style`
# cannot be correlated, and `DataFrame.corr()` raises on them with pandas >= 2.0.
sp.select_dtypes(include='number').corr()['danceability']

bpm              -0.044197
energy            0.290425
danceability      1.000000
loud              0.410181
valence           0.546807
acoustic         -0.312721
popularity        0.187963
length seconds   -0.121361
Name: danceability, dtype: float64

In [11]:
features = ['energy', 'loud', 'valence']
target = 'danceability'

In [12]:
# Reference track expressed as a labelled vector, one value per model feature.
# `name=0` mirrors the row label a one-row DataFrame would give it.
my_music = pd.Series([87, -6, 42], index=features, name=0)

In [13]:
my_music

energy     87
loud       -6
valence    42
Name: 0, dtype: int64

In [14]:
random_music = sp.sample(1, random_state=1).iloc[0]
random_music

title             Facing My Fear
artist                    Cortes
release                 29/10/16
bpm                          168
energy                        81
danceability                  41
loud                          -3
valence                       64
length                     03:22
acoustic                       0
popularity                     2
style                       ROCK
length seconds               202
Name: 107, dtype: object

In [15]:
def distance(music1, music2):
    """Return the Euclidean distance between two feature vectors.

    Both arguments must support ``-``, ``**`` and ``.sum()`` (pandas Series in
    this notebook). The element-wise differences are squared, summed, and the
    square root of that sum is returned.
    """
    gap = music1 - music2
    squared_gap = gap ** 2
    return squared_gap.sum() ** 0.5

distance(my_music, random_music)

23.0

In [16]:
def distance_from_my_music(other_music):
    """Measure how far ``other_music`` is from the notebook's reference track.

    Thin adapter around ``distance`` that pins the first argument to the
    global ``my_music``, giving a one-argument callable.
    """
    reference = my_music
    return distance(reference, other_music)

In [17]:
distances = sp[features].apply(distance_from_my_music, axis=1)

In [18]:
sp['Distance from my music'] = distances

In [19]:
sp.head()

Unnamed: 0,title,artist,release,bpm,energy,danceability,loud,valence,length,acoustic,popularity,style,length seconds,Distance from my music
0,Don't Let Me Down,The Chainsmokers,5/02/16,160,87,53,-5,42,03:28,16,84,POP,208,1.0
1,In the Name of Love,Martin Garrix,29/07/16,134,52,50,-6,17,03:16,11,83,POP,196,43.011626
2,FRIENDS,Marshmello,9/02/18,95,88,63,-2,53,03:23,21,90,POP,203,11.74734
3,The Middle,Zedd,23/01/18,107,65,75,-3,43,03:05,21,54,POP,185,22.226111
4,Never Forget You,Zara Larsson,10/09/15,146,73,58,-6,28,03:33,0,22,POP,213,19.79899


# Danceability moyenne des 10 plus proches voisins ?

In [20]:
sp.sort_values(by='Distance from my music').head(10)['danceability'].mean()

44.1

In [21]:
knn_mono = KNeighborsRegressor(10, algorithm='brute')

In [22]:
X_mono = sp[['energy']]
y = sp['danceability']

In [23]:
knn_mono.fit(X_mono, y)

KNeighborsRegressor(algorithm='brute', n_neighbors=10)

In [24]:
# Query with a DataFrame carrying the training column name: the model was
# fitted on a DataFrame, and scikit-learn warns when predict() then receives
# data without feature names.
knn_mono.predict(pd.DataFrame({'energy': [52, 60, 45]}))

array([59.2, 65.1, 62.1])

In [25]:
knn_multi = KNeighborsRegressor(10, algorithm='brute')

In [26]:
X_multi = sp[['energy', 'loud', 'valence']]

In [27]:
knn_multi.fit(X_multi, y)

KNeighborsRegressor(algorithm='brute', n_neighbors=10)

In [28]:
# Query with named columns matching the DataFrame the model was fitted on,
# which avoids scikit-learn's missing-feature-names warning.
knn_multi.predict(
    pd.DataFrame([[87, -6, 42]], columns=['energy', 'loud', 'valence'])
)

array([46.4])

##### Pour comparer des modèles, nous pouvons utiliser le MSE = Mean Squared Error

In [29]:
Y_predicted_mono = knn_mono.predict(sp[['energy']])

In [30]:
mse_mono = ((y - Y_predicted_mono) ** 2).mean()
mse_mono

209.1635171385991

In [31]:
mse_mono ** 0.5

14.462486547568474

In [32]:
Y_predicted_multi = knn_multi.predict(sp[['energy', 'loud', 'valence']])

In [33]:
mse_multi = ((y - Y_predicted_multi) ** 2).mean()
mse_multi

149.33047690014902

In [34]:
mse_multi ** 0.5

12.220084979252356

In [35]:
mean_squared_error(y, Y_predicted_multi) ** 0.5

12.220084979252356

##### Overfitting

In [36]:
train, test = train_test_split(sp, test_size=0.2, random_state=1)

In [37]:
knn = KNeighborsRegressor(5, algorithm='brute')

In [38]:
knn.fit(train[['energy', 'loud', 'valence']], train['danceability'])

KNeighborsRegressor(algorithm='brute')

In [39]:
# Query with named columns matching the training DataFrame, which avoids
# scikit-learn's missing-feature-names warning.
knn.predict(
    pd.DataFrame([[87, -6, 42]], columns=['energy', 'loud', 'valence'])
)

array([38.6])

In [40]:
mse = mean_squared_error(
    test['danceability'],
    knn.predict(test[['energy', 'loud', 'valence']])
)

In [41]:
mse ** 0.5

14.529969029560936

In [42]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

In [43]:
# Manual k-fold cross-validation of a 5-nearest-neighbours regressor.
# Each fold's score is collected in `mses`. The per-fold value lives in
# `fold_mse` so the hold-out `mse` computed in the train/test split cell is
# not overwritten by this loop.
mses = []

for train_index, test_index in kfold.split(sp):
    # Select the rows of this fold by position
    train_set = sp.iloc[train_index]
    test_set = sp.iloc[test_index]

    # Split each subset into model inputs and the value to predict,
    # reusing the notebook-level `features` / `target` definitions
    train_X, train_Y = train_set[features], train_set[target]
    test_X, test_Y = test_set[features], test_set[target]

    # Build a fresh model for every fold and train it on the training rows
    knn_fold = KNeighborsRegressor(5, algorithm='brute')
    knn_fold.fit(train_X, train_Y)

    # Predict on the held-out rows and score those predictions
    Y_predicted = knn_fold.predict(test_X)
    fold_mse = mean_squared_error(test_Y, Y_predicted)

    # Keep this fold's MSE for the final average
    mses.append(fold_mse)

# Hyper-paramétrisation


In [44]:
np.mean(mses)

192.40376738529577

In [45]:
# Same 5-fold evaluation as the manual KFold loop, delegated to
# cross_val_score (the displayed mean matches the manual result).
X = sp[['energy', 'loud', 'valence']]
y = sp['danceability']
# Unfitted estimator handed to cross_val_score.
blank_model = KNeighborsRegressor(5, algorithm='brute')
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# The 'neg_mean_squared_error' scorer yields negated MSEs, so the sign is
# flipped back before averaging to obtain the mean MSE across folds.
np.mean(
    -cross_val_score(blank_model, X, y, cv=kfold, scoring='neg_mean_squared_error')
)

192.40376738529577

In [46]:
blank_model = KNeighborsRegressor(algorithm='brute')
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
parameters = {
    'n_neighbors' : list(range(1, 31))
}

In [47]:
# Search over every value in the `parameters` grid, evaluating each candidate
# with the `kfold` splitter.
# NOTE(review): with 'neg_mean_squared_error' the highest score should
# correspond to the lowest MSE — confirm against the scikit-learn scoring docs.
gs = GridSearchCV(blank_model, parameters, cv=kfold, scoring='neg_mean_squared_error')
gs.fit(sp[['energy', 'loud', 'valence']], sp['danceability'])

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=KNeighborsRegressor(algorithm='brute'),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='neg_mean_squared_error')

In [48]:
gs.best_params_

{'n_neighbors': 14}

In [49]:
gs.best_estimator_

KNeighborsRegressor(algorithm='brute', n_neighbors=14)