In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Croisement features/tracks du dataset train
traingenre = pd.read_csv(filepath_or_buffer="train_clean.csv", sep=",")
datatrain = pd.read_csv(filepath_or_buffer="train_data.csv", sep=",")

data = pd.merge(traingenre, datatrain, on='track_id')
print("data merged :",data.shape, ", train genre :", traingenre.shape, ", train data :", datatrain.shape)
data.sample(n=10)

data merged : (3995, 541) , train genre : (3995, 2) , train data : (3997, 540)


Unnamed: 0,track_id,genre_id,chroma_cens.1.kurtosis,chroma_cens.2.kurtosis,chroma_cens.3.kurtosis,chroma_cens.4.kurtosis,chroma_cens.5.kurtosis,chroma_cens.6.kurtosis,chroma_cens.7.kurtosis,chroma_cens.8.kurtosis,...,spectral_flatness.1.mean,spectral_flatness.1.std,spectral_flatness.1.median,zero_crossing_rate.1.kurtosis,zero_crossing_rate.1.skew,zero_crossing_rate.1.amax,zero_crossing_rate.1.amin,zero_crossing_rate.1.mean,zero_crossing_rate.1.std,zero_crossing_rate.1.median
1117,43290,2,-0.397257,1.125258,-0.251642,-0.619167,0.087015,1.331539,0.47331,1.24457,...,0.015389,0.013969,0.011108,-0.670261,-0.016105,0.381348,0.007812,0.172246,0.077225,0.17334
3011,114427,8,-0.586725,-1.309208,0.606689,4.93947,-0.728544,-1.047655,-1.12376,-0.495722,...,0.000463,0.002402,3.8e-05,11.384439,2.507294,0.125488,0.00293,0.022268,0.013091,0.020508
1780,64520,4,-0.501923,-0.647989,-0.69307,-0.367798,0.050214,-1.013556,-0.922846,-0.447982,...,0.00141,0.003947,0.000171,10.779195,2.831683,0.328613,0.004395,0.046171,0.04372,0.032227
2924,112460,4,-0.761783,-0.821797,0.03848,0.728742,-0.093741,1.391619,1.239211,0.041407,...,0.002788,0.008723,0.000107,7.922057,2.577684,0.308594,0.001953,0.040129,0.045052,0.025879
1797,64836,7,-0.450774,-0.692876,-0.385053,-0.218366,-0.313339,-0.486855,-0.515963,-0.815472,...,0.004546,0.005459,0.002599,1.604511,1.02697,0.25293,0.01416,0.07568,0.034366,0.070312
1500,55779,2,-0.623857,-0.933477,-0.737083,-0.848343,-0.53972,-0.440108,0.098668,0.500562,...,0.000331,0.000586,0.000181,1.2795,1.048141,0.343262,0.003418,0.09236,0.060085,0.081055
352,12923,1,-0.904184,-1.120289,0.237691,-0.882855,-0.23757,-0.992595,-1.099598,-0.831494,...,0.000519,0.019724,1.7e-05,55.564543,5.801118,0.369629,0.0,0.026332,0.027885,0.018555
2575,96662,4,-0.932349,-0.848899,-0.418941,0.563712,-0.009385,0.152181,-0.763238,-0.730254,...,0.00639,0.011329,0.00161,3.057083,1.694943,0.447754,0.002441,0.080326,0.069958,0.055664
2080,72146,6,-0.515593,-0.359494,-0.272365,-0.703083,-0.317253,-0.453634,-0.141609,-0.588779,...,0.000764,0.000887,0.00055,0.748342,0.612682,0.249512,0.03125,0.102991,0.031074,0.101074
3607,135341,7,-0.019418,0.178403,-0.087839,0.159739,-0.0065,-0.377886,0.770724,1.432152,...,0.000755,0.001788,0.00023,1.417478,1.373157,0.539062,0.004883,0.13756,0.103218,0.101562


In [3]:
# training sets
x = data.drop(['genre_id', 'track_id'], axis=1)
y = data['genre_id'].values

print("x :", x.shape, ", y :", y.shape)

x : (3995, 539) , y : (3995,)


In [4]:
# split train test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("x_train :", x_train.shape, ", y_train :", y_train.shape)
print("x_test :", x_test.shape, ", y_test :", y_test.shape)

x_train : (3196, 539) , y_train : (3196,)
x_test : (799, 539) , y_test : (799,)


In [5]:
# normalisation
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

mean_train = x_train.mean()
std_train = x_train.std()
print(f'mean_train: {mean_train}. std_train: {std_train}')

mean_train: -4.3604497104961323e-17. std_train: 0.9999999999999997


## Premier modèle : K nearest neighbors

In [6]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [7]:
y_pred = knn_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.71      0.90      0.80       112
           2       0.29      0.28      0.29        88
           3       0.54      0.61      0.57       111
           4       0.44      0.60      0.51       107
           5       0.50      0.51      0.50       105
           6       0.50      0.18      0.27        94
           7       0.38      0.21      0.27        91
           8       0.42      0.48      0.45        91

    accuracy                           0.49       799
   macro avg       0.47      0.47      0.46       799
weighted avg       0.48      0.49      0.47       799



## Deuxième modèle : Decision Tree

In [8]:
tree_model = DecisionTreeClassifier(max_leaf_nodes=16, min_samples_split=2)
tree_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
y_pred = tree_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.74      0.81      0.77       112
           2       0.23      0.26      0.25        88
           3       0.55      0.36      0.43       111
           4       0.48      0.47      0.47       107
           5       0.28      0.54      0.37       105
           6       0.30      0.24      0.27        94
           7       0.30      0.15      0.20        91
           8       0.54      0.42      0.47        91

    accuracy                           0.42       799
   macro avg       0.43      0.41      0.40       799
weighted avg       0.44      0.42      0.42       799



In [10]:
# Tentative d'optimisation du decision tree

# Attention !!! Prend beaucoup de temps d'execution. Valeurs trouvée déjà utilisées précédemment.
# Trouvé : max_leaf_nodes = 16 et min_samples_split = 2

# optimization
# parameters = {'max_leaf_nodes':[x for x in range(2, 20)], 'min_samples_split':[y for y in range(2, 20)]} 

# grid_search_cv = GridSearchCV(DecisionTreeClassifier(), parameters, scoring='accuracy')
# grid_search_cv.fit(x_train, y_train)
# grid_search_cv.best_estimator_

In [11]:
# y_pred = grid_search_cv.predict(x_test)
# print(classification_report(y_test,y_pred))

## Troisième modèle : Random Forest

In [12]:
forest_model = RandomForestClassifier(max_leaf_nodes=16, min_samples_split=2)
forest_model.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
y_pred = forest_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.66      0.86      0.74       112
           2       0.34      0.28      0.31        88
           3       0.51      0.54      0.52       111
           4       0.46      0.67      0.54       107
           5       0.35      0.33      0.34       105
           6       0.42      0.23      0.30        94
           7       0.11      0.02      0.04        91
           8       0.43      0.63      0.51        91

    accuracy                           0.46       799
   macro avg       0.41      0.45      0.41       799
weighted avg       0.42      0.46      0.43       799



## Quatrième méthode : Gradient Boosting

In [14]:
gradient_model = GradientBoostingClassifier()
gradient_model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [15]:
y_pred = gradient_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.84      0.88      0.86       112
           2       0.43      0.48      0.45        88
           3       0.69      0.59      0.64       111
           4       0.61      0.63      0.62       107
           5       0.54      0.53      0.54       105
           6       0.49      0.48      0.49        94
           7       0.22      0.22      0.22        91
           8       0.50      0.53      0.51        91

    accuracy                           0.55       799
   macro avg       0.54      0.54      0.54       799
weighted avg       0.56      0.55      0.55       799



## Cinquième méthode : XGBoost

In [16]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [17]:
y_pred = xgb_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.82      0.91      0.86       112
           2       0.43      0.49      0.46        88
           3       0.71      0.60      0.65       111
           4       0.60      0.68      0.64       107
           5       0.46      0.46      0.46       105
           6       0.51      0.44      0.47        94
           7       0.24      0.21      0.22        91
           8       0.55      0.57      0.56        91

    accuracy                           0.56       799
   macro avg       0.54      0.54      0.54       799
weighted avg       0.55      0.56      0.55       799



## Tuning XGBoost

In [18]:
# Tentative d'optimisation

# mauvaise idée : temps d'execution infini avec ce nombre de paramètres

# parameters = {'n_estimators':[n for n in range(100, 200, 10)], 'learning_rate':[x/100 for x in range(5, 30)], 'max_depth':[y for y in range(3, 10)], 'min_child_weight':[z for z in range(1, 6)]} 
# grid_search_cv = GridSearchCV(xgb.XGBClassifier(), parameters, scoring='accuracy')
# grid_search_cv.fit(x_train, y_train)
# grid_search_cv.best_estimator_

In [19]:
xgb_model = xgb.XGBClassifier(n_estimators=180, learning_rate=0.2, max_depth=5, min_child_weight=3)
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=None, n_estimators=180, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
y_pred = xgb_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.83      0.90      0.87       112
           2       0.44      0.48      0.46        88
           3       0.71      0.67      0.69       111
           4       0.62      0.75      0.68       107
           5       0.57      0.52      0.55       105
           6       0.54      0.48      0.51        94
           7       0.28      0.26      0.27        91
           8       0.59      0.55      0.57        91

    accuracy                           0.59       799
   macro avg       0.57      0.58      0.57       799
weighted avg       0.59      0.59      0.59       799



In [21]:
# paramètres testés pour xgboost et résultats d'accuracy

# n_estimators=150, learning_rate=0.2, max_depth=3, min_child_weight=1  --------> 0.58
# n_estimators=150, learning_rate=0.2, max_depth=5, min_child_weight=1  --------> 0.58
# n_estimators=180, learning_rate=0.2, max_depth=5, min_child_weight=3  --------> 0.59
# n_estimators=180, learning_rate=0.25, max_depth=5, min_child_weight=3  -------> 0.58
# n_estimators=170, learning_rate=0.2, max_depth=7, min_child_weight=3  --------> 0.58
# n_estimators=180, learning_rate=0.15, max_depth=5, min_child_weight=3  -------> 0.58
# n_estimators=200, learning_rate=0.1, max_depth=5, min_child_weight=3  --------> 0.58
# n_estimators=200, learning_rate=0.2, max_depth=3, min_child_weight=1  --------> 0.59