In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Croisement features/tracks du dataset train
traingenre = pd.read_csv(filepath_or_buffer="train_clean.csv", sep=",")
datatrain = pd.read_csv(filepath_or_buffer="train_data.csv", sep=",")

data = pd.merge(traingenre, datatrain, on='track_id')
print("data merged :",data.shape, ", train genre :", traingenre.shape, ", train data :", datatrain.shape)
data.sample(n=10)

data merged : (3995, 541) , train genre : (3995, 2) , train data : (3997, 540)


Unnamed: 0,track_id,genre_id,chroma_cens.1.kurtosis,chroma_cens.2.kurtosis,chroma_cens.3.kurtosis,chroma_cens.4.kurtosis,chroma_cens.5.kurtosis,chroma_cens.6.kurtosis,chroma_cens.7.kurtosis,chroma_cens.8.kurtosis,...,spectral_flatness.1.mean,spectral_flatness.1.std,spectral_flatness.1.median,zero_crossing_rate.1.kurtosis,zero_crossing_rate.1.skew,zero_crossing_rate.1.amax,zero_crossing_rate.1.amin,zero_crossing_rate.1.mean,zero_crossing_rate.1.std,zero_crossing_rate.1.median
600,23160,1,-0.76954,-1.045243,0.074666,-0.078524,-0.66967,-0.569221,-1.13284,-0.786514,...,1.2e-05,9e-06,1e-05,2.968807,1.500468,0.074707,0.00293,0.027749,0.010212,0.025391
1750,63226,7,-0.368606,-0.942418,1.357453,-1.536629,-1.317451,-1.289291,-0.847402,-1.122102,...,0.000515,0.001472,8.5e-05,2.499086,1.551817,0.245117,0.000488,0.057374,0.044314,0.044922
2309,82243,6,-0.327033,-0.644931,0.635093,1.559189,-0.515575,-0.876778,0.154838,-1.13536,...,0.000334,0.000606,9.6e-05,53.60638,5.509696,0.375,0.00293,0.023773,0.023453,0.020508
793,30384,2,-0.510673,-1.10631,-0.726714,-0.86105,-0.787715,0.009944,-0.921187,-0.316034,...,0.002224,0.005879,0.000569,5.492209,2.177345,0.369629,0.007812,0.056789,0.052158,0.035156
3674,139110,3,-0.926402,1.576949,1.39558,0.38406,0.019932,-1.070472,-0.453638,-0.955023,...,0.00045,0.002025,7e-05,1.130463,1.190806,0.166992,0.006836,0.048211,0.028398,0.041504
2457,90584,7,1.691624,4.487879,-0.293419,0.979912,-0.792147,0.114376,-0.338539,2.621673,...,0.000136,0.000139,9.1e-05,1.102993,0.998616,0.163086,0.007812,0.051027,0.025924,0.046387
1250,47770,1,0.251898,-0.881312,-0.601509,-1.064309,-0.520576,-0.750684,-0.402726,0.565117,...,5.2e-05,0.000135,1.6e-05,5.133371,2.116852,0.143066,0.006348,0.032657,0.020711,0.024902
1313,48862,6,-0.559218,-0.622358,-0.826497,8.415811,0.847821,3.743505,-0.648947,-0.442487,...,0.001937,0.005272,0.000585,3.867778,1.609126,0.193359,0.008301,0.05691,0.025921,0.051758
2455,90579,7,-0.744188,-0.855389,-0.827198,-0.956269,-0.85957,0.131414,-0.491873,-1.034719,...,4.6e-05,6.8e-05,1.9e-05,-0.19237,0.432938,0.214844,0.007812,0.082841,0.03787,0.081055
1740,62591,5,0.423642,-0.596475,-0.561036,-0.364549,0.560249,0.898802,-0.217315,-0.370291,...,0.000814,0.002871,0.000133,1.354216,1.214072,0.168945,0.004883,0.033905,0.024461,0.027344


In [3]:
# training sets
x = data.drop(['genre_id', 'track_id'], axis=1)
y = data['genre_id'].values

print("x :", x.shape, ", y :", y.shape)

x : (3995, 539) , y : (3995,)


In [4]:
# split train test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("x_train :", x_train.shape, ", y_train :", y_train.shape)
print("x_test :", x_test.shape, ", y_test :", y_test.shape)

x_train : (3196, 539) , y_train : (3196,)
x_test : (799, 539) , y_test : (799,)


In [5]:
# normalisation
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

mean_train = x_train.mean()
std_train = x_train.std()
print(f'mean_train: {mean_train}. std_train: {std_train}')

mean_train: -4.3604497104961323e-17. std_train: 0.9999999999999997


## Premier modèle : K nearest neighbors

In [6]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [7]:
y_pred = knn_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.79      0.85      0.82       112
           2       0.30      0.16      0.21        88
           3       0.54      0.46      0.50       111
           4       0.47      0.58      0.52       107
           5       0.41      0.50      0.45       105
           6       0.40      0.24      0.30        94
           7       0.21      0.22      0.22        91
           8       0.40      0.54      0.46        91

    accuracy                           0.46       799
   macro avg       0.44      0.44      0.43       799
weighted avg       0.45      0.46      0.45       799



## Deuxième modèle : Decision Tree

In [8]:
tree_model = DecisionTreeClassifier(max_leaf_nodes=16, min_samples_split=2)
tree_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
y_pred = tree_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.74      0.81      0.77       112
           2       0.23      0.26      0.25        88
           3       0.55      0.36      0.43       111
           4       0.48      0.47      0.47       107
           5       0.28      0.54      0.37       105
           6       0.30      0.24      0.27        94
           7       0.30      0.15      0.20        91
           8       0.54      0.42      0.47        91

    accuracy                           0.42       799
   macro avg       0.43      0.41      0.40       799
weighted avg       0.44      0.42      0.42       799



In [10]:
# Attention !!! Prend beaucoup de temps d'execution. Valeurs trouvée déjà utilisées précédemment.
# Trouvé : max_leaf_nodes = 16 et min_samples_split = 2

# optimization
# parameters = {'max_leaf_nodes':[x for x in range(2, 20)], 'min_samples_split':[y for y in range(2, 20)]} 

# grid_search_cv = GridSearchCV(DecisionTreeClassifier(), parameters, scoring='accuracy')
# grid_search_cv.fit(x_train, y_train)
# grid_search_cv.best_estimator_

In [11]:
# y_pred = grid_search_cv.predict(x_test)
# print(classification_report(y_test,y_pred))

## Troisième modèle : Random Forest

In [12]:
forest_model = RandomForestClassifier(max_leaf_nodes=16, min_samples_split=2)
forest_model.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
y_pred = forest_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.62      0.86      0.72       112
           2       0.33      0.23      0.27        88
           3       0.57      0.54      0.56       111
           4       0.46      0.65      0.54       107
           5       0.38      0.31      0.35       105
           6       0.36      0.27      0.31        94
           7       0.19      0.07      0.10        91
           8       0.45      0.68      0.54        91

    accuracy                           0.47       799
   macro avg       0.42      0.45      0.42       799
weighted avg       0.43      0.47      0.43       799



## Quatrième méthode : Gradient Boosting

In [14]:
gradient_model = GradientBoostingClassifier()
gradient_model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [15]:
y_pred = gradient_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.84      0.88      0.86       112
           2       0.44      0.50      0.47        88
           3       0.68      0.57      0.62       111
           4       0.63      0.68      0.66       107
           5       0.52      0.53      0.53       105
           6       0.50      0.47      0.48        94
           7       0.24      0.24      0.24        91
           8       0.52      0.52      0.52        91

    accuracy                           0.56       799
   macro avg       0.55      0.55      0.55       799
weighted avg       0.56      0.56      0.56       799



## Cinquième méthode : XGBoost

In [16]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [17]:
y_pred = xgb_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.82      0.91      0.86       112
           2       0.43      0.49      0.46        88
           3       0.71      0.60      0.65       111
           4       0.60      0.68      0.64       107
           5       0.46      0.46      0.46       105
           6       0.51      0.44      0.47        94
           7       0.24      0.21      0.22        91
           8       0.55      0.57      0.56        91

    accuracy                           0.56       799
   macro avg       0.54      0.54      0.54       799
weighted avg       0.55      0.56      0.55       799



## Tuning XGBoost

In [18]:
# params = {
#     # Parameters that we are going to tune.
#     'max_depth':6,
#     'min_child_weight': 1,
#     'eta':.3
# }

# params['eval_metric'] = "mae"
# num_boost_round = 999

# model = xgb.train(
#     params,
#     x_train,
#     num_boost_round=num_boost_round,
#     evals=[(y_train, "Test")],
#     early_stopping_rounds=10
# )

# xgb_model = xgb.XGBClassifier()
# xgb_model.fit(x_train, y_train)

In [19]:
# mauvaise idée : temps d'execution infini

# parameters = {'n_estimators':[n for n in range(100, 200, 10)], 'learning_rate':[x/100 for x in range(5, 30)], 'max_depth':[y for y in range(3, 10)], 'min_child_weight':[z for z in range(1, 6)]} 
# grid_search_cv = GridSearchCV(xgb.XGBClassifier(), parameters, scoring='accuracy')
# grid_search_cv.fit(x_train, y_train)
# grid_search_cv.best_estimator_

In [24]:
xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, min_child_weight=3)
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [25]:
y_pred = xgb_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.85      0.89      0.87       112
           2       0.40      0.44      0.42        88
           3       0.70      0.64      0.67       111
           4       0.63      0.70      0.66       107
           5       0.62      0.58      0.60       105
           6       0.48      0.45      0.46        94
           7       0.25      0.23      0.24        91
           8       0.54      0.56      0.55        91

    accuracy                           0.58       799
   macro avg       0.56      0.56      0.56       799
weighted avg       0.57      0.58      0.57       799



In [None]:
# n_estimators=150, learning_rate=0.2, max_depth=3, min_child_weight=1  --------> 0.58
# n_estimators=150, learning_rate=0.2, max_depth=5, min_child_weight=1  --------> 0.58
# n_estimators=180, learning_rate=0.2, max_depth=5, min_child_weight=3  --------> 0.59
# n_estimators=180, learning_rate=0.25, max_depth=5, min_child_weight=3  -------> 0.58
# n_estimators=170, learning_rate=0.2, max_depth=7, min_child_weight=3  --------> 0.58
# n_estimators=180, learning_rate=0.15, max_depth=5, min_child_weight=3  -------> 0.58
# n_estimators=200, learning_rate=0.1, max_depth=5, min_child_weight=3  --------> 0.58
# n_estimators=200, learning_rate=0.2, max_depth=3, min_child_weight=1  --------> 0.59