In [14]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Croisement features/tracks du dataset train
traingenre = pd.read_csv(filepath_or_buffer="train_clean.csv", sep=",")
datatrain = pd.read_csv(filepath_or_buffer="train_data.csv", sep=",")

data = pd.merge(traingenre, datatrain, on='track_id')
print("data merged :",data.shape, ", train genre :", traingenre.shape, ", train data :", datatrain.shape)
data.sample(n=10)

data merged : (3995, 541) , train genre : (3995, 2) , train data : (3997, 540)


Unnamed: 0,track_id,genre_id,chroma_cens.1.kurtosis,chroma_cens.2.kurtosis,chroma_cens.3.kurtosis,chroma_cens.4.kurtosis,chroma_cens.5.kurtosis,chroma_cens.6.kurtosis,chroma_cens.7.kurtosis,chroma_cens.8.kurtosis,...,spectral_flatness.1.mean,spectral_flatness.1.std,spectral_flatness.1.median,zero_crossing_rate.1.kurtosis,zero_crossing_rate.1.skew,zero_crossing_rate.1.amax,zero_crossing_rate.1.amin,zero_crossing_rate.1.mean,zero_crossing_rate.1.std,zero_crossing_rate.1.median
1562,57168,3,-0.326656,0.484104,-0.400976,-0.10096,0.814617,-0.35978,-0.10065,6.488883,...,0.001846,0.004176,0.000424,9.269056,2.813545,0.428223,0.007812,0.078718,0.063277,0.064453
3070,117943,5,-0.322007,-1.594685,3.890751,12.641701,6.267424,1.976586,-0.73396,-1.289568,...,0.008228,0.034918,0.000259,31.23655,4.879776,0.41748,0.001953,0.029873,0.043424,0.01709
2942,113034,4,-0.806977,-0.259791,0.930923,0.742254,-0.027599,0.149701,-0.53204,0.345915,...,0.004258,0.007339,0.001445,9.699513,2.753354,0.417969,0.001465,0.0594,0.060807,0.041992
2109,73172,3,-0.592592,0.459068,1.659597,-0.128133,-0.432241,-0.258269,2.083415,0.129092,...,1.1e-05,1.5e-05,7e-06,4.955555,1.597445,0.048828,0.004883,0.013886,0.004481,0.012695
624,24368,5,-0.172815,0.419096,0.520159,0.540396,0.987491,0.76664,0.082191,0.862059,...,0.000445,0.019664,5e-05,1.883174,0.874321,0.14209,0.0,0.057609,0.017824,0.055664
1713,61529,4,-0.281388,-0.756576,-0.805845,0.101481,0.746809,0.016206,1.001236,1.940711,...,0.020932,0.034069,0.007706,1.877174,1.506563,0.429199,0.003418,0.121846,0.087218,0.098633
2768,107597,3,0.04776,-1.223922,-0.633841,-0.115228,3.096759,0.311119,-1.027571,-0.892695,...,1.6e-05,3.7e-05,7e-06,24.062437,3.996108,0.122559,0.00293,0.017462,0.011371,0.01416
18,684,4,-0.10459,-0.046719,-0.187148,0.037519,-0.8974,0.629414,-0.79008,-0.593534,...,0.000374,0.001026,8.7e-05,20.809174,4.012769,0.334961,0.003906,0.037083,0.038637,0.026855
1366,51577,2,-0.907425,-0.964249,-0.334448,-0.16399,-0.602353,-1.323662,-1.288337,-1.293222,...,0.001221,0.003252,0.00026,9.279628,2.761881,0.175293,0.001953,0.025776,0.023679,0.018555
3033,115700,2,1.255773,-0.005038,0.27779,0.515823,2.136545,1.927928,1.118122,-0.24028,...,0.018964,0.004128,0.018609,0.214716,0.154591,0.103027,0.035645,0.067305,0.007726,0.066895


In [3]:
# training sets
x = data.drop(['genre_id', 'track_id'], axis=1)
y = data['genre_id'].values

print("x :", x.shape, ", y :", y.shape)

x : (3995, 539) , y : (3995,)


In [4]:
# split train test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("x_train :", x_train.shape, ", y_train :", y_train.shape)
print("x_test :", x_test.shape, ", y_test :", y_test.shape)

x_train : (3196, 539) , y_train : (3196,)
x_test : (799, 539) , y_test : (799,)


In [5]:
# normalisation
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

mean_train = x_train.mean()
std_train = x_train.std()
print(f'mean_train: {mean_train}. std_train: {std_train}')

mean_train: -4.3604497104961323e-17. std_train: 0.9999999999999997


## Premier modèle testé : K nearest neighbors

In [6]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [7]:
y_pred = knn_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.79      0.85      0.82       112
           2       0.30      0.16      0.21        88
           3       0.54      0.46      0.50       111
           4       0.47      0.58      0.52       107
           5       0.41      0.50      0.45       105
           6       0.40      0.24      0.30        94
           7       0.21      0.22      0.22        91
           8       0.40      0.54      0.46        91

    accuracy                           0.46       799
   macro avg       0.44      0.44      0.43       799
weighted avg       0.45      0.46      0.45       799



## Deuxième modèle : Decision Tree

In [8]:
tree_model = DecisionTreeClassifier(max_leaf_nodes=16, min_samples_split=2)
tree_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
y_pred = tree_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.74      0.81      0.77       112
           2       0.23      0.26      0.25        88
           3       0.55      0.36      0.43       111
           4       0.48      0.47      0.47       107
           5       0.28      0.54      0.37       105
           6       0.30      0.24      0.27        94
           7       0.30      0.15      0.20        91
           8       0.54      0.42      0.47        91

    accuracy                           0.42       799
   macro avg       0.43      0.41      0.40       799
weighted avg       0.44      0.42      0.42       799



In [10]:
# Attention !!! Prend beaucoup de temps d'execution. Valeurs trouvée déjà utilisées précédemment.
# Trouvé : max_leaf_nodes = 16 et min_samples_split = 2

# optimization
# parameters = {'max_leaf_nodes':[x for x in range(2, 20)], 'min_samples_split':[y for y in range(2, 20)]} 

# grid_search_cv = GridSearchCV(DecisionTreeClassifier(), parameters, scoring='accuracy')
# grid_search_cv.fit(x_train, y_train)
# grid_search_cv.best_estimator_

In [11]:
# y_pred = grid_search_cv.predict(x_test)
# print(classification_report(y_test,y_pred))

## Troisième modèle : Random Forest

In [12]:
forest_model = RandomForestClassifier(max_leaf_nodes=16, min_samples_split=2)
forest_model.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
y_pred = forest_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.65      0.84      0.73       112
           2       0.31      0.31      0.31        88
           3       0.53      0.52      0.52       111
           4       0.46      0.62      0.53       107
           5       0.38      0.38      0.38       105
           6       0.42      0.27      0.32        94
           7       0.22      0.07      0.10        91
           8       0.46      0.62      0.53        91

    accuracy                           0.47       799
   macro avg       0.43      0.45      0.43       799
weighted avg       0.44      0.47      0.44       799



## Quatrième méthode : Gradient Boosting

In [15]:
gradient_model = GradientBoostingClassifier()
gradient_model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [16]:
y_pred = gradient_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.84      0.87      0.85       112
           2       0.40      0.44      0.42        88
           3       0.66      0.58      0.62       111
           4       0.62      0.65      0.64       107
           5       0.51      0.52      0.52       105
           6       0.48      0.47      0.47        94
           7       0.26      0.24      0.25        91
           8       0.54      0.55      0.54        91

    accuracy                           0.55       799
   macro avg       0.54      0.54      0.54       799
weighted avg       0.55      0.55      0.55       799



In [None]:

# read in data
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
