In [1]:
# This notebook applies the same methods as the first one, but runs them on the
# features extracted by Mr. Hanna, whereas the other notebook relies on our own
# extraction with librosa.

# Uncomment and run this cell if the data files are not in the working directory.
# NOTE(review): the loading cell also reads "submission.csv", which is not
# fetched by any of the commands below — TODO confirm where that file comes from.

# !wget http://dept-info.labri.fr/~hanna/Pub/features_adapte.csv
# !wget http://dept-info.labri.fr/~hanna/Pub/features_head.csv
# !wget http://dept-info.labri.fr/~hanna/Pub/train_clean.csv
# !wget http://dept-info.labri.fr/~hanna/Pub/test_clean.csv

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import random as rd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

## Loading the data

In [3]:
# Load the raw CSV files and join features with track lists.
#
# Produces:
#   train_data  - labelled tracks joined with their features
#   test_data   - test tracks joined with their features (label column removed)
#   all_test_id - every track_id expected in the Kaggle submission

# Feature names: this file is only read to list the available columns.
features = pd.read_csv(filepath_or_buffer="features_head.csv", sep=",")
print(features.columns)
print("#################")

# Full feature table, read in chunks of 10000 rows and reassembled into one frame.
iter_csv = pd.read_csv(filepath_or_buffer="features_adapte.csv", sep=",", iterator=True, chunksize=10000)
datatrain = pd.concat(list(iter_csv))

# Training set: genre labels joined with features on track_id.
# pd.merge defaults to an inner join, so tracks without features are dropped.
traingenre = pd.read_csv(filepath_or_buffer="train_clean.csv", sep=",")
train_data = pd.merge(traingenre, datatrain, on='track_id')

# Test set: same join; the genre_id column is removed because it is what we predict.
testgenre = pd.read_csv(filepath_or_buffer="test_clean.csv", sep=",")
test_data = pd.merge(testgenre, datatrain, on='track_id').drop(columns=['genre_id'])
print(f"test data: {test_data.shape}")

# Submission template: only the track ids are kept, the genre column is refilled later.
all_test_id = pd.read_csv(filepath_or_buffer="submission.csv", sep=",").drop(columns=['genre_id'])
print(f"all test id: {len(all_test_id)}")

Index(['feature', 'chroma_cens', 'chroma_cens.1', 'chroma_cens.2',
       'chroma_cens.3', 'chroma_cens.4', 'chroma_cens.5', 'chroma_cens.6',
       'chroma_cens.7', 'chroma_cens.8',
       ...
       'tonnetz.39', 'tonnetz.40', 'tonnetz.41', 'zcr', 'zcr.1', 'zcr.2',
       'zcr.3', 'zcr.4', 'zcr.5', 'zcr.6'],
      dtype='object', length=519)
#################
test data: (4002, 519)
all test id: 4008


## Preparing the data

In [4]:
# Split the merged frames into model inputs, labels and track identifiers.
non_feature_cols = ['genre_id', 'track_id']

# Training side: labels are taken out before the non-feature columns are removed.
y_train = train_data['genre_id'].values
x_train = train_data.drop(columns=non_feature_cols)

# Test side: ids are kept aside so predictions can be matched back to tracks.
test_id = test_data['track_id'].values
x_test = test_data.drop(columns=['track_id'])

print(f"x_train: {x_train.shape}, y_train: {y_train.shape}, x_test: {x_test.shape}")

x_train: (3995, 518), y_train: (3995,), x_test: (4002, 518)


In [5]:
# Standardise the features. The scaler statistics are fitted on the training
# set only, and the same transform is then applied to the test set.
# Note: x_train / x_test are overwritten here, so re-running this cell on its
# own re-fits the scaler on already-scaled data.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Sanity check: overall mean and standard deviation of the scaled training data.
mean_train, std_train = x_train.mean(), x_train.std()

print(f'mean_train: {mean_train}. std_train: {std_train}')

mean_train: -1.815250402489839e-16. std_train: 0.9961315135452709


## Training the model

In [6]:
# XGBoost classifier; hyperparameters are gathered in one dict for readability.
xgb_params = {
    'n_estimators': 180,
    'learning_rate': 0.2,
    'max_depth': 5,
    'min_child_weight': 3,
}
model = xgb.XGBClassifier(**xgb_params)

# fit() is left as the last expression so the cell displays the fitted model.
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=None, n_estimators=180, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [7]:
# Predict a genre for every row of the scaled test features.
y_pred = model.predict(x_test)

# Each prediction must correspond to exactly one test track id.
n_predictions, n_test_ids = len(y_pred), len(test_id)
assert n_predictions == n_test_ids

## Creating the Kaggle submission CSV

In [8]:
# Build the Kaggle submission file.
#
# Predictions exist only for tracks that had features, while the submission
# must contain every id listed in all_test_id. A right join keeps all expected
# ids and leaves genre_id as NaN where no prediction is available.
predictions = pd.DataFrame({'track_id': test_id, 'genre_id': y_pred})
output_data = predictions.merge(all_test_id, on='track_id', how='right')

# Fill ids without a prediction with a random genre in 1..8. A dedicated,
# seeded generator makes the submission file reproducible across runs and
# leaves the global `random` state untouched.
missing = output_data['genre_id'].isna()
n_missing = int(missing.sum())
if n_missing:
    fill_rng = rd.Random(0)
    output_data.loc[missing, 'genre_id'] = [fill_rng.randint(1, 8) for _ in range(n_missing)]

# The NaNs introduced by the join make the column float; convert back to integer labels.
output_data['genre_id'] = output_data['genre_id'].astype(int)
output_data = output_data.set_index('track_id')

# The submission must have exactly one row per expected id.
assert len(output_data) == len(all_test_id), "submission row count does not match the expected ids"
print(f"predicted id: {len(test_id)}, submitting {len(output_data)} ids")

output_data.to_csv('genreForKaggle.csv')

predicted id: 4002, submitting 4008 ids
