In [1]:
# Music Genre Classification
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB 
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
# preprocessing of data
train_features_lyric = []
valid_features_lyric = []
test_features_lyric = []
train_features_num_rangeFix = []
valid_features_num_rangeFix = []
test_features_num_rangeFix = []
train_features_all = []
valid_features_all = []
test_features_all = []
train_labels = []
valid_labels = []
training_instance_weight = []
test_features = []

def preprocess():
    global train_features_lyric
    global valid_features_lyric
    global test_features_lyric
    global train_features_num_rangeFix
    global valid_features_num_rangeFix
    global test_features_num_rangeFix
    global train_features_all
    global valid_features_all
    global test_features_all
    global train_labels_withoutID
    global valid_labels_withoutID
    global train_labels
    global valid_labels
    global training_instance_weight
    global test_features
    
    #load data sets
    train_features = pd.read_csv('train_features.csv')
    train_labels = pd.read_csv('train_labels.csv')
    valid_features = pd.read_csv('valid_features.csv')
    valid_labels = pd.read_csv('valid_labels.csv')
    test_features = pd.read_csv('test_features.csv')

    # data features
    # all numerical features
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    train_features_numerics = train_features.select_dtypes(include=numerics)
    valid_features_numerics = valid_features.select_dtypes(include=numerics)
    test_features_numerics = test_features.select_dtypes(include=numerics)
    # all numerical features without ID
    train_features_numerics_withoutID = train_features_numerics.drop(columns = ['trackID'])
    valid_features_numerics_withoutID = valid_features_numerics.drop(columns = ['trackID'])
    test_features_numerics_withoutID = test_features_numerics.drop(columns = ['trackID'])
    # lyric features with one-hot process
    vec = CountVectorizer()
    train_features_lyric_mtx = vec.fit_transform(train_features['tags'])
    valid_features_lyric_mtx = vec.transform(valid_features['tags'])
    test_features_lyric_mtx = vec.transform(test_features['tags'])
    train_features_lyric = pd.DataFrame.sparse.from_spmatrix(train_features_lyric_mtx)
    valid_features_lyric = pd.DataFrame.sparse.from_spmatrix(valid_features_lyric_mtx)
    test_features_lyric = pd.DataFrame.sparse.from_spmatrix(test_features_lyric_mtx)
    # scaling feature colomns
    scaler = StandardScaler()
    train_features_num_rangeFix = scaler.fit_transform(np.array(train_features_numerics_withoutID, dtype = float))
    valid_features_num_rangeFix = scaler.fit_transform(np.array(valid_features_numerics_withoutID, dtype = float))
    test_features_num_rangeFix = scaler.fit_transform(np.array(test_features_numerics_withoutID, dtype = float))
    # all features including numeric (with normalization) and lyrics
    train_features_all = np.hstack((train_features_lyric,train_features_num_rangeFix))
    valid_features_all = np.hstack((valid_features_lyric,valid_features_num_rangeFix))
    test_features_all = np.hstack((test_features_lyric,test_features_num_rangeFix))
    
    # compute sample weights
    training_instance_weight = compute_sample_weight("balanced", train_labels.genre)

    # data labels
    genre_mapping = {
        'soul and reggae': 0,
        'pop': 1,
        'punk': 2,
        'jazz and blues': 3,
        'dance and electronica': 4,
        'folk': 5,
        'classic pop and rock': 6,
        'metal': 7}
    train_labels['genre'] = train_labels['genre'].map(genre_mapping)
    valid_labels['genre'] = valid_labels['genre'].map(genre_mapping)

In [3]:
# Sub classifiers
def SupportVectorMachine(X_train, y_train, training_label_weight):
    SVC_clf = SVC(gamma = 'auto', random_state = 0)
    SVC_clf.fit(X_train, y_train, sample_weight = training_label_weight)
    return SVC_clf
def NaiveBayes(X_train, y_train, training_label_weight):
    MNB_clf = MultinomialNB()
    MNB_clf.fit(X_train, y_train, sample_weight = training_label_weight)
    return MNB_clf
def DecisionTree_bagging(X_train, y_train, training_label_weight):
    DT_bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=0,class_weight='balanced')
                                    ,n_estimators=100, random_state=0, n_jobs=-1)
    DT_bagging_clf.fit(X_train,y_train,  sample_weight = training_label_weight)
    return DT_bagging_clf

In [4]:
# Train classifier
def train(X_train, X_train_lyric, y_train, training_label_weight):
    print('Start training SVC!')
    SVC_clf = SupportVectorMachine(X_train, y_train, training_label_weight)
    print('SVC training done!')
    print('\nStart training Naive Bayes!')
    MNB_clf = NaiveBayes(X_train_lyric, y_train, training_label_weight)
    print('Naive Bayes training done!')
    print('\nStart training Decision Tree!')
    DT_clf = DecisionTree_bagging(X_train, y_train, training_label_weight)
    print('Decision Tree training done!')
    stacking = [SVC_clf, MNB_clf, DT_clf]
    return stacking

In [5]:
# Predict with stacking
def predict(clf, X_test, X_test_lyric):
    y_predict_SVC = clf[0].predict(X_test)
    y_predict_MNB = clf[1].predict(X_test_lyric)
    y_predict_DT = clf[2].predict(X_test)
    X_test_aggregate = np.vstack(([y_predict_SVC], [y_predict_MNB], [y_predict_DT])).T
    y_predict_aggregate = []
    for i in range(len(X_test_aggregate)):
        if np.bincount(X_test_aggregate[i]).max() > 1:
            y_predict_aggregate.append(np.argmax(np.bincount(X_test_aggregate[i])))
        else:
            y_predict_aggregate.append(X_test_aggregate[i][0])
    print('\nSuccessfully get predictions! ')
    return y_predict_aggregate

In [6]:
# Evaluate the classifier
def evaluate(clf, X_valid, X_valid_lyric, y_valid):
    y_pred = predict(clf, X_valid, X_valid_lyric)
    print('\nEvaluation begin:')
    print('Confusion matrix: \n',metrics.confusion_matrix(y_valid, y_pred))
    print('Overall accuracy: ',metrics.accuracy_score(y_valid, y_pred))
    print('Precision of 8 class: \n',metrics.precision_score(y_valid, y_pred, average = None))
    print('Recall of 8 class: \n',metrics.recall_score(y_valid, y_pred, average = None))
    print('F1 score of 8 class: \n',metrics.f1_score(y_valid, y_pred, average = None))
    print('Evaluation done!')

In [7]:
# Save prediction to CSV
def toCSV(X_test, y_predict):
    if len(X_test)!=len(y_predict):
        print('Instance number does not match!')
    else:
        y = pd.DataFrame(np.asarray(y_predict))
        genre_mapping = {
            0: 'soul and reggae',
            1: 'pop',
            2: 'punk',
            3: 'jazz and blues',
            4: 'dance and electronica',
            5: 'folk',
            6: 'classic pop and rock',
            7: 'metal'}
        y = y[0].map(genre_mapping)
        track_ID = X_test.trackID
        y_output = pd.concat([track_ID,y],axis=1, keys=['trackID','genre'])
        y_output.to_csv('test_labels.csv', index=None)
        print('\nSuccessfully save prediction to CSV file!')

In [8]:
# run the process and save pridiction result to csv file

# Data preprocess
preprocess()
# Train model
clf = train(train_features_all, train_features_lyric, train_labels.genre, training_instance_weight)
# Evaluate model
evaluate(clf, valid_features_all, valid_features_lyric, valid_labels.genre)
# Predict test set
y_predict = predict(clf, test_features_all, test_features_lyric)
# Save prediction to CSV
toCSV(test_features, y_predict)

Start training SVC!
SVC training done!

Start training Naive Bayes!
Naive Bayes training done!

Start training Decision Tree!
Decision Tree training done!

Successfully get predictions! 
Evaluation begin:
Confusion matrix: 
 [[46  0  0  2  6  2  2  0]
 [ 0 74  0  0  0  0  0  0]
 [ 0  2 34  0  0  3  4  1]
 [ 2  0  2 16  3 11  9  1]
 [ 5  3  7  0 14  8  5  3]
 [ 0  0  3  0  0 41 20  0]
 [ 2  0  1 13  0  6 33  0]
 [ 0  1  6  0  2  0  0 57]]
Overall accuracy:  0.7
Precision of 8 class: 
 [0.83636364 0.925      0.64150943 0.51612903 0.56       0.57746479
 0.45205479 0.91935484]
Recall of 8 class: 
 [0.79310345 1.         0.77272727 0.36363636 0.31111111 0.640625
 0.6        0.86363636]
F1 score of 8 class: 
 [0.81415929 0.96103896 0.70103093 0.42666667 0.4        0.60740741
 0.515625   0.890625  ]
Evaluation done!

Successfully get predictions! 

Successfully save prediction to CSV file!
