### Title: Exploring Customer Reviews for Music Genre Classification

Tutorial notebook prepared by Sergio Oramas

Oramas, S., Espinosa-Anke, L., Lawlor, A., Serra, X., & Saggion, H. (2016). Exploring Customer Reviews for Music Genre Classification and Evolutionary Studies. 17th International Society for Music Information Retrieval Conference (ISMIR16).

PhD thesis:
http://sergiooramas.com/phd-thesis/
Slides for the thesis presentation:
https://www.slideshare.net/soramas/phd-thesis-knowledge-extraction-and-representation-learning-for-music-recommendation-and-classification

NLP4MIR Tutorial with slides and video:
https://www.upf.edu/web/mdm-dtic/tutorial-natural-language-processing-for-music-information-retrieval

In [1]:
# IPython notebook (Python 3 version) of https://github.com/sergiooramas/music-genre-classification
import json
import os
import random

import numpy as np
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [2]:
# Download and unzip the data archive (skipped when the archive is already present)
import urllib.request
import zipfile
import os, sys
# Initialization
url='https://sites.google.com/site/mirspring2018/resources/genre_classification_data.zip?attredirects=0&d=1'
filename='genre_classification_data.zip'
# The presence of `filename` is the "already done" marker, so the archive only
# receives its final name after BOTH the download and the extraction succeeded.
# An interrupted run therefore leaves just a '.part' file and is retried on the
# next execution instead of being silently skipped.
if not os.path.exists(filename):
    partial_filename = filename + '.part'
    urllib.request.urlretrieve(url, partial_filename)
    # Context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(partial_filename, 'r') as zip_ref:
        zip_ref.extractall()
    os.replace(partial_filename, filename)
    print('Data downloaded and unzipped')


In [3]:
# Load data: `products` maps a product id to its record; each record carries a 'genre' key.
# The context manager closes the file as soon as the JSON is parsed.
with open("dataset_classification.json","r") as dataset_file:
    products = json.load(dataset_file)

# Group product ids by genre. The loop variable is deliberately not called `id`:
# at notebook level that would shadow the builtin for every later cell.
genre_products = dict()
for product_id, product in products.items():
    genre_products.setdefault(product['genre'],[]).append(product_id)

categories = genre_products.keys()
print(categories)

dict_keys(['Rap & Hip-Hop', 'R&B', 'Dance & Electronic', 'Alternative Rock', 'Latin Music', 'New Age', 'Metal', 'Jazz', 'Classical', 'Folk', 'Country', 'Pop', 'Rock'])


In [4]:
# Load or create partitions for cross-validation
def partition(lst, n):
    """Split ``lst`` into ``n`` contiguous slices of near-equal length.

    Slice boundaries are placed at ``round(len(lst) / n * i)``, so the slices
    cover ``lst`` exactly once, in order, and some may be empty when
    ``n > len(lst)``.

    Parameters
    ----------
    lst : sequence supporting ``len`` and slicing
    n : int
        Number of slices to produce. ``n == 0`` raises ``ZeroDivisionError``.

    Returns
    -------
    list
        ``n`` slices of ``lst``.
    """
    division = len(lst) / float(n)
    # `range` replaces Python 2's `xrange`, which is a NameError on Python 3.
    return [lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n)]

def create_folds(k, suffix, seed=None):
    """Create ``k`` cross-validation folds, split per genre, and save them to disk.

    For every genre in the module-level ``genre_products`` mapping, the ids are
    shuffled and cut into ``k`` slices with ``partition``. Slice ``i`` goes into
    test fold ``i``; the remaining ids of that genre go into train fold ``i``.

    Each fold is written to ``evaluation/train_<suffix><i>.csv`` and
    ``evaluation/test_<suffix><i>.csv``, one id per line.

    Parameters
    ----------
    k : int
        Number of folds.
    suffix : str
        Text inserted in the file names between ``train_``/``test_`` and the
        fold index.
    seed : optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        folds are reproducible. When ``None`` (default) the global ``random``
        state is used, as before.

    Returns
    -------
    (list of set, list of set)
        ``train`` and ``test``, each holding ``k`` sets of ids.
    """
    rng = random if seed is None else random.Random(seed)
    test = [set() for _ in range(k)]
    train = [set() for _ in range(k)]
    for ids in genre_products.values():
        shuffled = ids[:]  # copy so the shared genre_products lists are not reordered
        rng.shuffle(shuffled)
        for i, fold in enumerate(partition(shuffled, k)):
            test[i].update(fold)
            train[i].update(set(ids).difference(fold))

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs("evaluation", exist_ok=True)
    for i in range(k):
        # Ids are written sorted so the files do not depend on set iteration order;
        # `with` guarantees the data is flushed and the handles are closed.
        with open("evaluation/train_"+suffix+str(i)+".csv","w") as ftr:
            ftr.write("\n".join(sorted(train[i])))
        with open("evaluation/test_"+suffix+str(i)+".csv","w") as fts:
            fts.write("\n".join(sorted(test[i])))
    return train, test

def load_folds(k,suffix):
    """Load ``k`` previously saved cross-validation folds from ``evaluation/``.

    Reads ``evaluation/train_<suffix><i>.csv`` and
    ``evaluation/test_<suffix><i>.csv`` for ``i`` in ``0..k-1``; every line of a
    file is taken as one id.

    Parameters
    ----------
    k : int
        Number of folds to read.
    suffix : str
        Text inserted in the file names between ``train_``/``test_`` and the
        fold index.

    Returns
    -------
    (list of set, list of set)
        ``train`` and ``test``, each holding ``k`` sets of ids.

    Raises
    ------
    OSError
        If one of the fold files cannot be opened.
    """
    test = []
    train = []
    for i in range(0,k):
        # `with` closes each file right after reading instead of leaking the handle.
        with open("evaluation/train_"+suffix+str(i)+".csv","r") as ftr:
            train.append(set(ftr.read().splitlines()))
        with open("evaluation/test_"+suffix+str(i)+".csv","r") as fts:
            test.append(set(fts.read().splitlines()))
    return train, test

In [5]:
def _semantic_documents(ids, semantic_data):
    """Return one text document per id: its entities followed by its categories, space-joined."""
    documents = []
    for product_id in ids:
        entry = semantic_data[product_id]
        entity_text = " ".join([str(e) for e in entry['entities']])
        category_text = " ".join(entry['categories'])
        documents.append(entity_text + " " + category_text)
    return documents


def classify(train, test, features, report=False):
    """Train a linear SVM on ``train`` and return its accuracy on ``test``.

    Parameters
    ----------
    train, test : iterable of product ids
        Keys of the module-level ``products`` mapping (and of
        ``semantic_features.json`` when the 'semantic' feature is requested).
    features : list of str
        Feature groups to use, in stacking order. Supported values:
        'bow'      - TF-IDF over uni- and bigrams of ``products[id]['all_text']``;
        'semantic' - TF-IDF over the entities and categories listed for each id
                     in ``semantic_features.json``.
        When several are given their matrices are concatenated column-wise.
    report : bool, optional
        When True, also print the per-class classification report and the
        confusion matrix. Defaults to False (score only).

    Returns
    -------
    float
        Accuracy of the predictions on ``test``.
    """
    # Freeze one iteration order so labels and feature rows stay aligned even
    # when `train`/`test` are sets.
    train_ids = list(train)
    test_ids = list(test)

    # Ground truth
    y_train = [products[product_id]['genre'] for product_id in train_ids]
    y_test = [products[product_id]['genre'] for product_id in test_ids]

    X_train_d = dict()
    X_test_d = dict()

    if 'bow' in features:
        # Create bag-of-words matrix
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', ngram_range=(1,2), analyzer='word')
        data_train = [products[product_id]['all_text'] for product_id in train_ids]
        data_test = [products[product_id]['all_text'] for product_id in test_ids]
        # Vocabulary and IDF weights are fitted on the training fold only.
        X_train_d['bow'] = vectorizer.fit_transform(data_train)
        X_test_d['bow'] = vectorizer.transform(data_test)
    if 'semantic' in features:
        # Create bag-of-categories matrix
        with open("semantic_features.json") as semantic_file:
            semantic_data = json.load(semantic_file)
        data_train = _semantic_documents(train_ids, semantic_data)
        data_test = _semantic_documents(test_ids, semantic_data)

        sem_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_train_d['semantic'] = sem_vectorizer.fit_transform(data_train)
        X_test_d['semantic'] = sem_vectorizer.transform(data_test)

    # Concatenate the requested feature groups column-wise, in the order given.
    X_train = X_train_d[features[0]]
    X_test = X_test_d[features[0]]
    for i in range(1,len(features)):
        X_train = hstack((X_train,X_train_d[features[i]]),format='csr')
        X_test = hstack((X_test,X_test_d[features[i]]),format='csr')

    # Classify
    clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    if report:
        print("classification report:")
        # No target_names: the labels are already the genre strings, and an
        # externally ordered name list could be attached to the wrong rows.
        print(metrics.classification_report(y_test, pred))
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
        print()

    return score

In [6]:
if __name__ == '__main__':
    # Read the 5 saved folds from evaluation/train_<i>.csv and evaluation/test_<i>.csv
    # (presumably shipped in the downloaded archive - confirm).
    # To draw fresh random folds instead, use: train, test = create_folds(5, "reviews")
    train, test = load_folds(5, "")

    # Each experiment is a list of feature groups that classify() stacks together.
    experiments = [['bow'], ['semantic'], ['bow', 'semantic']]
    for features in experiments:
        results = []
        print("*********************\n"+"+".join(features)+"\n")
        for i, (train_i, test_i) in enumerate(zip(train, test)):
            print("Running fold %d" % i)
            results.append(classify(train_i, test_i, features))
        # Mean and standard deviation of the accuracy across the folds.
        print("Mean accuracy: %.2f Std: %.2f" % (np.mean(results), np.std(results)))


*********************
bow

Running fold 0
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Mean accuracy: 0.63 Std: 0.02
*********************
semantic

Running fold 0
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Mean accuracy: 0.65 Std: 0.03
*********************
bow+semantic

Running fold 0
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Mean accuracy: 0.69 Std: 0.02
