# Datasets

## 1. The Million Song Dataset

#### Description: 
   A freely-available collection of audio features and metadata for a million contemporary popular music tracks.
   
   Subset Size: 1.8 GB / 10,000 songs

#### Functions to get the tempo, meter, and loudness from HDF5 files for each song:

In [3]:
import tables

# open an hdf5 file


def open_h5_file_read(h5filename):
    """Open the HDF5 file at ``h5filename`` in read-only mode.

    Returns the handle produced by ``tables.open_file``. The file is not
    closed here, so the caller has to close the returned handle.
    """
    h5 = tables.open_file(h5filename, mode='r')
    return h5

# get duration from an hdf5 file


def get_duration(h5, songidx=0):
    """Return the ``duration`` column value for one song.

    Args:
        h5: Open song file exposing ``root.analysis.songs.cols``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    song_columns = h5.root.analysis.songs.cols
    return song_columns.duration[songidx]

# get loudness from an hdf5 file


def get_loudness(h5, songidx=0):
    """Return the ``loudness`` column value for one song.

    Args:
        h5: Open song file exposing ``root.analysis.songs.cols``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    song_columns = h5.root.analysis.songs.cols
    return song_columns.loudness[songidx]

# get tempo from an hdf5 file


def get_tempo(h5, songidx=0):
    """Return the ``tempo`` column value for one song.

    Args:
        h5: Open song file exposing ``root.analysis.songs.cols``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    song_columns = h5.root.analysis.songs.cols
    return song_columns.tempo[songidx]

# get pitches array from an hdf5 file


def get_segments_pitches(h5, songidx=0):
    """Return the rows of ``segments_pitches`` that belong to one song.

    The songs table stores, per song, the row offset at which that song's
    entries begin (``idx_segments_pitches``). A song's rows run from its own
    offset up to the next song's offset; the last song in the table has no
    successor, so its rows run to the end of the array.

    Args:
        h5: Open song file exposing ``root.analysis``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_pitches
    is_last_song = analysis.songs.nrows == songidx + 1
    # A stop of None leaves the slice open-ended, exactly like ``start:``.
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_pitches[offsets[songidx]:stop, :]

# get timbre array from an hdf5 file


def get_segments_timbre(h5, songidx=0):
    """Return the rows of ``segments_timbre`` that belong to one song.

    The songs table stores, per song, the row offset at which that song's
    entries begin (``idx_segments_timbre``). A song's rows run from its own
    offset up to the next song's offset; the last song in the table has no
    successor, so its rows run to the end of the array.

    Args:
        h5: Open song file exposing ``root.analysis``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_segments_timbre
    is_last_song = analysis.songs.nrows == songidx + 1
    # A stop of None leaves the slice open-ended, exactly like ``start:``.
    stop = None if is_last_song else offsets[songidx + 1]
    return analysis.segments_timbre[offsets[songidx]:stop, :]

# get bars start array from an hdf5 file


def get_bars_start(h5, songidx=0):
    """Return the slice of ``bars_start`` that belongs to one song.

    ``idx_bars_start`` in the songs table gives the offset at which each
    song's entries begin. The slice ends at the next song's offset, or at the
    end of the array when ``songidx`` is the last row of the songs table.

    Args:
        h5: Open song file exposing ``root.analysis``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_bars_start
    first = offsets[songidx]
    if songidx + 1 == analysis.songs.nrows:
        # Last song: there is no following offset to stop at.
        return analysis.bars_start[first:]
    return analysis.bars_start[first:offsets[songidx + 1]]

# get beats start array from an hdf5 file


def get_beats_start(h5, songidx=0):
    """Return the slice of ``beats_start`` that belongs to one song.

    ``idx_beats_start`` in the songs table gives the offset at which each
    song's entries begin. The slice ends at the next song's offset, or at the
    end of the array when ``songidx`` is the last row of the songs table.

    Args:
        h5: Open song file exposing ``root.analysis``.
        songidx: Row of the analysis songs table to read (default: first row).
    """
    analysis = h5.root.analysis
    offsets = analysis.songs.cols.idx_beats_start
    first = offsets[songidx]
    if songidx + 1 == analysis.songs.nrows:
        # Last song: there is no following offset to stop at.
        return analysis.beats_start[first:]
    return analysis.beats_start[first:offsets[songidx + 1]]

# identify meter from an hdf5 file


def get_meter(h5, sonidx=0):
    """Estimate a song's meter as the whole number of beats per bar.

    Args:
        h5: Open song file, as accepted by ``get_beats_start`` and
            ``get_bars_start``.
        sonidx: Row of the analysis songs table to read (default: first row).
            The spelling is kept as-is so existing keyword callers keep
            working.

    Returns:
        int: Number of beat entries divided by number of bar entries,
        rounded down.

    Raises:
        ZeroDivisionError: If the song has no bar entries.
    """
    # Forward the song index; previously it was dropped, so the result was
    # always computed for song 0 regardless of the argument.
    beats = get_beats_start(h5, sonidx)
    bars = get_bars_start(h5, sonidx)
    return int(beats.shape[0] // bars.shape[0])

## 2. Lyrics of 10,000 songs 

#### Description:
   Web-scraped from www.azlyrics.com and stored in a CSV file.

#### Functions to perform topic modeling and sentiment analysis for each song:

In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS

# Perform topic modeling


def get_topics(song_id):
    """Fit a single-topic LDA model to one song's lyrics and return that topic.

    The lyrics are split into sentences, lower-cased, and reduced to
    alphanumeric non-stopword tokens; each sentence is one LDA document.

    Args:
        song_id: Positional row index into the module-level ``df`` table.

    Returns:
        The first entry of ``LdaModel.print_topics`` for the fitted model,
        limited to ``num_words`` (5) words.
    """
    num_topics = 1
    passes = 5
    num_words = 5

    # NOTE(review): ``df`` is not defined in this cell; it is assumed to be a
    # DataFrame whose row values are lyric strings -- confirm against the
    # cell that loads the CSV.
    lyrics = df.iloc[song_id]
    text = ''.join(np.array(lyrics).tolist())
    # Was ``text_pop.replace(...)``: an undefined name that raised NameError.
    striptext = text.replace('\n\n', ' ').replace('\n', ' ')

    texts = [
        [word for word in sentence.lower().split()
         if word not in STOPWORDS and word.isalnum()]
        for sentence in sent_tokenize(striptext)
    ]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    # Use the named settings; previously ``passes`` was hard-coded here and
    # the ``passes`` local was reused as the number of words to print.
    lda = LdaModel(corpus, id2word=dictionary,
                   num_topics=num_topics, passes=passes)
    return lda.print_topics(num_words=num_words)[0]

# Perform sentiment analysis


def get_pos_neg_words():
    """Download the positive and negative word lists used for sentiment scoring.

    Returns:
        tuple: ``(positive_words, negative_words)``, each a list of strings.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with an HTTP error status.
    """
    import requests

    def get_words(url):
        """Fetch one word list, dropping blank lines and lines containing ';'."""
        response = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an HTTP error page as if it were words.
        response.raise_for_status()
        lines = response.content.decode('latin-1').split('\n')
        # strip() removes the '\r' that CRLF line endings would otherwise
        # leave on every word, which would stop the words from ever matching.
        stripped = (line.strip() for line in lines)
        # Lines containing ';' are skipped (presumably header comments in the
        # source files -- confirm against the downloaded lists).
        return [word for word in stripped if word and ';' not in word]

    # Get lists of positive and negative words
    p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
    n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
    positive_words = get_words(p_url)
    negative_words = get_words(n_url)
    return positive_words, negative_words


def do_pos_neg_sentiment_analysis(song_id, positive_words=None, negative_words=None):
    """Score one song's lyrics by their share of positive and negative words.

    The lyrics are split into sentences, lower-cased, and reduced to
    alphanumeric non-stopword tokens; the percentages are taken over those
    tokens.

    Args:
        song_id: Positional row index into the module-level ``df`` table.
        positive_words: Optional iterable of positive words. When omitted, the
            lists are downloaded with ``get_pos_neg_words``.
        negative_words: Optional iterable of negative words, handled the same
            way. Passing both lists avoids a download on every call when
            scoring many songs.

    Returns:
        tuple: ``(positive_percent, negative_percent)`` as floats, or
        ``(0.0, 0.0)`` when the lyrics contain no usable tokens.
    """
    if positive_words is None or negative_words is None:
        fetched_positive, fetched_negative = get_pos_neg_words()
        if positive_words is None:
            positive_words = fetched_positive
        if negative_words is None:
            negative_words = fetched_negative
    # Sets give constant-time membership tests for the per-word lookups below.
    positive = set(positive_words)
    negative = set(negative_words)

    # NOTE(review): ``df`` is not defined in this cell; it is assumed to be a
    # DataFrame whose row values are lyric strings -- confirm against the
    # cell that loads the CSV.
    lyrics = df.iloc[song_id]
    text = ''.join(np.array(lyrics).tolist())
    # Was ``text_pop.replace(...)``: an undefined name that raised NameError.
    striptext = text.replace('\n\n', ' ').replace('\n', ' ')

    # Flatten to one token list; the original passed a list of lists to
    # word_tokenize, which only accepts a string.
    words = [
        word
        for sentence in sent_tokenize(striptext)
        for word in sentence.lower().split()
        if word not in STOPWORDS and word.isalnum()
    ]
    if not words:
        # Nothing to score; avoid dividing by zero.
        return (0.0, 0.0)

    total = len(words)
    cpos = sum(1 for word in words if word in positive)
    cneg = sum(1 for word in words if word in negative)
    return (cpos / total * 100, cneg / total * 100)

# Perform k-means clustering on datasets

#### Input: a matrix X which contains 5 features
 tempo, loudness, meter, positive ratio for lyrics, negative ratio for lyrics

#### Output: 
 labels for 10,000 songs range from 1 to 50

In [None]:
from sklearn.cluster import KMeans

# Build the k-means model and fit it to the feature matrix X in one step.
# NOTE(review): the description above mentions labels ranging up to 50, but
# only 3 clusters are requested here -- confirm which is intended.
kmeans = KMeans(n_clusters=3).fit(X)
# Cluster index assigned to each row of X
labels = kmeans.predict(X)
# Coordinates of the fitted cluster centres
centroids = kmeans.cluster_centers_