In [8]:
# SETUP IMPORTS

from os.path import join
from os import listdir
import pandas as pd
import numpy as np
import json
import nltk
import networkx as nx

# Set to True to rebuild the per-genre pickles from the raw JSON dumps.
GENERATE_DATA = False
GENRES = ["blues", "gospel", "rap", "country", "rock"]
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_DIR = "/n/fs/guoweis-18iw/get_data/lyrics"

if GENERATE_DATA:
    # Read data
    df = pd.DataFrame(np.nan, index=[], columns=['artist', 'title', 'album', 'year', 'lyrics', 'genre'])

    ct = 0  # running row index, shared across all genres
    for genre in GENRES:
        genre_dir = join(DATA_DIR, genre)
        fns = listdir(genre_dir)
        for i, fn in enumerate(fns):
            if i % 10 == 0:
                print("Done with " + str(i) + " of " + str(len(fns)) + " files.")
            fp = join(genre_dir, fn)
            # Use a context manager so each file handle is closed as soon as it
            # has been parsed instead of leaking one handle per JSON file.
            with open(fp) as f:
                data = json.load(f)
            songs_data = data["songs"]
            for song in songs_data:
                df.loc[ct, "genre"] = genre
                for key in song.keys():
                    # Skip the bulky fields that are not needed for analysis.
                    if key == "raw" or key == "image":
                        continue
                    df.loc[ct, key] = song[key]
                ct += 1
        # Snapshot after each genre; df is cumulative, so each file holds every
        # row collected so far (not only the rows for `genre`).
        df.to_pickle(genre + ".data")


In [2]:
# Read Data
# NOTE(review): the generation cell above only writes "<genre>.data" files; how
# "all.data" is produced is not shown in this notebook — confirm its provenance.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
ALL_DATA_FN = "all.data"
df = pd.read_pickle(ALL_DATA_FN)

In [3]:
# Split into training and test data
SPLIT_SEED = 0        # fixed seed so the split is identical after Restart & Run All
TEST_THRESHOLD = 0.8  # rows whose uniform draw exceeds this go to the test set (~20%)

# A local generator keeps the split reproducible without touching the global
# np.random state that other cells may rely on.
_split_rng = np.random.RandomState(SPLIT_SEED)
df["randn"] = _split_rng.uniform(0, 1, df.shape[0])
df["data_split"] = np.where(df["randn"] > TEST_THRESHOLD, "test", "train")

In [4]:
# Get label distributions of training and test sets
def get_distribution(labels):
    """Count how many times each label occurs.

    Args:
        labels: iterable of hashable labels.

    Returns:
        dict mapping each distinct label to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return counts

# Calculate dataset statistics: genre counts for the whole frame and per split.
_splits = (
    ("Entire Dataset", df),
    ("Training Set", df.query("data_split == 'train'")),
    ("Test Set", df.query("data_split == 'test'")),
)
for _title, _frame in _splits:
    print(_title)
    print(get_distribution(_frame["genre"]))

Entire Dataset
{'rock': 16089, 'country': 16188, 'rap': 10902, 'gospel': 9086, 'blues': 7146}
Training Set
{'rock': 12817, 'country': 12986, 'rap': 8733, 'gospel': 7305, 'blues': 5671}
Test Set
{'rock': 3272, 'country': 3202, 'rap': 2169, 'gospel': 1781, 'blues': 1475}


In [33]:
# Extract features
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer; vocab() below uses it to stem each word.
stemmer = SnowballStemmer("english")

# Characters that survive lyrics_strip: lowercase ASCII letters, space,
# apostrophe and newline.
ACCEPTED_CHARS = set("abcdefghijklmnopqrstuvwxyz '\n")
def lyrics_strip(lyrics):
    """Normalise raw lyrics text.

    Trims surrounding whitespace, lowercases, and drops every character that
    is not in ACCEPTED_CHARS (so digits and punctuation other than the
    apostrophe are removed, while line breaks are kept).
    """
    lowered = lyrics.strip().lower()
    return ''.join(ch for ch in lowered if ch in ACCEPTED_CHARS)

def lyrics_to_linelist(lyrics):
    """Split lyrics into lines, and each line into whitespace-separated words.

    Returns a list with one entry per "\n"-delimited line; each entry is the
    list of words on that line (empty list for a blank line).
    """
    return [raw_line.split() for raw_line in lyrics.split("\n")]

def linelist_to_wordlist(lines):
    """Flatten a list of word lists into a single list of words, in order."""
    words = []
    for line in lines:
        words.extend(line)
    return words

def lyrics_to_wordlist(lyrics):
    """Return every whitespace-separated word in the lyrics, ignoring line breaks."""
    words = lyrics.split()
    return words

def avg_word_len(lyrics):
    """Mean number of characters per word in the lyrics.

    Returns 0.0 when the lyrics contain no words (empty or whitespace-only
    text), instead of raising ZeroDivisionError.
    """
    lengths = [len(word) for word in lyrics_to_wordlist(lyrics)]
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)

def avg_line_len(lyrics):
    """Mean number of words per line.

    Blank lines are included in the line count as zero-word lines.
    """
    words_per_line = list(map(len, lyrics_to_linelist(lyrics)))
    return sum(words_per_line) / len(words_per_line)

def total_num_lines(lyrics):
    """Number of "\n"-delimited lines in the lyrics (blank lines included)."""
    return len(lyrics_to_linelist(lyrics))

def total_num_words(lyrics):
    """Number of whitespace-separated words in the lyrics."""
    return len(lyrics_to_wordlist(lyrics))

def num_contractions(lyrics):
    """Count the words that contain an apostrophe.

    Any word with a "'" is counted, so this also picks up possessives and
    dropped-letter spellings, not only true contractions.
    """
    return sum(1 for word in lyrics_to_wordlist(lyrics) if "'" in word)

def vocab(lyrics):
    """Return the set of distinct stemmed words appearing in the lyrics."""
    return {stemmer.stem(word) for word in lyrics_to_wordlist(lyrics)}

def vocab_size(lyrics):
    """Number of distinct stemmed words in the lyrics."""
    stems = vocab(lyrics)
    return len(stems)

# Pull one song (row label 1) and tokenise it, for spot-checking the helpers above.
# NOTE(review): the output recorded under this cell is not produced by these
# statements — it presumably came from feature-function calls that were since
# removed. Confirm and re-run the cell.
lyrics = df.loc[1,"lyrics"]
linelist = lyrics_to_linelist(lyrics)
wordlist = linelist_to_wordlist(linelist)



3.981651376146789
5.45
20
109
13
{'reason', 'way', 'here', 'be', 'honey,', 'all', 'then', "i'm", 'know,', 'as', 'babe', 'deep', 'just', 'drivin', 'babi', 'ya', 'down', 'string', 'need', 'well,', 'that', 'a-yeah,', 'you', 'hurt', 'darlin', 'tell', 'cryin', 'your', 'a-just', 'what', 'dri', 'the', 'to', 'tie', 'want', 'crazy,', 'and', 'string,', 'we', 'stop', 'it', 'never', 'come', 'crazi', 'part', 'a-honey,', 'i', 'girl,', 'know', 'a', 'so', 'around', 'sweet', 'would', "i'll", 'eye', 'insid', 'me', 'ta', 'yeah,', 'heart', 'goin', 'can', 'a-honey'}
64


In [66]:
# Extract rhyme features
### Adapted from https://github.com/edwadli/rapgraph/blob/master/src/rapper.py
from nltk.corpus import cmudict
# Pronunciation dictionary: phonemes() below treats each entry as a list of
# pronunciations, each of which is a list of phoneme strings.
# NOTE(review): presumably requires the NLTK "cmudict" corpus to be downloaded — confirm.
transcr = cmudict.dict()
_NULL_ = '_NULL_'
phs = 'AA AE AH AO AW AY B CH D DH EH ER EY F G HH IH\
    IY JH K L M N NG OW OY P R S SH T TH UH UW V W Y Z'.split()
phs_vowels = set('AA AE AH AO AW AY EH ER EY IH IY OW OY UH UW'.split())

def phonemes(words):
    """Look up stress-free pronunciations for each word.

    Args:
        words: iterable of word strings; lookups are case-insensitive.

    Returns:
        dict mapping each lowercased word to a list of distinct
        pronunciations, each a tuple of phoneme strings with the stress
        digits removed. Words missing from the dictionary map to
        [(_NULL_,)].

    Unlike a naive in-place strip, this never modifies the lists stored in
    the shared `transcr` dictionary, and the pronunciations keep their
    dictionary order so results are deterministic.
    """
    result = {}
    for word in words:
        word = word.lower()
        if word in result:
            # Repeated words need only one lookup.
            continue
        # TODO: generate a guess on the pronunciation for unknown words
        possible_pronunciations = transcr.get(word, [[_NULL_]])
        distinct = []
        for pronunciation in possible_pronunciations:
            # Strip out emphasis (stress digits) on vowels, building a new
            # tuple rather than rewriting the dictionary's own list.
            stripped = tuple(
                ''.join(c for c in ph if not c.isdigit()) for ph in pronunciation
            )
            # Variants that differ only in stress collapse to one entry.
            if stripped not in distinct:
                distinct.append(stripped)
        result[word] = distinct
    return result

def phonemeSimilarity(ph_a, ph_b):
    """Heuristic rhyming similarity between two phonemes, in the range [0, 1].

    Scores: 0 if either phoneme is the _NULL_ sentinel; 1 for identical
    phonemes; 0.3 for two different vowels; 0.05 for two different
    consonants; 0 for a vowel/consonant pair.
    """
    if _NULL_ in (ph_a, ph_b):
        return 0.
    if ph_a == ph_b:
        # rhyme
        return 1.
    a_is_vowel = ph_a in phs_vowels
    b_is_vowel = ph_b in phs_vowels
    if a_is_vowel and b_is_vowel:
        # both vowels, likely to rhyme
        return 0.3
    if not a_is_vowel and not b_is_vowel:
        # both consonants, could help rhyme
        return 0.05
    return 0.

def alignPhonemeSequences(a_seq, b_seq):
    """Locally align two phoneme sequences (Smith-Waterman style).

    The match score for a phoneme pair is phonemeSimilarity() rescaled from
    [0, 1] onto [-10, 10]; gaps cost 1. Cell values are floored at 0.

    Returns:
        (best, H) where H is the (len(b_seq)+1) x (len(a_seq)+1) scoring
        matrix as a list of lists and best is the largest value in H.
    """
    gap_penalty = -1.
    lowest, highest = -10., 10.
    span = highest - lowest
    n_cols = len(a_seq) + 1
    n_rows = len(b_seq) + 1
    H = [[0] * n_cols for _ in range(n_rows)]

    # Fill the DP table row by row; row/column 0 stay at zero.
    for r in range(1, n_rows):
        ph_b = b_seq[r - 1]
        for c in range(1, n_cols):
            similarity = phonemeSimilarity(a_seq[c - 1], ph_b)
            diagonal = H[r - 1][c - 1] + similarity * span + lowest
            from_above = H[r - 1][c] + gap_penalty
            from_left = H[r][c - 1] + gap_penalty
            H[r][c] = max(0, diagonal, from_above, from_left)

    # The alignment score is the largest cell anywhere in the table.
    best = max(max(table_row) for table_row in H)
    return best, H

def end_rhyme_score(a_seq, b_seq):
    """Alignment score in the bottom-right cell, i.e. for alignments ending at both sequence ends."""
    return alignPhonemeSequences(a_seq, b_seq)[1][-1][-1]

def aligned_rhyme_score(a_seq, b_seq):
    """Best local-alignment score between the two phoneme sequences."""
    best, _ = alignPhonemeSequences(a_seq, b_seq)
    return best

def aligned_matrix(a_seq, b_seq):
    """Full alignment scoring matrix for the two phoneme sequences."""
    _, matrix = alignPhonemeSequences(a_seq, b_seq)
    return matrix

In [135]:
# Get line adjacency graph

def get_rhyme_adj_graph(lyrics, thresh = 0):
    """Build a word-level rhyme adjacency matrix for a song.

    Each word is compared with every later word on its own line and with
    every word on the following line. The edge weight is the best
    aligned_rhyme_score over all pronunciation pairs of the two words.

    Args:
        lyrics: lyrics text; words are matched case-insensitively.
        thresh: weights less than or equal to this value are zeroed.

    Returns:
        Symmetric (num_words x num_words) numpy array of edge weights, with
        words indexed in reading order.
    """
    linelist = lyrics_to_linelist(lyrics)
    wordlist = lyrics_to_wordlist(lyrics)
    pronunciations = phonemes(wordlist)
    num_wrds = len(wordlist)
    graph = np.zeros((num_wrds, num_wrds))

    # Lyrics repeat words heavily, so remember the score of each word pair.
    pair_scores = {}

    def best_pair_score(word_a, word_b):
        """Best rhyme score over all pronunciation pairs of two lowercased words."""
        key = (word_a, word_b)
        if key not in pair_scores:
            best = 0
            for p1 in pronunciations[word_a]:
                for p2 in pronunciations[word_b]:
                    best = max(best, aligned_rhyme_score(p1, p2))
            pair_scores[key] = best
        return pair_scores[key]

    i = 0  # global index of the current word across the whole song
    for j, line in enumerate(linelist):
        # Comparison window: this line plus the next one, when there is one.
        full_phrase = line
        if j < len(linelist) - 1:
            full_phrase = line + linelist[j + 1]
        for k, word in enumerate(line):
            # phonemes() keys its result by lowercased word, so look up the
            # lowercased form to avoid a KeyError on mixed-case lyrics.
            word1 = word.lower()
            for l, word2 in enumerate(full_phrase[k + 1:]):
                w = best_pair_score(word1, word2.lower())
                # Words after position k in the window sit at global index i+1+l.
                graph[i, i + 1 + l] = w
                graph[i + 1 + l, i] = w
            i += 1
    graph[graph <= thresh] = 0
    return graph

# Build the rhyme graph for the first song, keeping only edges with weight > 10.
lyrics = lyrics_strip(df.loc[0, "lyrics"])
g = get_rhyme_adj_graph(lyrics, 10)

# nx.from_numpy_matrix was removed in networkx 3.0; use from_numpy_array when
# it exists and fall back to the old name on older installations.
_to_graph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
G = _to_graph(g)

print(np.count_nonzero(g))  # non-zero matrix entries (each undirected edge counted twice)
print(np.size(g))           # total number of matrix entries
print(np.unique(g))         # distinct edge weights present

print(nx.number_connected_components(G))

def edge_density(rhyme_graph, weighted=False):
    """Density of the rhyme adjacency matrix.

    Args:
        rhyme_graph: 2-D numpy adjacency matrix.
        weighted: if True, use the sum of all entries; otherwise the number
            of non-zero entries.

    Returns:
        That total divided by the number of matrix entries, or 0.0 for an
        empty matrix (e.g. a song with no words) instead of dividing by zero.
    """
    size = np.size(rhyme_graph)
    if size == 0:
        return 0.0
    if weighted:
        return np.sum(rhyme_graph) / size
    return np.count_nonzero(rhyme_graph) / size

def edge_var(rhyme_graph):
    """Variance of the strictly positive entries of the adjacency matrix."""
    positive_weights = rhyme_graph[rhyme_graph > 0]
    return np.var(positive_weights)

def degree_var(rhyme_graph, weighted=False):
    """Variance of the per-row degree of the adjacency matrix.

    A row's degree is the sum of its entries when `weighted` is True, and
    its number of non-zero entries otherwise.
    """
    row_degree = np.sum if weighted else np.count_nonzero
    degrees = [row_degree(row) for row in rhyme_graph]
    return np.var(degrees)

def degree_avg(rhyme_graph, weighted=False):
    """Average degree per vertex (row) of the adjacency matrix.

    Args:
        rhyme_graph: 2-D numpy adjacency matrix.
        weighted: if True, use the sum of all entries; otherwise the number
            of non-zero entries.

    Returns:
        That total divided by the number of rows, or 0.0 for a matrix with
        no rows (e.g. a song with no words) instead of dividing by zero.
    """
    num_vertices = len(rhyme_graph)
    if num_vertices == 0:
        return 0.0
    if weighted:
        return np.sum(rhyme_graph) / num_vertices
    return np.count_nonzero(rhyme_graph) / num_vertices

def comp_size_avg(rhyme_graph):
    """Average number of vertices per connected component.

    Returns 0.0 for a graph with no vertices (e.g. a song with no words)
    instead of dividing by zero. Component counting is delegated to
    num_comp() so the matrix-to-graph conversion lives in one place.
    """
    num_vertices = len(rhyme_graph)
    if num_vertices == 0:
        return 0.0
    return num_vertices / num_comp(rhyme_graph)

def num_comp(rhyme_graph):
    """Number of connected components in the graph given by the adjacency matrix."""
    # nx.from_numpy_matrix was removed in networkx 3.0; use from_numpy_array
    # when it exists and fall back to the old name on older installations.
    to_graph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
    return nx.number_connected_components(to_graph(rhyme_graph))

124
29929
[ 0. 14. 16. 18. 19. 20. 30. 40.]
113


array([40., 30., 40., 30., 20., 30., 20., 30., 20., 20., 30., 30., 20.,
       20., 30., 30., 20., 20., 30., 20., 30., 20., 30., 20., 30., 18.,
       20., 30., 30., 18., 14., 20., 14., 20., 19., 19., 20., 19., 20.,
       30., 19., 30., 20., 30., 30., 30., 30., 30., 20., 20., 30., 30.,
       30., 20., 30., 20., 20., 30., 20., 20., 18., 20., 18., 20., 30.,
       30., 20., 30., 30., 30., 30., 30., 20., 20., 30., 30., 30., 20.,
       30., 20., 20., 30., 20., 20., 20., 20., 18., 18., 18., 16., 20.,
       20., 20., 20., 20., 18., 16., 20., 20., 20., 20., 20., 20., 20.,
       30., 30., 20., 30., 30., 30., 30., 30., 20., 20., 30., 30., 30.,
       20., 30., 20., 20., 30., 20., 20.])

In [None]:
# Scratch inspection of the uniform draws used for the train/test split.
# NOTE(review): this displays the whole column; consider df["randn"].head() or removing the cell.
df["randn"]