<h1>Analyzing Song Lyrics using scikit-learn</h1>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
from itertools import izip_longest

In [None]:
# Load song lyrics dataset.
import io
import json
import os

# Build the path with os.path.join so it resolves on POSIX as well as on
# Windows (the original 'data\song_lyrics_2.json' literal is Windows-only).
lyrics_path = os.path.join('data', 'song_lyrics_2.json')

# io.open decodes the file as UTF-8 on both Python 2 and 3, and the context
# manager closes the handle. The decoding is done by the file object because
# json.load no longer accepts an ``encoding`` keyword on Python 3.9+.
with io.open(lyrics_path, 'rt', encoding='utf8') as lyrics_file:
    song_lyrics_2 = json.load(lyrics_file)

In [None]:
# Re-key the records from position 2000 onward: each record's first field is
# turned into a (hashable) tuple key and its second field becomes the value.
song_lyrics = dict((tuple(record[0]), record[1]) for record in song_lyrics_2[2000:])

In [None]:
# Split the titles and lyrics into two parallel tuples: titles[i] is the key
# and lyrics[i] the value of the same dict entry.
titles, lyrics = zip(*song_lyrics.items())

In [None]:
# Create an instance of the CountVectorizer (default settings).
cv = CountVectorizer()

# Learn the vocabulary and create a count matrix from the list of documents
# (one row per entry in `lyrics`).
lyrics_matrix = cv.fit_transform(lyrics)

In [None]:
# Create an instance of a TfidfTransformer (default settings).
tfidf = TfidfTransformer()

# Create a TF-IDF weighted matrix from the raw count matrix.
weighted_matrix = tfidf.fit_transform(lyrics_matrix)

In [None]:
# Display the weighted matrix as the cell output.
weighted_matrix

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

In [None]:
# Compute the pairwise distance matrix from the TF-IDF weighted matrix.
# The raw count matrix was used here before, which left the TF-IDF weights
# computed above unused even though the analysis is described as TF-IDF based.
distance_matrix = pairwise_distances(weighted_matrix)

In [None]:
def get_closest_pairs(dist_mat, names, num=10, ignore_zeros=False):
    """Find the `num` closest pairs in a square distance matrix.

    Only the strict lower triangle is searched, so every unordered pair is
    considered once and the self-distances on the diagonal are skipped.

    Args:
        dist_mat: Square 2-D array-like; dist_mat[i, j] is the distance
            between item i and item j. It is copied, never modified.
        names: Sequence of labels; names[i] labels row/column i.
        num: Maximum number of pairs to return.
        ignore_zeros: If True, pairs whose distance is exactly 0 are skipped.

    Returns:
        A list of (names[i], names[j], distance) tuples with i > j, sorted by
        ascending distance. It holds fewer than `num` entries when fewer
        candidate pairs exist.
    """
    dist_mat = np.copy(dist_mat)
    # np.inf cannot be stored in an integer array, so promote non-float input.
    if not np.issubdtype(dist_mat.dtype, np.floating):
        dist_mat = dist_mat.astype(float)
    # Mask the upper triangle and the diagonal so they are ignored in the
    # partitioning: each pair is seen once and no item is paired with itself.
    dist_mat[np.triu_indices_from(dist_mat)] = np.inf
    if ignore_zeros:
        dist_mat[dist_mat == 0] = np.inf
    # Never ask for more pairs than there are unmasked entries; otherwise
    # argpartition raises or masked (infinite) entries leak into the result.
    num = min(num, int(np.count_nonzero(np.isfinite(dist_mat))))
    if num <= 0:
        return []
    # Partition the flattened matrix so the num smallest distances come first,
    # then map the flat positions back to (row, column) indices.
    smallest = np.argpartition(dist_mat, num - 1, axis=None)[:num]
    rows, cols = np.unravel_index(smallest, dist_mat.shape)
    # (first doc, second doc, distance between docs) for every selected entry.
    dist_pairs = [(names[r], names[c], dist_mat[r, c]) for r, c in zip(rows, cols)]
    return sorted(dist_pairs, key=lambda tup: tup[2])

In [None]:
# Find the 25 closest pairs and print "<title 1>, <title 2>  :  <distance>"
# for each, using element 2 of every title tuple as the displayed name.
closest = get_closest_pairs(distance_matrix, titles, num=25)
for pair in closest:
    print('{0}, {1}  :  {2}'.format(str(pair[0][2]), str(pair[1][2]), str(pair[2])))

In [None]:
# Print the lyrics of every closest pair side by side, one lyric line per row.
for pair in closest:
    title1, title2 = pair[0], pair[1]
    left_lines = lyrics[titles.index(title1)].encode('utf8').split('\n')
    right_lines = lyrics[titles.index(title2)].encode('utf8').split('\n')
    # Header row with both names, then the two columns; the shorter song is
    # padded with empty strings so both columns have the same length.
    print('____{0:46} | ____{1}'.format(title1[2], title2[2]))
    for left, right in izip_longest(left_lines, right_lines, fillvalue=''):
        print('{0:50} | {1}'.format(left, right))
    print('')

<b>These are all instrumentals without lyrics.</b>

In [None]:
 # Collect the key of every song whose lyrics contain the substring 'Instrumental'.
 a= [k for k,v in song_lyrics.items() if 'Instrumental' in v]

In [None]:
# Display the matching keys.
a

In [None]:
# Filter out songs that do not have lyrics in the dataset: drop every entry
# that mentions 'Instrumental' within its first 20 characters. (The condition
# used to be inverted, which kept only the instrumentals.)
song_lyrics_filtered = {k: v for k, v in song_lyrics.items() if 'Instrumental' not in v[:20]}

In [None]:
# Split the titles and lyrics into two parallel tuples.
titles, lyrics = zip(*song_lyrics_filtered.items())

# Create an instance of the CountVectorizer.
cv = CountVectorizer()

# Create a count matrix from the list of documents.
lyrics_matrix = cv.fit_transform(lyrics)

# Create an instance of a TfidfTransformer.
tfidf = TfidfTransformer()

# Create a weighted matrix.
weighted_matrix = tfidf.fit_transform(lyrics_matrix)

# Compute the distance matrix from the TF-IDF weights. The raw count matrix
# was used here before, which left the weighted matrix above unused.
distance_matrix = pairwise_distances(weighted_matrix)

In [None]:
# Find the 25 closest pairs and print "<title 1>, <title 2>  :  <distance>"
# for each, using element 2 of every title tuple as the displayed name.
closest = get_closest_pairs(distance_matrix, titles, num=25)
for pair in closest:
    print('{0}, {1}  :  {2}'.format(str(pair[0][2]), str(pair[1][2]), str(pair[2])))

In [None]:
# Print the lyrics of every closest pair side by side, one lyric line per row.
for pair in closest:
    title1, title2 = pair[0], pair[1]
    left_lines = lyrics[titles.index(title1)].encode('utf8').split('\n')
    right_lines = lyrics[titles.index(title2)].encode('utf8').split('\n')
    # Header row with both names, then the two columns; the shorter song is
    # padded with empty strings so both columns have the same length.
    print('____{0:46} | ____{1}'.format(title1[2], title2[2]))
    for left, right in izip_longest(left_lines, right_lines, fillvalue=''):
        print('{0:50} | {1}'.format(left, right))
    print('')

***This may not be true anymore.***

Here we see a number of songs appear in the dataset more than once, sometimes with different names. In fact, these songs appear in the top 100 songs from multiple years, which is why they show up more than once. Because the titles are slightly different, the dict created new entries instead of overwriting the values of an existing key.

So now let's ignore all pairs with a distance of 0.

In [None]:
# Find the 10 closest pairs whose distance is not zero and print
# "<title 1>, <title 2>  :  <distance>" for each.
closest = get_closest_pairs(distance_matrix, titles, num=10, ignore_zeros=True)
for pair in closest:
    print('{0}, {1}  :  {2}'.format(str(pair[0][2]), str(pair[1][2]), str(pair[2])))

So these are our closest matches. Let's see what they look like.

In [None]:
# Print out closest matching pairs and their lyrics: one "<title> : <lyrics>"
# line for each of the two songs, followed by a blank separator line.
for pair in closest:
    for title in pair[:2]:
        print(str(title) + ' : ' + lyrics[titles.index(title)])
    print('')

This is another instance where TFIDF has found something we didn't expect. Let's remove these songs from our dataset.

In [None]:
# Collect the lyrics entries that start with one of these marker strings;
# str.startswith accepts a tuple and matches if any prefix matches.
marker_prefixes = ('We do not have', '[Instrumental]', 'Instrumental', 'Sorry, we have no')
to_remove = [text for text in lyrics if text.startswith(marker_prefixes)]
to_remove

In [None]:
# Filter out songs that do not have lyrics in the dataset: keep an entry only
# if it neither starts with a "no lyrics" message nor mentions 'Instrumental'
# within its first 20 characters.
song_lyrics_filtered = {
    key: text
    for key, text in song_lyrics.items()
    if not text.startswith(('We do not have', 'Sorry, we have no'))
    and 'Instrumental' not in text[:20]
}

#### An interesting example: Attempt #2

In [None]:
# Split the titles and lyrics of the filtered dataset into two parallel
# tuples: titles[i] is the key and lyrics[i] the value of the same entry.
titles, lyrics = zip(*song_lyrics_filtered.items())

In [None]:
# Create an instance of the CountVectorizer (default settings).
cv = CountVectorizer()

# Learn the vocabulary and create a count matrix from the list of documents
# (one row per entry in `lyrics`).
lyrics_matrix = cv.fit_transform(lyrics)

In [None]:
# Create an instance of a TfidfTransformer (default settings).
tfidf = TfidfTransformer()

# Create a TF-IDF weighted matrix from the raw count matrix.
weighted_matrix = tfidf.fit_transform(lyrics_matrix)

In [None]:
# Make the matrix memory footprint smaller by changing the dtype.
# float16 keeps only about three significant decimal digits, so the weights
# lose precision.
# NOTE(review): scipy.sparse support for float16 is limited — confirm that the
# operations applied to this matrix later accept it.
weighted_matrix = weighted_matrix.astype(np.float16)

In [None]:
# Display the down-cast weighted matrix as the cell output.
weighted_matrix

In [None]:
# Compute the distance matrix from the TF-IDF weights. The raw count matrix
# was used here before, which left the weighted matrix unused. The weights
# may have been down-cast to float16 for storage, so they are upcast
# explicitly before the distance computation.
distance_matrix = pairwise_distances(weighted_matrix.astype(np.float64))

In [None]:
# Find the 20 closest pairs whose distance is not zero and print
# "<title 1>, <title 2>  :  <distance>" for each.
closest = get_closest_pairs(distance_matrix, titles, num=20, ignore_zeros=True)
for pair in closest:
    print('{0}, {1}  :  {2}'.format(str(pair[0][2]), str(pair[1][2]), str(pair[2])))

In [None]:
# Print the lyrics of every closest pair side by side, one lyric line per row.
for pair in closest:
    title1, title2 = pair[0], pair[1]
    left_lines = lyrics[titles.index(title1)].encode('utf8').split('\n')
    right_lines = lyrics[titles.index(title2)].encode('utf8').split('\n')
    # Header row with both names, then the two columns; the shorter song is
    # padded with empty strings so both columns have the same length.
    print('____{0:46} | ____{1}'.format(title1[2], title2[2]))
    for left, right in izip_longest(left_lines, right_lines, fillvalue=''):
        print('{0:50} | {1}'.format(left, right))
    print('')

In [None]:
from collections import Counter

In [None]:
# Find songs that appear most often in closest pairs.
closest_50 = get_closest_pairs(distance_matrix, titles, num=50, ignore_zeros=True)

# Tally both members of every pair, keyed by element 2 of the title tuple
# (the same field the result-printing cells display).
count = Counter()
for pair in closest_50:
    for title in pair[:2]:
        count[title[2]] += 1

count.most_common(10)

In [None]:
# Tally how often each song occurs in the 50 closest pairs, keyed by the
# full title tuple.
count2 = Counter(title for pair in closest_50 for title in pair[:2])

In [None]:
# Display the full tally keyed by title tuple.
count2

In [None]:
# Key of the single most frequent entry in `count`.
count.most_common(1)[0][0]

In [None]:
# The ten most frequent title tuples together with their counts.
count2.most_common(10)

In [None]:
# Display every title key (the whole tuple, so the output can be long).
titles

In [None]:
# (title tuple, count) of the most frequent entry in `count2`.
count2.most_common(1)[0]

In [None]:
# Print the lyrics of the song that occurs most often in the closest pairs:
# take its title tuple from `count2` and look up the matching lyrics entry.
print(lyrics[titles.index(count2.most_common(1)[0][0])])

You Must Love Me is getting repeated a lot. Let's take a look at the terms in the song and their weights.

In [None]:
# Get the index of a song in the titles list.
# NOTE(review): titles are built from tuple keys earlier in the notebook, so
# looking up a plain string may raise ValueError — confirm the key format.
song_index = titles.index('Piece Of My Heart Lyrics  Tara Kemp')
# This index corresponds to the row for this song in the matrix. Use the
# looked-up value rather than a hard-coded row number, which goes stale as
# soon as the dataset or the filtering changes.
ind = np.nonzero(weighted_matrix[song_index])[1]
# Create an index-to-term dict by inverting the vectorizer's vocabulary.
term_ind = {index: term for term, index in cv.vocabulary_.items()}
# (weight, term) pairs for the song, sorted by ascending weight.
sorted([(weighted_matrix[song_index, i], term_ind[i]) for i in ind])

The term with the most weight is "chorus." Sometimes the most weighted term is a strong indicator of related documents. In this case, however, all songs have a chorus, and some of the songs have labeled it in their lyrics. Let's take a look.

In [None]:
# Print the stored lyrics for this song.
# NOTE(review): song_lyrics_filtered is derived from song_lyrics, whose keys
# are tuples, so this string key may raise KeyError — confirm the key format.
print(song_lyrics_filtered['Piece Of My Heart Lyrics  Tara Kemp'])

#### An interesting example: Attempt 3

How can we deal with words like "chorus" that appear in most/all documents (or even a few documents), but do not help us distinguish between different kinds of documents?

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Display scikit-learn's built-in collection of English stop words.
ENGLISH_STOP_WORDS

Let's look at the CountVectorizer documentation to get a little more detail on the stop_words parameter.

In [None]:
# Split the titles and lyrics of the filtered dataset into two parallel
# tuples: titles[i] is the key and lyrics[i] the value of the same entry.
titles, lyrics = zip(*song_lyrics_filtered.items())

In [None]:
# Extend scikit-learn's English stop words with 'chorus', a section label
# that shows up in some lyrics, and build the vectorizer with that list.
custom_stop_words = list(ENGLISH_STOP_WORDS)
custom_stop_words.append('chorus')
cv = CountVectorizer(stop_words=custom_stop_words)

# Create a count matrix from the list of documents.
lyrics_matrix = cv.fit_transform(lyrics)

In [None]:
# Create an instance of a TfidfTransformer (default settings).
tfidf = TfidfTransformer()

# Create a TF-IDF weighted matrix from the raw count matrix.
weighted_matrix = tfidf.fit_transform(lyrics_matrix)

In [None]:
# Make the matrix memory footprint smaller by changing the dtype.
# float16 keeps only about three significant decimal digits, so the weights
# lose precision.
# NOTE(review): scipy.sparse support for float16 is limited — confirm that the
# operations applied to this matrix later accept it.
weighted_matrix = weighted_matrix.astype(np.float16)

In [None]:
# Display the down-cast weighted matrix as the cell output.
weighted_matrix

In [None]:
# Compute the distance matrix from the TF-IDF weights. The raw count matrix
# was used here before, which left the weighted matrix unused. The weights
# may have been down-cast to float16 for storage, so they are upcast before
# the computation; the dense result is then stored as float16 to save memory.
distance_matrix = pairwise_distances(weighted_matrix.astype(np.float64)).astype(np.float16)
distance_matrix

In [None]:
# Find the 20 closest pairs whose distance is not zero.
closest = get_closest_pairs(distance_matrix, titles, num=20, ignore_zeros=True)
for tup in closest:
    # Titles are tuples, so print element 2 of each, as the other result cells
    # do; concatenating the tuple itself with a str raises TypeError.
    print(str(tup[0][2]) + ', ' + str(tup[1][2]) + '  :  ' + str(tup[2]))

Still seeing a lot of Tara Kemp and Madonna. Let's look at the lyrics more closely.

In [None]:
def get_weighted_vocab(name, names, matrix, cv):
    """Return the non-zero term weights of one document, sorted ascending.

    Args:
        name: Label of the document to inspect; must be present in `names`.
        names: Sequence of labels; names[i] labels row i of `matrix`.
        matrix: 2-D sparse document-term matrix (row indexing must yield a
            2-D row, as scipy sparse matrices do).
        cv: Fitted vectorizer exposing `vocabulary_`, a term -> column index
            mapping.

    Returns:
        A list of (weight, term) tuples for every non-zero entry of the
        document's row, sorted by weight and then by term.

    Raises:
        ValueError: If `name` is not in `names`.
    """
    # Get the index of the document in the names list; this index is also the
    # row for this document in the matrix.
    name_index = names.index(name)
    # Column indices of all non-zero elements in the document vector.
    ind = np.nonzero(matrix[name_index])[1]
    # Invert the vocabulary into an index -> term dict. A comprehension works
    # on Python 2 and 3 and on an empty vocabulary, unlike subscripting zip().
    term_ind = {index: term for term, index in cv.vocabulary_.items()}
    return sorted((matrix[name_index, i], term_ind[i]) for i in ind)

In [None]:
# Weighted vocabularies of the two songs under comparison.
# NOTE(review): titles hold tuple keys, so these plain-string lookups may
# raise ValueError inside get_weighted_vocab — confirm the key format.
tara = get_weighted_vocab('Piece Of My Heart Lyrics  Tara Kemp', titles, weighted_matrix, cv)
madonna = get_weighted_vocab('You Must Love Me Lyrics  Madonna', titles, weighted_matrix, cv)
# Terms that occur in both songs.
{term for _weight, term in tara} & {term for _weight, term in madonna}

In [None]:
# Display both (weight, term) lists.
tara, madonna

In [None]:
from itertools import izip_longest

In [None]:
# Print the lyrics of every closest pair side by side, one lyric line per row.
# This mirrors the earlier side-by-side cells: the lyrics are encoded to UTF-8
# before formatting and the header shows element 2 of each title tuple. This
# copy previously formatted the raw unicode text and the whole tuples.
for pair in closest:
    title1, title2 = pair[0], pair[1]
    left_lines = lyrics[titles.index(title1)].encode('utf8').split('\n')
    right_lines = lyrics[titles.index(title2)].encode('utf8').split('\n')
    print('____{0:46} | ____{1}'.format(title1[2], title2[2]))
    # The shorter song is padded with empty strings so both columns line up.
    for left, right in izip_longest(left_lines, right_lines, fillvalue=''):
        print('{0:50} | {1}'.format(left, right))
    print('')

In [None]:
# Smallest and largest IDF weight learned by the transformer.
min(tfidf.idf_), max(tfidf.idf_)

In [None]:
# Pair every term with its IDF weight. cv.vocabulary_ maps term -> column
# index and tfidf.idf_ is ordered by column index, so each weight is looked
# up by index; zipping the dict's keys against idf_ pairs terms with
# unrelated weights because dict order is not column order.
sorted((term, tfidf.idf_[index]) for term, index in cv.vocabulary_.items())

In [None]:
# Toy example: TF-IDF vectors for four short sentences.
# The import lives in this cell so it runs on a fresh kernel; the notebook's
# own import of TfidfVectorizer only appears in a later cell.
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=1)
# Note: this rebinds `tfidf` from the TfidfTransformer to a sparse matrix.
tfidf = vect.fit_transform(["I'd like an apple",
                            "An apple a day keeps the doctor away",
                            "Never compare an apple to an orange",
                            "I prefer scikit-learn to Orange"])
# Rows are L2-normalised by default, so this product holds the pairwise
# cosine similarities. toarray() is used because the `.A` shorthand is not
# available on sparse matrices in recent SciPy releases.
(tfidf * tfidf.T).toarray()
# Result that was pasted into the original cell (kept as a comment so it is
# not executed as code):
# array([[ 1.        ,  0.25082859,  0.39482963,  0.        ],
#        [ 0.25082859,  1.        ,  0.22057609,  0.        ],
#        [ 0.39482963,  0.22057609,  1.        ,  0.26264139],
#        [ 0.        ,  0.        ,  0.26264139,  1.        ]])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build TF-IDF vectors for four short example sentences.
# Note: this rebinds `tfidf` to the resulting matrix, so it no longer names
# the TfidfTransformer created earlier in the notebook.
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(["I'd like an apple",
                             "An apple a day keeps the doctor away",
                             "Never compare an apple to an orange",
                             "I prefer scikit-learn to Orange"])


In [None]:
# Pairwise distances between the four example sentences.
pairwise_distances(tfidf)

In [None]:
# Second name for the same distance matrix object (no copy is made).
dists = distance_matrix

http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
TextBlob