# Lyrics Analysis

This notebook uses the MetroLyrics lyrics dataset available here - https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics

# Reading/Parsing/Tokenizing Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib as mpl
import nltk
from math import pi
from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)
from bokeh.plotting import figure
from bokeh.layouts import gridplot
import numpy as np

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Load the lyrics table; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="lyrics.csv")

In [None]:
# Add a column holding the NLTK-tokenized lyrics, re-joined with single spaces
# so that later cells can recover the tokens with a plain str.split().
# Python 3 strings are already unicode, so there is nothing to .decode()
# (str has no such method there). Missing lyrics become "" rather than the
# literal token "nan" that str(NaN) would produce.
df["tokenized"] = (
    df["lyrics"]
    .fillna("")
    .astype(str)
    .apply(lambda text: " ".join(nltk.word_tokenize(text)))
)

# Visualizing the Text

## Scatter plot of the word count of songs over the years, plus a bar chart of the mean song length per year

In [None]:
# Word count per song across the years (scatter) and mean word count per year (bar).
sample = df.copy()
# Vectorized split + length; a missing lyric counts as 0 words instead of
# raising AttributeError on a float NaN.
sample["word_count"] = sample["lyrics"].fillna("").astype(str).str.split().str.len()
data = ColumnDataSource(data=dict(year=sample["year"], count=sample["word_count"]))
scatter = figure(title="word count over the years")
scatter.xaxis.axis_label = 'year'
scatter.yaxis.axis_label = 'words per song'
scatter.scatter('year', 'count', source=data)

# Mean word count per year. reset_index(name=...) names the aggregated column;
# passing a {'new_name': func} dict to SeriesGroupBy.agg is no longer supported by pandas.
test = sample.groupby('year')["word_count"].mean().reset_index(name='word_count_mean')
# Years become strings because the bar chart uses a categorical x-range.
test['year'] = test['year'].astype(str)
years = list(test['year'].unique())
cd = ColumnDataSource(test)
bar = figure(x_range=years, title="word count over the years")
bar.xaxis.axis_label = 'year'
bar.yaxis.axis_label = 'mean words per song'
bar.vbar(x='year', top='word_count_mean', width=1, source=cd)

# Show both charts side by side.
grid = gridplot([[scatter, bar]])
show(grid)

## HeatMap for occurrence of the word love in genre vs year

In [None]:
# Heat map: percentage of songs whose lyrics contain "love", per genre and year.

import functools

# NOTE: `df` itself is narrowed here (rows with missing values and years up to
# 1960 are dropped); the cells further down read this filtered frame.
df = df.dropna()
df = df[df.year > 1960]

# Number of songs per (genre, year).
count = df.groupby(['genre', 'year'], as_index=False).count()
count = count[['genre', 'year', 'song']]

# Number of songs per (year, genre) whose lyrics contain the substring "love"
# (case-sensitive, as str.contains is by default).
love = (
    df['lyrics'].str.contains('love')
    .groupby([df['year'], df['genre']])
    .sum()
    .reset_index(name='love_lyrics')
)
love = love.fillna(0)

temp = [count, love]
love_freq = functools.reduce(lambda left, right: pd.merge(left, right, on=['genre', 'year'], how='outer'), temp)
# Express the share as a percentage: the colour bar below formats ticks with
# "%d%%", which would print a 0..1 fraction as just "0%" or "1%".
love_freq['love_lyrics'] = 100 * love_freq['love_lyrics'] / love_freq['song']

# Categorical axes need string factors.
love_freq['year'] = love_freq['year'].astype(str)

# Sort the years so the x axis is chronological; unique() alone returns them in
# order of first appearance across genres. All years are > 1960 here, so the
# string sort matches the numeric one as long as they are four-digit years.
years = sorted(love_freq.year.unique())
genre = list(love_freq.genre.unique())

colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LinearColorMapper(palette=colors, low=love_freq.love_lyrics.min(), high=love_freq.love_lyrics.max())

source = ColumnDataSource(love_freq)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# width/height replace plot_width/plot_height, which Bokeh 3 no longer accepts;
# the artist-cluster figure in this notebook already uses width/height.
p = figure(title="Heat Map for word Love in Song Genres across the Years",
           x_range=years, y_range=list(reversed(genre)),
           x_axis_location="above", width=900, height=400,
           tools=TOOLS, toolbar_location='below',
           tooltips=[("genre", "@genre"), ("year", "@year"), ("love", "@love_lyrics{0.0}%")])

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

# One cell per (year, genre), coloured by the percentage of "love" songs.
p.rect(x='year', y='genre', width=1, height=1,
       source=source,
       fill_color={'field': 'love_lyrics', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d%%"),
                     label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')


show(p)      # show the plot

## Observation
Heatmap - "Country, Jazz, Pop" have the most songs containing the word "love", and this has been consistent over the years, while Metal, Electronic and Rock have the least. 
Word count by year - We can see that the average song length has decreased over the years; it was at its peak between 1995 and 2000. 

# Clustering the Data

I have grouped rock songs together and compared them with an arbitrary reference corpus, the Brown corpus. To make a comparison between the two corpora, I define an arbitrary measure of "Rockness", analogous to the "Metalness" measure given here: 
http://www.degeneratestate.org/posts/2016/Apr/20/heavy-metal-and-natural-language-processing-part-1/. 
Finally, I have plotted rockness vs token length for the top 1000 "rock words". I have also plotted the frequency of these top words in rock songs vs their frequency in non-rock songs, to see whether they are specifically "rock words" or just common words in songs. These plots are connected through brushing. 

In [None]:
from collections import defaultdict
from nltk.corpus import stopwords

## Token frequencies for rock songs and for the Brown corpus (the reference English text).

stop_words = set(stopwords.words('english'))
rock = df[df["genre"] == "Rock"]
rock_token_freq = defaultdict(int)
tokens = rock['tokenized']
print(len(tokens))

for lyric in tokens:
    for raw in lyric.split():
        # Lowercase BEFORE filtering: the token is stored lowercased, so the
        # stop-word test has to see the same form, otherwise "The"/"And" slip through.
        w = raw.lower()
        if len(w) > 2 and w not in stop_words:
            rock_token_freq[w] += 1

# Brown corpus: every token is counted (no length or stop-word filter).
words = [w.lower() for w in nltk.corpus.brown.words()]
eng_token_freq = defaultdict(int)
print(len(words))
for w in words:
    eng_token_freq[w] += 1

In [None]:
# Calculating the rockness score:
#     rockness(w) = log( (rock count of w / total rock tokens)
#                        / (Brown count of w / total Brown tokens) )
# The ratios rely on true division, which is the default on Python 3.
from math import log
import operator

# Totals over every counted token, taken before the rare-word filter below.
n_rock = sum(rock_token_freq.values())
n_all_eng = sum(eng_token_freq.values())

# Only words seen more than 5 times in rock lyrics are scored. The filtered
# view gets its own name so rock_token_freq is not overwritten: re-running
# this cell then still computes the same n_rock.
frequent_rock_freq = {k: v for k, v in rock_token_freq.items() if v > 5}

rockness = {}
for w, rock_count in frequent_rock_freq.items():
    # .get() instead of [] so that looking up a word absent from the Brown
    # corpus does not insert a zero entry into the defaultdict.
    eng_count = eng_token_freq.get(w, 0)
    if eng_count > 0:
        rockness[w] = log((rock_count / n_rock) / (eng_count / n_all_eng))

# Highest rockness first.
rockness_sorted = sorted(rockness.items(), key=operator.itemgetter(1), reverse=True)

rockness_score = []
length = []
word = []

toprockwords = rockness_sorted[:1000]
# Walk the ranked list itself so the three parallel lists stay in rank order.
for key, value in toprockwords:
    if len(key) > 2:
        rockness_score.append(value)
        length.append(len(key))
        word.append(key)

In [None]:
# Token frequencies for every song that is not tagged "Rock".

nonrock = df[df["genre"] != "Rock"]
nonrock_token_freq = defaultdict(int)
tokens = nonrock['tokenized']
print(len(tokens))
for lyric in tokens:
    for raw in lyric.split():
        # Lowercase before the stop-word test so capitalized stop words are dropped too.
        w = raw.lower()
        if len(w) > 2 and w not in stop_words:
            nonrock_token_freq[w] += 1

n_nonrock = sum(nonrock_token_freq.values())

# Rock vs non-rock: log relative frequency of each top rock word.
# One entry is produced for EVERY top word, in the order of `toprockwords`, so
# these lists line up with the rockness_score / length / word lists that share
# a data source with them in the plot. A word that never occurs in non-rock
# lyrics gets NaN rather than being skipped.
common = [w for w, _score in toprockwords]
rock_freq = []
nonrock_freq = []
common_w = []   # the top rock words that also occur in non-rock lyrics

for w in common:
    nonrock_count = nonrock_token_freq.get(w, 0)
    if nonrock_count:
        common_w.append(w)
    rock_freq.append(np.log(rock_token_freq[w] / n_rock))
    nonrock_freq.append(np.log(nonrock_count / n_nonrock) if nonrock_count else np.nan)

In [None]:
# Rebuild the rock / non-rock log-frequency lists for the top rock words.
# The lists are reset first so that running this cell never appends onto
# values that are already there (which would double their length relative to
# the other plot columns). Every word in `common` yields exactly one entry;
# NaN marks a word that does not occur in non-rock lyrics.
common_w = []
rock_freq = []
nonrock_freq = []
for w in common:
    nonrock_count = nonrock_token_freq.get(w, 0)
    if nonrock_count:
        common_w.append(w)
    rock_freq.append(np.log(rock_token_freq[w] / n_rock))
    nonrock_freq.append(np.log(nonrock_count / n_nonrock) if nonrock_count else np.nan)

In [None]:
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
import numpy as np
from bokeh.layouts import gridplot

# One colour per plotted word: N follows the data instead of assuming exactly
# 1000 words, so the colour column always has the same length as rockness_score.
N = len(rockness_score)

# Local, seeded generator: the (purely decorative) colours are the same on every run.
rng = np.random.RandomState(0)
xc = rng.random_sample(size=N) * 100
yc = rng.random_sample(size=N) * 100
# "%02x" requires integers on Python 3, so the floored floats are cast to int.
colors = ["#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(np.floor(50+2*xc), np.floor(30+2*yc))]


TOOLS = "box_select,lasso_select,help"

# A single shared source drives both figures, which is what links the
# selections (brushing) between them.
source = ColumnDataSource(data=dict(
    rockness_score=rockness_score,
    length=length,
    vocab=word,
    rock_freq=rock_freq,
    nonrock_freq=nonrock_freq,
    colors=colors
))

# Each figure gets its own hover tool instance.
hover = HoverTool(tooltips=[
    ("(x,y)", "($x, $y)"),
    ("word", "@vocab"),
])

hover2 = HoverTool(tooltips=[
    ("(x,y)", "($x, $y)"),
    ("word", "@vocab"),
])

p = figure(tools=[TOOLS,hover])
p.scatter('rockness_score', 'length',size=10,color='colors',source=source)
p.xaxis.axis_label = 'rockness score'
p.yaxis.axis_label = 'length of the word'
p.title.text = "Rockness Vs Length"

q = figure(tools=[TOOLS,hover2])
q.scatter('rock_freq', 'nonrock_freq',size=10,color='colors',source=source)
q.xaxis.axis_label = 'frequency in rock songs'
q.yaxis.axis_label = 'frequency in other songs'
q.title.text = "Rock Frequency vs NonRock Frequency For Top Rock Words"

s = gridplot([[p,q]])
show(s)

## Observation - <br>

The relationship between the frequency of rock words in rock songs and in non-rock songs is linear and directly proportional: words classified as rock words have almost the same frequency in non-rock songs as well. Also, we can see that there is not much of a relationship between the length of a word and its frequency of occurrence. 
Also, the rockness of a word decreases as the length of the word increases. 

## Clustering By Artist

I also clustered the top 100 artists based on their lyrics (concatenated their lyrics and then calculated a vector for each artist using tf-idf).

In [None]:
# Rank artists by how many songs they have and keep the n with the most.
n = 100
top_artist = (
    df.groupby('artist')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head(n)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS


def combine_vectors(vectors):
    """Sum `vectors` element-wise and scale the result to unit length.

    The original body called a `normalise` helper that is not defined anywhere
    in this notebook, so any call raised NameError. The scaling is done inline
    here with the Euclidean (L2) norm.
    NOTE(review): L2 is assumed to be what `normalise` was meant to do - confirm.

    Parameters
    ----------
    vectors : sequence of equal-length 1-D arrays (or lists) of numbers

    Returns
    -------
    numpy.ndarray
        The summed vector divided by its L2 norm, or the (all-zero) sum itself
        when the norm is 0, to avoid dividing by zero.
    """
    total = np.sum(vectors, axis=0)
    norm = np.linalg.norm(total)
    return total / norm if norm else total

# Build one "document" per top artist and vectorize it with TF-IDF.
top_artist_songs = df[df.artist.isin(top_artist.artist.values)]
# Join an artist's songs with a space. Aggregating with 'sum' concatenates the
# strings directly, gluing the last token of one song to the first token of the next.
artist = top_artist_songs.groupby('artist')['tokenized'].agg(' '.join).reset_index()

vectorizer = TfidfVectorizer(
    # 'english' selects scikit-learn's built-in English stop-word list;
    # recent releases reject a frozenset passed for this parameter.
    stop_words='english',
    lowercase=True,
    max_df=0.7,     # ignore terms that appear in more than 70% of the artists
    min_df=0.001,   # as a proportion of documents
    norm=None       # keep raw tf-idf weights (hence "unnormalised")
)

# One dense row vector per artist, stored as a list of 1-D arrays.
artist["vectors_unnormalised"] = list(vectorizer.fit_transform(artist.tokenized.values).toarray())

In [None]:
from sklearn import neighbors, datasets
from sklearn.neighbors import NearestNeighbors
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import PCA

# Palette indexed by cluster label (6 entries available, 4 clusters used below).
colors = np.array(['#00f', '#0f0', '#f00', '#0ff', '#f0f', '#ff0'])
X = artist["vectors_unnormalised"].tolist()
# 2-D projection, used only for plotting. random_state is pinned so that a
# randomized SVD solver, if selected, gives the same projection on every run.
pca = PCA(n_components=2, random_state=0).fit(X)
data2D = pca.transform(X)
# Clustering runs on the full TF-IDF vectors, not on the 2-D projection.
# random_state makes the cluster labels (and so the point colours) reproducible;
# n_init is given explicitly because its default differs between scikit-learn versions.
kmeans = cluster.KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(X)

In [None]:
# Artist labels in the same row order as X / data2D. Those come from the
# per-artist `artist` frame (grouped by artist name), whereas top_artist is
# ordered by song count, so its names would not line up with the points.
names = artist["artist"].values

In [None]:
# Scatter of the artists in PCA space, coloured by k-means cluster.
plot = figure(width=550, height=550, title='Clustering By Artist')
# Only what the glyph and the tooltip need goes into the source: the full
# dense TF-IDF vectors (X) were previously included as an unused column and
# serialized along with the plot.
source = ColumnDataSource(data=dict(
    artist=artist["artist"].tolist(),
    x=data2D[:, 0],
    y=data2D[:, 1],
    color=colors[kmeans.labels_].tolist(),
))
plot.scatter('x', 'y', color="color", source=source)
# Hovering a point reveals which artist it is.
plot.add_tools(HoverTool(tooltips=[("artist", "@artist")]))
show(plot)