In [277]:
import numpy as np
import pandas as pd
import json
import re
import nltk
nltk.download('stopwords') #make sure list up to date
from nltk.corpus import wordnet
from IPython.display import clear_output


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\magar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [278]:
#load the scraped chart entries (one row per charted song, with its lyrics)
#NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which looks
#like an index saved into the CSV - confirm before switching to index_col=0
chartsCsvPath = 'charts_and_lyrics_2017.csv'
chartsDf = pd.read_csv(chartsCsvPath, encoding='utf-8')

chartsDf

Unnamed: 0,chart,chartURL,rank,song,artist,lyrics
0,Overall,https://www.billboard.com/charts/year-end//201...,1,Shape Of You,Ed Sheeran,[Verse 1]\nThe club isn't the best place to fi...
1,Overall,https://www.billboard.com/charts/year-end//201...,2,Despacito,Luis Fonsi & Daddy Yankee Featuring Justin Bieber,"[Intro: Luis Fonsi & Daddy Yankee]\nAy, ¡Fonsi..."
2,Overall,https://www.billboard.com/charts/year-end//201...,3,That's What I Like,Bruno Mars,"[Verse 1]\nHey, hey, hey\nI got a condo in Man..."
3,Overall,https://www.billboard.com/charts/year-end//201...,4,Humble.,Kendrick Lamar,[Intro]\nNobody pray for me\nIt's been that da...
4,Overall,https://www.billboard.com/charts/year-end//201...,5,Something Just Like This,The Chainsmokers & Coldplay,[Verse 1: Chris Martin]\nI've been reading boo...
5,Overall,https://www.billboard.com/charts/year-end//201...,6,Bad And Boujee,Migos Featuring Lil Uzi Vert,"[Intro: Offset]\nYou know, young rich niggas\n..."
6,Overall,https://www.billboard.com/charts/year-end//201...,7,Closer,The Chainsmokers Featuring Halsey,"[Verse 1: Andrew Taggart]\nHey, I was doing ju..."
7,Overall,https://www.billboard.com/charts/year-end//201...,8,Body Like A Back Road,Sam Hunt,"[Verse 1]\nGot a girl from the south side, got..."
8,Overall,https://www.billboard.com/charts/year-end//201...,9,Believer,Imagine Dragons,[Verse 1]\nFirst things first\nI'ma say all th...
9,Overall,https://www.billboard.com/charts/year-end//201...,10,Congratulations,Post Malone Featuring Quavo,"[Intro: Post Malone]\nMm-mmm\nYah, yah\nMm-mmm..."


In [279]:
def lineSplitAndClean(lyrics):
    """Split raw lyrics into a list of cleaned, lowercase lines.

    lyrics: the full lyrics of one song as a single string with '\\n' between
            lines, or a null value (np.nan / None) when no lyrics were found.

    Returns a list of non-empty strings containing only word characters
    separated by single spaces (no leading/trailing space), or np.nan when
    lyrics is null or nothing is left after cleaning (e.g. an instrumental
    whose lyrics are just '[instrumental]').
    """
    #make sure lyrics were found for this song
    #lyrics will be np.nan if none were found
    if pd.isnull(lyrics):
        return np.nan

    #strip "[Chorus]", "[Verse 1]" etc tags. The match is non-greedy so a line
    #holding several tags ("I got the [?] and the [?]") only loses the tags
    #themselves, not the words sitting between the first '[' and the last ']'
    lyrics = re.sub(r'\[.*?\]', '', lyrics)

    lines = []
    for line in lyrics.split('\n'):
        #using .lower both for uniformity and cause stopwords list is all lowercase
        #then swap any punctuation for a space
        line = re.sub(r'\W', ' ', line.lower())

        #replace any multiple consecutive spaces with just one space and drop
        #the padding left behind at either end by stripped punctuation/tags
        line = re.sub(r'\s+', ' ', line).strip()

        #skip lines that were blank or held nothing but punctuation
        if line:
            lines.append(line)

    #possible for lines to be an empty list here like for instrumental
    #songs the starting lyrics will just be '[instrumental]'
    return lines if lines else np.nan

In [280]:
#split each song's lyrics into cleaned lines so that nltk.pos_tag gets one
#sentence-like line passed to it at a time rather than all the words at once.
#Should give better context for tagging.
chartsDf['lines'] = chartsDf['lyrics'].apply(lineSplitAndClean)

chartsDf

Unnamed: 0,chart,chartURL,rank,song,artist,lyrics,lines
0,Overall,https://www.billboard.com/charts/year-end//201...,1,Shape Of You,Ed Sheeran,[Verse 1]\nThe club isn't the best place to fi...,[the club isn t the best place to find a lover...
1,Overall,https://www.billboard.com/charts/year-end//201...,2,Despacito,Luis Fonsi & Daddy Yankee Featuring Justin Bieber,"[Intro: Luis Fonsi & Daddy Yankee]\nAy, ¡Fonsi...","[ay fonsi d y , ohhh oh no oh no oh, hey yeah ..."
2,Overall,https://www.billboard.com/charts/year-end//201...,3,That's What I Like,Bruno Mars,"[Verse 1]\nHey, hey, hey\nI got a condo in Man...","[hey hey hey, i got a condo in manhattan, baby..."
3,Overall,https://www.billboard.com/charts/year-end//201...,4,Humble.,Kendrick Lamar,[Intro]\nNobody pray for me\nIt's been that da...,"[nobody pray for me, it s been that day for me..."
4,Overall,https://www.billboard.com/charts/year-end//201...,5,Something Just Like This,The Chainsmokers & Coldplay,[Verse 1: Chris Martin]\nI've been reading boo...,"[i ve been reading books of old, the legends a..."
5,Overall,https://www.billboard.com/charts/year-end//201...,6,Bad And Boujee,Migos Featuring Lil Uzi Vert,"[Intro: Offset]\nYou know, young rich niggas\n...","[you know young rich niggas, you know so we ai..."
6,Overall,https://www.billboard.com/charts/year-end//201...,7,Closer,The Chainsmokers Featuring Halsey,"[Verse 1: Andrew Taggart]\nHey, I was doing ju...","[hey i was doing just fine before i met you, i..."
7,Overall,https://www.billboard.com/charts/year-end//201...,8,Body Like A Back Road,Sam Hunt,"[Verse 1]\nGot a girl from the south side, got...",[got a girl from the south side got braids in ...
8,Overall,https://www.billboard.com/charts/year-end//201...,9,Believer,Imagine Dragons,[Verse 1]\nFirst things first\nI'ma say all th...,"[first things first, i ma say all the words in..."
9,Overall,https://www.billboard.com/charts/year-end//201...,10,Congratulations,Post Malone Featuring Quavo,"[Intro: Post Malone]\nMm-mmm\nYah, yah\nMm-mmm...","[mm mmm, yah yah, mm mmm, yah hey , my momma c..."


In [281]:
def convertPOSTag(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    nltk.pos_tag() labels words with treebank tags
    (https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
    whereas the lemmatiser wants WordNet tags, and only accepts noun, verb,
    adjective and adverb.
    Adapted from https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    """
    #first letter of the treebank tag -> wordnet tag, checked in this order
    prefixToWordnetTag = (
        ('J', wordnet.ADJ),  #'a'
        ('V', wordnet.VERB), #'v'
        ('R', wordnet.ADV),  #'r'
    )

    for prefix, wordnetTag in prefixToWordnetTag:
        if treebank_tag.startswith(prefix):
            return wordnetTag

    #everything else, real nouns included, is treated as a noun. Noun is what
    #the lemmatiser assumes by default, so this is equivalent to passing no tag.
    return wordnet.NOUN #'n'

In [282]:
stopwords = nltk.corpus.stopwords.words('english') #are lowercase
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmaCounts(row, counts=None):
    """Add the lemma counts of one song's lyric lines to a running tally.

    row:    object exposing a `lines` attribute - a list of cleaned lyric
            lines, or a scalar null (np.nan / None) when no lyrics were found.
    counts: dict updated in place, keyed by (lemma, wordnetTag) with values of
            the form {'count': int, 'sourceWords': [str, ...]}. When omitted
            the module-level allChartsLemmaCounts dict is used, so that dict
            must be defined before the function is called without `counts`.

    Returns None; the result is the mutation of the tally dict.

    NOTE(review): relies on nltk.word_tokenize, nltk.pos_tag and the WordNet
    lemmatiser; presumably each needs its own NLTK data package to be
    installed - confirm on a fresh environment.
    """
    if counts is None:
        counts = allChartsLemmaCounts

    lines = row.lines

    #make sure lyrics were found for this song
    #lines will be a scalar null if none were found. Checking is_scalar first
    #keeps pd.isnull from being evaluated element-wise on a list of lines,
    #where taking its truth value raises a ValueError.
    if pd.api.types.is_scalar(lines) and pd.isnull(lines):
        return

    for line in lines:
        words = nltk.word_tokenize(line)

        #pos_tag gives list of tuples in form [(car, NN), (run, VB)... etc]
        for word, treebankTag in nltk.pos_tag(words):
            if word in stopwords:
                continue

            tag = convertPOSTag(treebankTag)
            lemmaAndTag = (lemmatizer.lemmatize(word, tag), tag)

            #add lemma to the tally, remembering which surface words produced it
            entry = counts.get(lemmaAndTag)
            if entry is None:
                counts[lemmaAndTag] = {'count': 1,
                                       'sourceWords': [word]}
            else:
                entry['count'] += 1
                if word not in entry['sourceWords']:
                    entry['sourceWords'].append(word)

    return

In [283]:
#start from an empty tally so re-running this cell never double counts
allChartsLemmaCounts = {}

#lemmaCounts is only called for its side effect on allChartsLemmaCounts, so
#loop over the rows explicitly instead of using DataFrame.apply: apply builds a
#throwaway Series of Nones and, in older pandas versions, may call the function
#twice on the first row, which would count the first song twice.
for _, chartRow in chartsDf.iterrows():
    lemmaCounts(chartRow)

In [284]:
#from_dict(orient='index') turns the (lemma, wordType) dict keys into the index
#and the inner dict keys ('count', 'sourceWords') into columns. Sort so the most
#frequent lemmas come first, then reset_index to move the keys out of the
#index into ordinary columns.
allChartsLemmaCountsDf = (
    pd.DataFrame.from_dict(allChartsLemmaCounts, orient='index')
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)

#give the former index levels and the value columns readable names
allChartsLemmaCountsDf.columns = ['lemma', 'wordType', 'count', 'sourceWords']

allChartsLemmaCountsDf

Unnamed: 0,lemma,wordType,count,sourceWords
0,get,v,2432,"[get, got, getting, gets, gotten]"
1,know,v,1743,"[know, knew, known, knows, knowing]"
2,like,n,1647,"[like, likes]"
3,yeah,n,1547,[yeah]
4,oh,n,1096,[oh]
5,go,v,1072,"[go, going, went, gone, goes]"
6,na,n,948,[na]
7,make,v,942,"[make, made, making, makes]"
8,say,v,913,"[say, said, says, saying]"
9,come,v,909,"[come, came, coming, comes]"
