In [177]:
import numpy as np
import pandas as pd
import json
import re
import nltk
nltk.download('stopwords') #make sure list up to date
from nltk.corpus import wordnet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\magar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [141]:
#read in scraped data
#JSON is UTF-8 by specification, so decode it explicitly rather than relying
#on the platform default (cp1252 on Windows), which can raise or garble any
#non-ASCII characters in artist names and lyrics
with open('charts_and_lyrics.json', encoding='utf-8') as json_data:
    inputData = json.load(json_data)

In [194]:
def convertPOSTag(treebank_tag):
    """Map a Penn Treebank POS tag onto the WordNet tag the lemmatiser accepts.

    nltk.pos_tag() labels words with Penn Treebank tags
    (https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
    whereas the WordNet lemmatiser only understands four word types: noun,
    verb, adjective and adverb.
    Adapted from
    https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

    treebank_tag: Treebank tag string, eg 'NN', 'VBD', 'JJR'.

    Returns wordnet.ADJ ('a'), wordnet.VERB ('v'), wordnet.ADV ('r') or
    wordnet.NOUN ('n').
    """
    #only the first letter of the treebank tag matters for the conversion
    prefixToWordType = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordType in prefixToWordType:
        if treebank_tag.startswith(prefix):
            return wordType

    #everything else, real nouns included, is treated as a noun. Noun is the
    #lemmatiser's default so this has the same effect as passing no tag at all.
    return wordnet.NOUN

In [201]:
stopwords = nltk.corpus.stopwords.words('english') #are lowercase
stopwordSet = frozenset(stopwords) #hashed lookup instead of scanning the list for every word
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmaCounts(lyrics, chartName, counts=None):
    """Add the lemmas found in one song's lyrics to a running tally.

    lyrics: the song's lyrics as a string, or a falsy value (eg False) when
        no lyrics were found, in which case nothing is counted.
    chartName: name of the chart the song came from. Not used at the moment;
        kept so existing calls keep working.
    counts: optional dict to update. When omitted the notebook-wide
        allChartsLemmaCounts dict is used, so that dict must exist by the
        time the function is called.

    The tally is keyed by (lemma, wordType) tuples and each value has the form
    {'count': int, 'sourceWords': [the distinct words that reduced to the lemma]}.
    The dict is updated in place and nothing is returned.
    """
    #make sure lyrics were found for this song
    if not lyrics:
        return

    #looked up at call time, so the cell that creates the global tally only
    #has to run before the first call, not before this definition
    if counts is None:
        counts = allChartsLemmaCounts

    #strip "[Chorus]", "[Verse 1]" etc tags
    #non-greedy so a line holding two tags, eg "[Hook] la la [x2]", only loses
    #the tags themselves and not the words sitting between them
    lyrics = re.sub(r'\[.*?\]', '', lyrics)

    #strip any punctuation
    lyrics = re.sub(r'\W', ' ', lyrics)

    #replace any multiple consecutive spaces with just one space
    lyrics = re.sub(r'\s+', ' ', lyrics)

    #using .lower both for uniformity and cause stopwords list
    #is all lowercase
    words = nltk.word_tokenize(lyrics.lower())

    #gives list of tuples in form [(car, NN), (run, VB)... etc]
    word_tags = nltk.pos_tag(words)

    #add words to the tally
    for word, treebankTag in word_tags:
        if word in stopwordSet:
            continue

        tag = convertPOSTag(treebankTag)
        lemma = lemmatizer.lemmatize(word, tag)

        #first sighting of a (lemma, tag) pair creates its entry
        entry = counts.setdefault((lemma, tag), {'count': 0, 'sourceWords': []})
        entry['count'] += 1
        if word not in entry['sourceWords']:
            entry['sourceWords'].append(word)

    return

In [202]:
#start from an empty tally every time this cell runs so that re-running it
#doesn't count the same lyrics twice
allChartsLemmaCounts = dict()

#feed every song on every chart through the lemma counter
for chart in inputData:
    for song in chart['entries']:
        lemmaCounts(lyrics=song['lyrics'], chartName=chart['name'])

In [204]:
#one row per (lemma, wordType) key, most frequent first.
#the tuple keys start out as a two-level index; reset_index moves them into
#ordinary columns in front of 'count' and 'sourceWords'
allChartsLemmaCountsDf = (
    pd.DataFrame.from_dict(allChartsLemmaCounts, orient='index')
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)

#give the two former index levels proper names; the last two are unchanged
allChartsLemmaCountsDf.columns = ['lemma', 'wordType', 'count', 'sourceWords']

allChartsLemmaCountsDf

Unnamed: 0,lemma,wordType,count,sourceWords
0,get,v,2438,"[get, got, getting, gets, gotten]"
1,know,v,1750,"[know, knew, knows, known, knowing]"
2,like,n,1651,"[like, likes]"
3,yeah,n,1310,[yeah]
4,go,v,1070,"[go, going, went, gone, goes]"
5,make,v,938,"[make, made, making, makes]"
6,na,n,932,[na]
7,say,v,914,"[say, said, says, saying]"
8,come,v,899,"[come, came, coming, comes]"
9,take,v,840,"[take, took, taking, takes, taken]"


What's the longest list of words that was combined by the lemmatisation?

In [121]:
#length of the longest sourceWords list, ie the most words merged into one lemma
max(map(len, allChartsLemmaCountsDf['sourceWords']))

2

It looks like the lemmatisation is mainly just taking the 's' off the end of some words, e.g. 'likes' and 'like'. In fact, when I check, the longest sourceWords list is only 2 long, so not many words are being combined.

Even just looking at the top 10, 'got' is 5th and 'get' is 8th. They should be combined as they're the past and present tense of the one verb 'get'.

Looking into this I've found that this is because the lemmatiser doesn't have the part-of-speech tags for the words so it's assuming nearly everything is a noun.