In [25]:
import pandas as pd
from regex import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Genre names that select the input file; they are joined with "_" inside the file name.
genres = ['rap', 'rock', 'pop']
# Size tag embedded in the file name.
# NOTE(review): presumably a cap on the number of songs — confirm against the script that produced the CSV.
limit = '10k'
# With the values above this reads data/song_lyrics_reduced_rap_rock_pop_10k.csv,
# resolved relative to the current working directory.
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

In [3]:
# One entry per row of the CSV: the 'lyrics' column as a pandas Series.
# The preprocessing functions below only iterate over it, so a Series works where list[str] is annotated.
documents = df['lyrics']

In [4]:
def remove_notes(docs: list[str]) -> list[str]:
    '''Strip single-line bracketed notes such as [Chorus:] or [Kanye West Sings].

    A note is an opening bracket, one or more characters that are neither a
    bracket nor a newline, and a closing bracket. Empty brackets and brackets
    spanning several lines are left in place.
    '''
    bracket_note = r'\[[^\n\[\]]+\]'
    return [re.sub(bracket_note, '', text) for text in docs]

In [5]:
def remove_punctuation(docs: list[str]) -> list[str]:
    '''Collapse blank lines and blank out punctuation in every document.

    Runs of newlines are first reduced to a single newline. Then every
    character that is not a word character (letter, digit or underscore),
    an apostrophe, a space or a newline is replaced by one space. Dashes are
    therefore replaced as well, while apostrophes are kept.
    '''
    unwanted = r"[^\w' \n]"
    return [
        re.sub(unwanted, ' ', re.sub(r'\n+', '\n', text))
        for text in docs
    ]

In [6]:
def normalize(docs: list[str]) -> list[str]:
    '''Return a new list holding the lowercased version of each document.'''
    lowered = []
    for text in docs:
        lowered.append(text.lower())
    return lowered

In [7]:
def tokenize(docs: list[str]) -> list[list[str]]:
    '''Split every document into a list of tokens with nltk's word_tokenize.'''
    return [word_tokenize(text) for text in docs]

In [8]:
def remove_stopwords(docs: list[list[str]]) -> list[list[str]]:
    '''Drop every token that appears in nltk's English stopword list.

    Args:
        docs: Tokenized documents. Tokens are compared exactly as given, so
            they should already be lowercased to match the lowercase list.

    Returns:
        A new list of documents with the stopwords removed; token order
        within each document is preserved.
    '''
    # A frozenset gives constant-time membership tests; scanning the raw list
    # for every token of every document is needlessly slow on a large corpus.
    stop_words = frozenset(stopwords.words('english'))
    return [[word for word in doc if word not in stop_words] for doc in docs]

In [9]:
def lemmatize(docs: list[list[str]]) -> list[list[str]]:
    '''Lemmatize every token of every document using nltk's WordNetLemmatizer.'''
    to_lemma = WordNetLemmatizer().lemmatize
    return [[to_lemma(token) for token in tokens] for tokens in docs]

In [10]:
def stem(docs: list[list[str]]) -> list[list[str]]:
    '''Stem every token of every document using nltk's PorterStemmer.'''
    to_stem = PorterStemmer().stem
    return [[to_stem(token) for token in tokens] for tokens in docs]

In [11]:
def vocabulary(docs: list[list[str]]) -> tuple[list[list[str]], list[str]]:
    '''Deduplicate each document and collect the corpus-wide vocabulary.

    Returns a pair: the documents with repeated words removed, and a list of
    every distinct word seen in any document. Both are built from sets, so
    the order of words inside them is arbitrary.
    '''
    seen = set()
    per_doc = []
    for tokens in docs:
        seen.update(tokens)
        per_doc.append(list(set(tokens)))
    return per_doc, list(seen)

In [31]:
def write_out(data, columns, location):
    '''Write tabular data to output/<location>.csv.

    Args:
        data: Rows to write; anything pd.DataFrame accepts as its data.
        columns: Column labels for the rows.
        location: File name without extension, placed under output/.

    The DataFrame index is written too (pandas' default for to_csv). The
    target directory is created when missing so the call also works on a
    fresh checkout.
    '''
    import os

    frame = pd.DataFrame(data, columns=columns)
    file_name = f'output/{location}.csv'
    # to_csv does not create parent directories and raises OSError without them.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    frame.to_csv(file_name)
    print(f'Wrote data to {file_name}')

In [32]:
def remove_rare_words(docs: list[list[str]], limit=1) -> list[list[str]]:
    '''Remove words that appear in `limit` or fewer documents.

    Frequencies are document frequencies: each document is deduplicated
    first, so a word counts once per document however often it is repeated
    there.

    Args:
        docs: Tokenized documents.
        limit: Absolute document count (not a proportion). Words whose
            document frequency is less than or equal to it are removed.

    Returns:
        The documents with rare words removed; order and repetitions of the
        remaining words are preserved.

    Side effects:
        Prints summary statistics and the removed words, and writes every
        word with its document frequency and kept/removed flag to
        output/word_frequencies.csv via write_out. The printed
        "Total words" is the sum of distinct words per document.
    '''
    unique_docs, vocab = vocabulary(docs)
    all_words = []
    for doc in unique_docs:
        all_words.extend(doc)
    size = len(all_words)
    distribution = FreqDist(all_words)
    # Exact integer counts, most frequent first. The relative frequency must
    # not be multiplied back by the total: that float round trip can land just
    # below the true count, and int() would then truncate it by one.
    ranked = distribution.most_common()

    print(f'Total words: {size}')
    print(f'Unique words: {len(vocab)}')
    print('Most frequent:')
    print('; '.join([f'{word}: {count}' for word, count in ranked[:10]]))
    print('Least frequent:')
    print(' - '.join([f'{word}: {count}' for word, count in ranked[-10:]]))

    # Decide once per distinct word; afterwards filtering is a set lookup.
    kept_words = {word for word, count in ranked if count > limit}
    result = [[word for word in doc if word in kept_words] for doc in docs]
    removed = set(vocab) - kept_words
    new_vocab = [word for word in vocab if word in kept_words]

    output = [[word, distribution[word], word in kept_words] for word in vocab]
    write_out(output, ['word', 'occurence', 'included'], 'word_frequencies')

    print(f'\nRemoved all words occuring {limit} or less times')
    print(f'Reduced vocab from {len(vocab)} to {len(new_vocab)} words')
    # Sorted so the listing is stable between runs.
    print(f'Removed: {sorted(removed)}')
    return result

In [28]:
# Text-level cleanup: bracket notes, then punctuation, then lowercasing.
noteless = remove_notes(documents)
just_words = remove_punctuation(noteless)
normalized = normalize(just_words)

# Token-level processing.
tokens = tokenize(normalized)
no_stopwords = remove_stopwords(tokens)

# Lemmatizing and stemming are two alternative branches that both start
# from the same stopword-free tokens.
lemmatized = lemmatize(no_stopwords)
stemmed = stem(no_stopwords)

# Preview the first five documents of each branch, separated by a blank line.
for document in lemmatized[:5]:
    print(*document)
print()
for document in stemmed[:5]:
    print(*document)

killa cam killa cam cam killa cam killa cam killa cam cam killa cam killa cam cam killa killa killa cam killa cam cam killa killa killa cam killa cam cam base loaded killa cam killa cam uh huh killa cam cam santana second jim third killa cam killa cam cam 'm bat killa killa killa cam killa cam cam killa 'm 'bout hit shit world killa cam ugh heatmakerz killa cam cam killa cam killa cam killa cam cam hahahaha killa cam killa cam cam killa killa killa cam killa cam cam killa make shit clap killa cam killa cam cam killa cam killa cam killa cam cam killa cam killa cam cam killa killa killa cam killa killa killa cam cam killa goon spar stay tune like damn realest since 'kumbaya ' bomaye killa cam lord lord still man pan scrilla fam board bitch want neuter nigga want tutor hooligan houlihan 's maneuvering 's nothing new doggy 'm land grind pan pan gram dime toe mc say hammer time beef hammer mine get hand nine 'bama line corduroy cam 'll shine canary burgundy call lemon red red yellow diamond

In [33]:
# Lemmatized branch: drop every word that appears in 3 or fewer documents.
no_rare_words = remove_rare_words(lemmatized, limit=3)

Total words: 3378506
Unique words: 77441
Most frequent:
's: 24010; n't: 23146; 'm: 18776; like: 18537; know: 18366; got: 16202; get: 15708; go: 14103; see: 13857; one: 13620
Least frequent:
clogg: 1 - starlt: 1 - uhoh: 1 - widlin: 1 - tamaki: 1 - elric: 1 - yagami: 1 - lifestream: 1 - yaoi: 1 - iamsleepless: 1
Wrote data to output/word_frequencies.csv

Removed all words occuring 3 or less times
Reduced vocab from 77441 to 27305 words


In [23]:
# Stemmed branch: drop every word that appears in 3 or fewer documents.
# NOTE(review): this rebinds `no_rare_words`, which the lemmatized run also assigns;
# whichever cell runs last wins, and both runs write output/word_frequencies.csv.
# Consider distinct names and output locations per branch.
no_rare_words = remove_rare_words(stemmed, limit=3)

Total words: 3329460
Unique words: 61144
Most frequent:
's: 24010; n't: 23146; 'm: 18776; like: 18603; know: 18514; get: 16429; got: 16221; go: 15013; see: 14062; one: 13623
Least frequent:
clogg: 1 - starlt: 1 - uhoh: 1 - widlin: 1 - tamaki: 1 - elric: 1 - yagami: 1 - lifestream: 1 - yaoi: 1 - iamsleepless: 1

Removed all words occuring 3 or less times
Reduced vocab from 61144 to 21856 words
Removed: ['dreampt', 'thoedest', 'kinkel', 'cillzan', 'slamin', 'nicen', 'yakidi', 'anbodi', 'alev', 'childcar', 'prowlerss', 'ronno', 'ogash', 'alfi', 'sint', "'tomorrow", 'goddddd', 'goriest', 'creedon', 'mauv', 'foxglov', 'cask', 'photog', 'henessi', 'montecristo', 'tininim', 'hooous', 'sideview', 'housefli', "'engag", 'bahamian', "'tainment", 'humala', 'channelin', 'machetti', 'biaaaatch', 'miramax', 'zaw', 'euclidean', 'aggreg', 'snibunni', 'bloodsuckin', 'oyl', 'brinng', "we'lll", 't9', 'vonc', 'gentler', 'fairw', 'dickridin', 'ソウル間は約二時間', 'finici', 'waterlow', 'numaish', 'capper', 'mizzix',