In [35]:
import pandas as pd
from regex import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
def count_words(documents, details=''):
    total_words = []
    for document in documents:
        if isinstance(document, str):
            total_words += word_tokenize(document)
        else:
            total_words += document
    total_length = len(total_words)
    unique_words = len(set(total_words))
    print(f'Total words: {total_length} ({unique_words} unique) [{details}]')

In [37]:
def remove_notes(docs: list[str]) -> list[str]:
    result = []
    # RegEx pattern to target all notes in brackets, like [Chorus:] or [Kanye West Sings]
    pattern = r'\[[^\n\[\]]+\]'
    for doc in docs:
        # Remove all bracket notes
        new_doc = re.sub(pattern, '', doc)
        result.append(new_doc)
    return result

In [38]:
def remove_punctuation(docs: list[str]) -> list[str]:
    '''Remove useless punctuation from all documents'''
    result = []
    # RegEx pattern to target all characters that are NOT:
    # a letter, underscore, dash, space, or newline
    pattern = r"[^\w' \n\-\_]"
    for doc in docs:
        # Reduce repeated newlines to singular newlines
        new_doc = re.sub(r'\n+', ' ', doc)
        # Remove punctuation 
        new_doc = re.sub(pattern, ' ', new_doc)
        result.append(new_doc)
    return result

In [39]:
def normalize(docs: list[str]) -> list[str]:
    '''Simply lowercase all words'''
    return [doc.lower() for doc in docs]

In [40]:
def tokenize(docs: list[str]) -> list[list[str]]:
    '''Tokenize documents using nltk'''
    result = []
    for doc in docs:
        new_doc = word_tokenize(doc)
        result.append(new_doc)
    return result

In [41]:
def remove_stopwords(docs: list[list[str]]) -> list[list[str]]:
    '''Remove stopwords in documents using nltk english stopwords'''
    stop_words = stopwords.words('english')
    result = []
    for doc in docs:
        new_doc = [word for word in doc if word not in stop_words]
        result.append(new_doc)
    return result

In [42]:
def lemmatize(docs: list[list[str]]) -> list[list[str]]:
    '''Lemmatize documents usting nltk WordNetLemmatizer'''
    lemmatizer = WordNetLemmatizer()
    result = []
    for doc in docs:
        new_doc = [lemmatizer.lemmatize(word) for word in doc]
        result.append(new_doc)
    return result

In [43]:
def stem(docs: list[list[str]]) -> list[list[str]]:
    '''Stem documents usting nltk PorterStemmer'''
    stemmer = PorterStemmer()
    result = []
    for doc in docs:
        new_doc = [stemmer.stem(word) for word in doc]
        result.append(new_doc)
    return result

In [44]:
def vocabulary(docs: list[list[str]]) -> tuple[list[list[str]], list[str]]:
    '''Return unique words per document, and a total vocabulary'''
    vocab = set()
    result = []
    for doc in docs:
        for word in doc:
            vocab.add(word)
        result.append(list(set(doc)))
    return result, list(vocab)

In [45]:
def write_out(data, columns, location):
    df = pd.DataFrame(data, columns=columns)
    file_name = f'output/{location}.csv'
    df.to_csv(file_name)
    print(f'Wrote data to {file_name}')

In [46]:
def remove_rare_words(docs: list[list[str]], limit=1) -> list[list[str]]:
    '''Remove words that occur in less than a given proportion of documents'''
    unique_docs, vocab = vocabulary(docs)
    all_words = []
    for doc in unique_docs:
        all_words += doc
    size = len(all_words)
    distribution = FreqDist(all_words)
    distribution_words = list(distribution)

    print(f'Total words: {size}')
    print(f'Unique words: {len(vocab)}')
    print('Most frequent:')
    print('; '.join([f'{word}: {int(distribution.freq(word) * size)}' for word in distribution_words[:10]]))
    print('Least frequent:')
    print(' - '.join([f'{word}: {int(distribution.freq(word) * size)}' for word in distribution_words[-10:]]))

    result = []
    removed = []
    for doc in docs:
        new_doc = [word for word in doc if int(distribution.freq(word) * size) > limit]
        removed += [word for word in doc if int(distribution.freq(word) * size) <= limit]
        result.append(new_doc)
    new_vocab = [word for word in vocab if int(distribution.freq(word) * size) > limit]

    output = []
    for word in vocab:
        output.append([word, int(distribution.freq(word) * size), int(distribution.freq(word) * size) > limit])
    write_out(output, ['word', 'occurence', 'included'], 'word_frequencies')

    print(f'\nRemoved all words occuring {limit} or less times')
    print(f'Reduced vocab from {len(vocab)} to {len(new_vocab)} words')
    print(f'Removed: {(list(set(removed)))}')
    return result

In [47]:
genres = ['rap', 'rock', 'pop']
limit = '1000'
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = df['lyrics']

In [48]:
count_words(documents, 'normal')
noteless = remove_notes(documents)
count_words(noteless, 'noteless')
just_words = remove_punctuation(noteless)
count_words(just_words, 'just_words')
normalized = normalize(just_words)
count_words(normalized, 'normalized')
tokens = tokenize(normalized)
count_words(tokens, 'tokenized')
no_stopwords = remove_stopwords(tokens)
count_words(no_stopwords, 'no_stopwords')
lemmatized = lemmatize(no_stopwords)
count_words(lemmatized, 'lemmatized')
stemmed = stem(no_stopwords)
count_words(stemmed, 'stemmed')
for document in lemmatized[:5]:
    print(' '.join(document))
print()
for document in stemmed[:5]:
    print(' '.join(document))

Total words: 1731972 (45288 unique) [normal]
Total words: 1627338 (44593 unique) [noteless]
Total words: 1447062 (43494 unique) [just_words]
Total words: 1447062 (35121 unique) [normalized]
Total words: 1447062 (35121 unique) [tokenized]
Total words: 793363 (34972 unique) [no_stopwords]
Total words: 793363 (31279 unique) [lemmatized]
Total words: 793363 (26016 unique) [stemmed]
killa cam killa cam cam killa cam killa cam killa cam cam killa cam killa cam cam killa killa killa cam killa cam cam killa killa killa cam killa cam cam base loaded killa cam killa cam uh-huh killa cam cam santana second jim third killa cam killa cam cam 'm bat killa killa killa cam killa cam cam killa 'm 'bout hit shit world killa cam ugh heatmakerz killa cam cam killa cam killa cam killa cam cam hahahaha killa cam killa cam cam killa killa killa cam killa cam cam killa make shit clap killa cam killa cam cam killa cam killa cam killa cam cam killa cam killa cam cam killa killa killa cam killa killa killa cam c

In [49]:
pd.DataFrame(noteless)
pd.DataFrame(just_words)

Unnamed: 0,0
0,Killa Cam Killa Cam Cam Killa Cam Killa Ca...
1,Yeah hah yeah Roc-A-Fella We invite you to...
2,Maybe cause I'm eatin And these bastards fiend...
3,Ugh Killa Baby Kanye this that 1970s Hero...
4,So they ask me Young boy What you gon' do th...
...,...
2995,Turn up the music 'cause this song just came ...
2996,O say can you see by the dawn's early light Wh...
2997,I seem to find myself talking to the powers t...
2998,Hey hey hey hey Hey hey hey hey Me plus...


In [50]:
no_rare_words = remove_rare_words(lemmatized, limit=3)

Total words: 387687
Unique words: 31279
Most frequent:
's: 2610; n't: 2431; 'm: 2089; like: 2055; know: 1982; got: 1894; get: 1738; ': 1570; go: 1565; see: 1503
Least frequent:
deprivation: 1 - si-li-cone: 1 - essex: 1 - mortified: 1 - d-e-v: 1 - elapsing: 1 - overworking: 1 - correcting: 1 - villans: 1 - humbles: 1
Wrote data to output/word_frequencies.csv

Removed all words occuring 3 or less times
Reduced vocab from 31279 to 9072 words
Removed: ['promoter', 'amare', 'coolio', 'pussy-whipped', 'globetrotter', 'wheelies', 'loin', '2027', 'diamond-cream', 'ryo', 'loading', 'clef', 'chatterbox', 'poser', 'enlightenment', 'naggos', 'arguin', 'droved', 'rebuttal', 'penn', 'gps', 'mater', 'twenty-eight', "'stand", 'a-can', 'recycle', 'supersedes', 'nomar', 'spleen', 'timberland-ed', 'blamin', 'jawa', 'upping', 'miwacles', 'doth', 'downpour', '-alec', 'toothpick', 'inflicts', 'diehard', 'deplaning', 'crazily', 'russia', 'choppable', 'nonreligious', 'dabbled', 'beacon', 'flappin', 'irregardl

In [51]:
no_rare_words = remove_rare_words(stemmed, limit=3)

Total words: 381942
Unique words: 26016
Most frequent:
's: 2610; n't: 2431; 'm: 2089; like: 2058; know: 1995; got: 1894; get: 1809; go: 1609; ': 1571; see: 1527
Least frequent:
vauntingli: 1 - heav'n: 1 - zed: 1 - scroog: 1 - essex: 1 - mortifi: 1 - elaps: 1 - d-e-v: 1 - villan: 1 - si-li-con: 1
Wrote data to output/word_frequencies.csv

Removed all words occuring 3 or less times
Reduced vocab from 26016 to 8040 words
Removed: ['d-twizzi', 'disabl', 'capitol', 'aloha', 'jew-el', 'kazakhstan', 'prix', '5-o', 'fourhead', 'hardtim', 'afghanistan', 'shin', 'coolio', 'downright', 'pillowcas', 'fictiti', 'forelock', 'reno', 'distress', 'bird-crazi', 'gymnast', 'auditionin', 'neo-soul', 'theodor', 'nine-f', 'loin', '2027', 'lexo', 'deprav', 'donat', 'wheezin', 'granni', 'self-paid', 'flask', '500sl', 'devoid', 'diamond-cream', 'klack', 'ryo', 'recordin', 'clef', 'documentari', 'mae', 'hip-hous', 'sitch', 'g-g-g', 'yukmouth', '2-0', 'façad', 'chatterbox', 'cross-ey', 'saunter', 'law-abid', 'tr