In [1]:
import os
import re
import json
import tqdm
import spacy

from pathlib import Path
from sqlitedict import SqliteDict
from spacy_langdetect import LanguageDetector
from pytorch_pretrained_bert import BertTokenizer

In [2]:
# Collect the raw lyric JSON files in a stable (sorted) order
raw_lyrics_dir = Path('./rawLyrics/')
raw_lyrics_files = list(raw_lyrics_dir.glob('*.json'))
raw_lyrics_files.sort()
print(f'Found {len(raw_lyrics_files)} files')

# Directory that receives the cleaned text files; created on first run
clean_lyrics_dir = Path('./cleanLyrics/')
clean_lyrics_dir.mkdir(exist_ok=True)

# Dict-like store backed by AllLyrics.sqlite
all_lyrics = Path('AllLyrics.sqlite')
aly = SqliteDict(all_lyrics)

Found 78728 files


In [3]:
class LyricGeniusFormatter:
    """Convert a raw lyrics JSON file into a dict of cleaned, sectioned English lyrics.

    Each raw file is expected to hold a dict with an 'artist' key and a 'songs' list whose
    first entry carries 'title', 'year', 'image' and 'lyrics'.
    """

    # Minimum language-detector score for a song to be accepted as English.
    _MIN_EN_SCORE = 0.9

    # Section boundary: a blank line (two newlines) followed by a header-like line, i.e. a line
    # that starts with '[', '{' or a digit, is of the form '(...)', ends with ':', or starts
    # with 'Repeat'/'repeat'. The group is non-capturing so re.split() drops the header text;
    # a header is a single line and would be discarded by the "more than one line" filter anyway.
    _HEADER_RE = re.compile(r'\n\n(?:\[.*|\(.*\)|\{.*|[0-9].*|.*:\n|[Rr]epeat.*)')

    # Residual annotations inside a section: '(...)' and '[...]' spans, stray brackets, colons.
    _RESIDUE_RE = re.compile(r'\([^)].*\)|\[.*?\]|\(|\)|\[|\]|:')

    def __init__(self):
        """Load the spaCy pipeline (with a language detector appended) and the BERT tokenizer."""
        self.nlp = spacy.load('en')
        self.nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
        self.tok = BertTokenizer.from_pretrained('bert-base-uncased')

    def detect_lang(self, text: str):
        """Run `text` through the spaCy pipeline and return its `doc._.language` value.

        `format_lyrics` reads the 'language' and 'score' keys of the returned mapping.
        """
        doc = self.nlp(text)
        return doc._.language

    def format_lyrics(self, raw_lyrics_file: Path):
        """Load one raw lyrics file and return its cleaned representation.

        Args:
            raw_lyrics_file: path of the JSON file to read.

        Returns:
            None when the lyrics are not detected as English with a score of at least 0.9.
            Otherwise a dict with keys 'artist', 'title', 'year', 'image', 'raw_lyrics' and
            'sections', where 'sections' is a list of sections and each section is a list of
            lyric lines. Lines that tokenize to nothing are removed, and sections with fewer
            than two remaining lines are dropped.
        """
        # Load song dict. JSON is UTF-8 by specification, so do not depend on the locale default.
        with open(raw_lyrics_file, 'r', encoding='utf-8') as rlf:
            raw = json.load(rlf)
        song = raw['songs'][0]
        raw_lyrics = str(song['lyrics'])

        # Detect language
        detect = self.detect_lang(raw_lyrics)
        if 'en' not in detect['language'] or detect['score'] < self._MIN_EN_SCORE:
            return None

        # Copy fields
        formatted = {
            'artist': raw['artist'],
            'title': song['title'],
            'year': song['year'],
            'image': song['image'],
            'raw_lyrics': raw_lyrics,
        }

        # Clean lyrics. The '\n\n' prefix lets a header on the very first line (e.g. [Intro])
        # match the boundary pattern too.
        sections = list()
        for raw_sect in self._HEADER_RE.split('\n\n' + raw_lyrics):
            clean_sect = self._RESIDUE_RE.sub('', raw_sect)
            # Keep only lines that produce at least one token
            split_sect = [line for line in clean_sect.split('\n') if self.tok.tokenize(line)]
            if len(split_sect) > 1:
                sections.append(split_sect)

        formatted['sections'] = sections

        return formatted

In [4]:
# TEST
f = LyricGeniusFormatter()

# Print the first song the formatter keeps so the cleaning can be verified visually.
# format_lyrics returns None for rejected (non-English) songs, so those are skipped
# instead of crashing on `.items()`.
for rlf in raw_lyrics_files:
    formatted_song = f.format_lyrics(rlf)
    if not formatted_song:
        continue

    for k, v in formatted_song.items():
        if isinstance(v, list):
            # 'sections': one lyric line per row, blank lines between sections
            print(f'\n\n{k}\n')
            for s in v:
                for l in s:
                    print(l)
                print('\n')
        else:
            print(f'{k} :: {v}\n')
    break

artist :: 03 Greedo

title :: 03 Purple Hearts

year :: 2017-07-26

image :: https://images.genius.com/6c258cf65eef13108dc10c3720ad2a25.1000x1000x1.jpg

raw_lyrics :: [Intro]
Three purple hearts
Three purple hearts
Three purple hearts

[Verse]
She ain't love me 'til she saw me shine
Playin' with a nigga all this time
You ain't love me 'til you saw me shine
Playin' with a nigga all this time
She wear a minute, now I'm on your mind
Since I lost Lil Money I been on my grind
Always been one hundred, never told her lies
I can't let her go, I will never really know why
Shay Shay you got my attention
Sorry for times I went missing
Have you heard 'bout whose lips I'm kissin'
I understand why you keepin' your distance
But I still love you
And I never care who done fucked you
Been in this game, I can't judge, wave it above you
That's what these lames do
Know I done played you so I can't blame you
But why you so playful? don't make me hate you
I never fake and act like you wasn't there when I had

In [5]:
n = 0

# Output files for run number `n`
train_file = Path(f'./cleanLyrics/{n:02d}-train-base-uncased-78728.txt')
dev_file = Path(f'./cleanLyrics/{n:02d}-dev-base-uncased-78728.txt')
test_file = Path(f'./cleanLyrics/{n:02d}-test-base-uncased-78728.txt')
progress_file = Path(f'./cleanLyrics/{n:02d}-progress-base-uncased-78728.txt')

# Reload the entries already recorded in the progress file (one per line),
# or create the file empty when it does not exist yet.
if progress_file.exists():
    with open(progress_file, 'r') as pf:
        progress = [line.replace('\n', '') for line in pf]
else:
    progress = list()
    progress_file.touch()

print(len(progress))

0


In [6]:
def _append_sections(out_file, sections):
    """Append `sections` to `out_file`: one lyric line per row, a blank line after each section."""
    with open(out_file, 'a') as out:
        for section in sections:
            for line in section:
                out.write(f'{line}\n')
            out.write('\n')  # Double newline between sections


# Clean text and write to dev, test, and train files
# Membership is tested once per raw file; a set keeps that O(1) instead of scanning the list.
done = set(progress)

# NOTE: `i` counts songs kept in this run only, so the 5/5/90 cycle restarts after a resume.
i = 0
for rlf in tqdm.tqdm_notebook(raw_lyrics_files):
    if str(rlf) in done:
        continue

    formatted_song = f.format_lyrics(rlf)

    # format_lyrics returns None for rejected songs; those are only recorded as processed.
    if formatted_song:
        i += 1

        if i % 20 == 0:      # 5% to dev
            target_file = dev_file
        elif i % 20 == 1:    # 5% to test
            target_file = test_file
        else:                # 90% to train
            target_file = train_file

        _append_sections(target_file, formatted_song['sections'])

    # Record progress (in memory and on disk) so an interrupted run can resume
    progress.append(str(rlf))
    done.add(str(rlf))
    with open(progress_file, 'a') as pf:
        pf.write(f'{rlf}\n')

HBox(children=(IntProgress(value=0, max=78728), HTML(value='')))




In [7]:
# BertTokenizer is already imported in the notebook's import cell, so it is not re-imported here.
# Tokenizer used below to measure line lengths of the generated training file.
bt = BertTokenizer.from_pretrained('bert-large-uncased')

In [11]:
# First pass: count the lines so the progress bar gets the true total.
# (Using the last enumerate index is one short, and is undefined for an empty file.)
with open(train_file, 'r') as lf:
    n_lines = sum(1 for _ in lf)

# Second pass: record the number of tokens of every line, blank separator lines included.
line_lens = list()
with open(train_file, 'r') as lf:
    for line in tqdm.tqdm_notebook(lf, total=n_lines):
        line_lens.append(len(bt.tokenize(line)))
print(len(line_lens))

HBox(children=(IntProgress(value=0, max=4149686), HTML(value='')))


4149687


In [12]:
# Order the per-line token counts ascending
line_lens = sorted(line_lens)

In [20]:
# Summary of per-line token counts: extremes and the share of lines longer than 64 tokens
longest = max(line_lens)
shortest = min(line_lens)
pct_over_64 = 100 * sum(length > 64 for length in line_lens) / len(line_lens)
print(f'Max: {longest}    Min: {shortest}    %>64: {pct_over_64}')

Max: 1054    Min: 0    %>64: 0.06665562968966093


In [None]:
# Largest token count of any single line
longest_line = max(line_lens)
print(f'Max: {longest_line}')

In [None]:
# Put lyrics in AllLyrics.sqlite: one entry per artist, holding a list of formatted songs
for rlf in raw_lyrics_files:
    formatted_song = f.format_lyrics(rlf)

    # format_lyrics returns None for rejected songs; there is nothing to store for those
    if not formatted_song:
        continue

    artist = formatted_song['artist']
    songs = aly[artist] if artist in aly else list()

    # Skip titles already stored for this artist
    if any(song['title'] == formatted_song['title'] for song in songs):
        continue

    # Values read from the store are deserialized copies, so appending in place would be lost;
    # the updated list has to be assigned back to the key.
    songs.append(formatted_song)
    aly[artist] = songs

# The store was opened without autocommit, so writes must be committed explicitly
aly.commit()