In [None]:
import re
import json
import spacy

from pathlib import Path
from sqlitedict import SqliteDict
from spacy_langdetect import LanguageDetector

In [None]:
# Directory holding the raw lyric JSON files.
raw_lyrics_dir = Path('./rawLyrics/')
# glob() yields entries in filesystem order, which differs between machines
# and runs; sorting keeps every "first file" check below reproducible.
raw_lyrics_files = sorted(raw_lyrics_dir.glob('*.json'))
print(f'Found {len(raw_lyrics_files)} files')

# Output directory for the per-artist cleaned text files.
clean_lyrics_dir = Path('./cleanLyrics/')
clean_lyrics_dir.mkdir(exist_ok=True)

# Persistent store filled by the last cell of the notebook.
all_lyrics = Path('AllLyrics.sqlite')
aly = SqliteDict(all_lyrics)

In [None]:
class LyricGeniusFormatter:
    """Turn a raw lyrics JSON file into a cleaned, sectioned song dict.

    The constructor loads the spaCy ``'en'`` pipeline and appends a
    ``LanguageDetector`` component, which ``format_lyrics`` uses to reject
    songs that are not confidently detected as English.
    """

    # A new section starts at a blank line followed by a header-like line:
    # [... , (...) , {... , a digit... , ...: , or Repeat.../repeat...
    # The group is non-capturing so re.split returns only the text between
    # headers; the header lines themselves are discarded.
    _SECTION_HEADER = re.compile(
        r'\n\n(?:\[.*|\(.*\)|\{.*|[0-9].*|.*:\n|[Rr]epeat.*)'
    )

    # Residual annotations inside a section: "(...)" and "[...]" groups plus
    # any stray parentheses. Both groups are non-greedy so that text between
    # two annotations on the same line ("(Hey) baby (hey)") is kept.
    _RESIDUAL = re.compile(r'\(.*?\)|\[.*?\]|\(|\)')

    def __init__(self):
        # NOTE(review): the 'en' shortcut and add_pipe(component, name=...)
        # look like the spaCy 2.x API - confirm the pinned spaCy version.
        self.nlp = spacy.load('en')
        self.nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    def detect_lang(self, text: str):
        """Run the pipeline on ``text`` and return ``doc._.language``.

        ``format_lyrics`` reads the ``'language'`` and ``'score'`` keys of
        the returned value.
        """
        doc = self.nlp(text)
        return doc._.language

    def format_lyrics(self, raw_lyrics_file: Path):
        """Load one raw lyrics JSON file and return a cleaned song dict.

        Only the first entry of ``raw['songs']`` is used.

        Args:
            raw_lyrics_file: Path to a JSON file with an ``artist`` key and a
                ``songs`` list whose first item provides ``title``, ``year``,
                ``image`` and ``lyrics``.

        Returns:
            A dict with the keys ``artist``, ``title``, ``year``, ``image``,
            ``raw_lyrics`` and ``sections`` (a list of sections, each a list
            of non-empty lyric lines), or ``None`` when the lyrics are not
            detected as English with a score of at least 0.9.

        Raises:
            KeyError / IndexError: if the JSON lacks one of the fields above.
        """
        # Load song dict. The encoding is explicit so non-ASCII lyrics do not
        # depend on the platform's default encoding.
        with open(raw_lyrics_file, 'r', encoding='utf-8') as rlf:
            raw = json.load(rlf)
        song = raw['songs'][0]
        raw_lyrics = str(song['lyrics'])

        # Detect language
        detect = self.detect_lang(raw_lyrics)
        if 'en' not in detect['language'] or detect['score'] < 0.9:
            print(f'Not EN: {raw_lyrics_file}\n')
            return None

        # Copy fields
        formatted = {
            'artist': raw['artist'],
            'title': song['title'],
            'year': song['year'],
            'image': song['image'],
            'raw_lyrics': raw_lyrics,
        }

        # Clean lyrics. A blank line is prepended so that a header on the
        # very first line (e.g. "[Intro]") is recognised as well.
        sections = []
        for raw_sect in self._SECTION_HEADER.split('\n\n' + raw_lyrics):
            clean_sect = self._RESIDUAL.sub('', raw_sect)
            split_sect = [l for l in clean_sect.split('\n') if l]
            # Sections with fewer than two lines are dropped.
            if len(split_sect) > 1:
                sections.append(split_sect)

        formatted['sections'] = sections

        return formatted

In [None]:
# TEST
f = LyricGeniusFormatter()

# Visually verify the first song that passes the language filter.
# format_lyrics returns None for rejected songs, so those are skipped
# instead of failing on `.items()`.
for rlf in raw_lyrics_files:
    formatted_song = f.format_lyrics(rlf)
    if formatted_song is None:
        continue

    for k, v in formatted_song.items():
        if isinstance(v, list):
            # 'sections' is a list of line lists: print it section by section
            print(f'\n\n{k}\n')
            for s in v:
                for l in s:
                    print(l)
                print('\n')
        else:
            print(f'{k} :: {v}\n')
    break

In [None]:
# Make {artist}_allcleansongs.txt for each artist
started_files = set()   # files already (re)created during this run
verified = False        # the visual check below runs only once

for rlf in raw_lyrics_files:
    formatted_song = f.format_lyrics(rlf)
    if formatted_song is None:   # rejected by the language filter
        continue

    # A path separator in the artist name (e.g. "AC/DC") would point into a
    # sub-directory that does not exist, so replace it.
    safe_artist = re.sub(r'[\\/]', '_', str(formatted_song['artist']))
    lyrics_file = clean_lyrics_dir / f'{safe_artist}_allcleansongs.txt'

    # Truncate each artist file the first time it is touched in this run and
    # append afterwards, so re-running the cell does not duplicate songs.
    mode = 'a' if lyrics_file in started_files else 'w'
    started_files.add(lyrics_file)

    with open(lyrics_file, mode, encoding='utf-8') as lf:
        for section in formatted_song['sections']:
            for l in section:   # each section is a list of lyric lines
                lf.write(f'{l}\n')
            lf.write('\n')  # Double newline between sections
        lf.write('\n')      # Triple newline between songs

    # Verify visually, right after the first song has been written
    if not verified:
        verified = True
        with open(lyrics_file, 'r', encoding='utf-8') as lf:
            for line in lf:
                print(f'{len(line):2d} :: {line}')

In [None]:
# Put lyrics in AllLyrics.sqlite
for rlf in raw_lyrics_files:
    formatted_song = f.format_lyrics(rlf)
    if formatted_song is None:   # rejected by the language filter
        continue

    artist = formatted_song['artist']

    # SqliteDict returns a deserialized copy of the stored value, so an
    # in-place append would never reach the database: update the list
    # locally and assign it back.
    songs = aly[artist] if artist in aly else []
    if all(song['title'] != formatted_song['title'] for song in songs):
        songs.append(formatted_song)
        aly[artist] = songs

# The store was opened without autocommit, so pending writes have to be
# committed explicitly or they are lost.
aly.commit()