In [None]:
import re
import json
import spacy

from pathlib import Path
from sqlitedict import SqliteDict
from spacy_langdetect import LanguageDetector

In [None]:
# Collect every JSON file in ./rawLyrics/. The list is sorted because glob()
# yields entries in filesystem order, which can differ between runs/machines;
# sorting makes "the first file" (used by the test cell) reproducible.
raw_lyrics_dir = Path('./rawLyrics/')
raw_lyrics_files = sorted(raw_lyrics_dir.glob('*.json'))
print(f'Found {len(raw_lyrics_files)} files')

# Make sure ./cleanLyrics/ exists; exist_ok keeps this cell re-runnable.
clean_lyrics_dir = Path('./cleanLyrics/')
clean_lyrics_dir.mkdir(exist_ok=True)

# Open the SQLite-backed dict stored in AllLyrics.sqlite.
all_lyrics = Path('AllLyrics.sqlite')
aly = SqliteDict(all_lyrics)

In [None]:
class Formatter:
    """Convert raw lyrics JSON files into cleaned, section-annotated dicts.

    Only the first entry of a file's ``songs`` list is used. Files without
    lyrics, or whose lyrics are not confidently detected as English, are
    skipped: ``format_lyrics`` prints a note and returns ``None`` for them.
    """

    # Minimum detector score for lyrics to be accepted as English.
    MIN_EN_SCORE = 0.9

    # A line containing any of these characters is treated as a section header.
    HEADER_MARKERS = ('[', '(', '{', ':')

    # Compiled once; applied to every song and every section line.
    _BRACKETS_RE = re.compile(r'\[.*?\]')   # e.g. "[Chorus]"
    _PARENS_RE = re.compile(r'\([^)]*\)')   # e.g. "(yeah)"

    def __init__(self):
        """Load the spaCy 'en' pipeline and append a language-detector pipe."""
        # NOTE(review): the 'en' shortcut name and passing a component instance
        # to add_pipe look like spaCy v2 usage -- confirm the pinned version.
        self.nlp = spacy.load('en')
        self.nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    def detect_lang(self, text: str):
        """Run ``text`` through the pipeline and return ``doc._.language``.

        ``format_lyrics`` reads the ``'language'`` and ``'score'`` keys of the
        returned value.
        """
        doc = self.nlp(text)
        return doc._.language

    def _is_header(self, line: str) -> bool:
        """Return True if ``line`` contains any header marker character."""
        return any(marker in line for marker in self.HEADER_MARKERS)

    def _strip_parens(self, lines):
        """Return ``lines`` with every parenthesised span removed from each."""
        return [self._PARENS_RE.sub('', line) for line in lines]

    def _clean_lines(self, raw_lyrics: str):
        """Drop bracketed and parenthesised spans, then return non-empty lines."""
        no_headers = self._BRACKETS_RE.sub('', raw_lyrics)
        no_parenth = self._PARENS_RE.sub('', no_headers)
        return [line for line in no_parenth.split('\n') if line]

    def _split_sections(self, raw_lyrics: str):
        """Split lyrics on blank lines into ``{'type', 'text'}`` section dicts.

        ``type`` is the header line when one is found in the first or second
        line of a chunk, otherwise ``'MISSING HEADER'`` (multi-line chunk) or
        ``'UNKNOWN'`` (single-line chunk, which is also printed).
        """
        sections = []
        for chunk in raw_lyrics.split('\n\n'):  # sections are separated by a blank line
            lines = [line for line in chunk.split('\n') if line]

            # Runs of blank lines and trailing newlines yield empty chunks;
            # they carry no lyrics, so do not record them as sections.
            if not lines:
                continue

            if len(lines) > 2 and self._is_header(lines[0]):
                # Header on the first line
                section = {'type': lines[0], 'text': self._strip_parens(lines[1:])}
            elif len(lines) > 3 and self._is_header(lines[1]):
                # Sometimes the header is on the second line
                section = {'type': lines[1], 'text': self._strip_parens(lines[2:])}
            elif len(lines) > 1:
                # Record exceptions: multi-line chunk without a usable header
                section = {'type': 'MISSING HEADER', 'text': lines}
            else:
                # Single-line chunk: print it so it can be inspected
                print(f'{lines}\n\n')
                section = {'type': 'UNKNOWN', 'text': lines}

            sections.append(section)
        return sections

    def format_lyrics(self, raw_lyrics_file: Path):
        """Load one raw lyrics JSON file and build its formatted dict.

        Args:
            raw_lyrics_file: Path of a JSON file holding an ``'artist'`` key and
                a ``'songs'`` list; the first song supplies ``'title'``,
                ``'year'``, ``'image'`` and ``'lyrics'``.

        Returns:
            A dict with keys ``artist``, ``title``, ``year``, ``image``,
            ``raw_lyrics``, ``clean_lyrics`` (list of lines) and ``sections``
            (list of ``{'type', 'text'}`` dicts), or ``None`` when the file has
            no song/lyrics or the lyrics are not detected as English with a
            score of at least ``MIN_EN_SCORE``.

        Raises:
            KeyError: If ``'artist'``, ``'title'``, ``'year'`` or ``'image'``
                is missing from an otherwise accepted file.
        """
        # Explicit UTF-8 so non-ASCII lyrics decode the same on every platform
        # instead of depending on the locale's default encoding.
        with open(raw_lyrics_file, 'r', encoding='utf-8') as rlf:
            raw = json.load(rlf)

        # Skip files with no song entry or no lyrics instead of crashing
        # (or running language detection on the literal string 'None').
        songs = raw.get('songs') or []
        lyrics = songs[0].get('lyrics') if songs else None
        if lyrics is None:
            print(f'No lyrics: {raw_lyrics_file}\n')
            return None
        song = songs[0]
        raw_lyrics = str(lyrics)

        # Detect language
        detect = self.detect_lang(raw_lyrics)
        if 'en' not in detect['language'] or detect['score'] < self.MIN_EN_SCORE:
            print(f'Not EN: {raw_lyrics_file}\n')
            return None

        # Key order is kept stable because callers print the dict field by field.
        return {
            'artist': raw['artist'],
            'title': song['title'],
            'year': song['year'],
            'image': song['image'],
            'raw_lyrics': raw_lyrics,
            'clean_lyrics': self._clean_lines(raw_lyrics),
            'sections': self._split_sections(raw_lyrics),
        }

In [None]:
# TEST: format files until one is accepted, then print its fields
f = Formatter()

for rlf in raw_lyrics_files:
    formatted = f.format_lyrics(rlf)

    # format_lyrics returns None for files it skips (e.g. non-English lyrics);
    # calling .items() on that would raise, so move on to the next file.
    if formatted is None:
        continue

    for k, v in formatted.items():
        print(f'{k} :: {v}')
    break
