In [4]:
import json
import re
import os
import pandas as pd

### Clean up saved genre dataframes

#### Genius metadata regex

In [14]:
# Pattern for the metadata Genius wraps around scraped lyrics. It is meant for
# lowercased text whose newlines were already replaced by spaces: '.' does not
# match '\n' (no re.DOTALL), so the pattern cannot span several lines.
#
#   \d*.*          number of contributors
#   contributors   'contributors'
#   .*             translators and otherwise
#   lyrics         'lyrics'
#   (.*)           group 1: the lyrics themselves (greedy)
#   \d*embed       optional numbers then 'embed'
#
# Because group 1 is greedy, digits directly in front of 'embed' are captured
# by the group rather than consumed by the trailing \d*; number_removal_regex
# below exists to strip them afterwards.
genius_regex_text = r'\d*.*contributors.*lyrics(.*)\d*embed'
genius_regex = re.compile(genius_regex_text)

# removal of last number left after the regex: matches the final run of
# digits in the string (a run of digits with no digit anywhere after it)
number_removal_regex = r'(\d+)(?!.*\d)'

In [16]:
def cleanup_genius_lyrics(df, genius_regex, number_removal_regex = None,
                         remove_newlines=True,
                         remove_question=True, remove_interp=True,
                         remove_dollar=True, column='Song Lyrics'):
    """
    Clean up the lyrics text stored in one column of a dataframe.

    The text is lowercased and stripped, the Genius metadata matched by
    ``genius_regex`` is replaced by the pattern's first capture group, and
    a set of punctuation characters is deleted. The characters ``( ) - '``
    are always removed; the others depend on the flags below.

    Parameters
    ----------
    df : DataFrame holding the lyrics. It is modified in place: ``column``
        is overwritten with the cleaned text.
    genius_regex : pattern for the genius metadata. It must define a first
        capture group, because every match is replaced by ``\\1``.
    number_removal_regex : optional pattern whose matches are deleted after
        the metadata step, for numbers the genius regex doesn't remove.
    remove_newlines : replace every newline by a space before the regexes
        run (patterns built on '.' cannot cross line breaks otherwise).
    remove_question : delete '?'.
    remove_interp : delete the punctuation marks ``, . ! : ;``.
    remove_dollar : delete '$'.
    column : name of the column to clean, 'Song Lyrics' by default.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    # lowercase first for easier regex application
    lyrics = df[column].str.lower().str.strip()

    # removing newlines before regex makes it supremely easier to apply
    if remove_newlines:
        lyrics = lyrics.str.replace('\n', ' ', regex=False)

    # remove genius metadata; the result is assigned back instead of calling
    # replace(inplace=True) on a column selection, which does not reach the
    # parent dataframe under pandas Copy-on-Write
    lyrics = lyrics.replace({genius_regex: r'\1'}, regex=True)

    # leftover number removal
    if number_removal_regex:
        lyrics = lyrics.replace({number_removal_regex: ''}, regex=True)

    # Collect every character to delete, then drop them in a single pass.
    # They are plain characters, never regex patterns ('.', '$', '?' and '('
    # would otherwise be regex metacharacters).
    # The apostrophe goes for improved word counting / uniformity.
    unwanted = "()-'"
    if remove_question:
        unwanted += '?'
    if remove_dollar:
        unwanted += '$'
    if remove_interp:
        unwanted += ',.!:;'
    lyrics = lyrics.str.translate(str.maketrans('', '', unwanted))

    df[column] = lyrics
    return df

In [17]:
# Directory of the saved genre dataframes (relative to the notebook's working
# directory); the cleaned copies are written into this same directory.
dataframes_path = '../dataframes/genres'

### Clean all genre dataframes, save to new ones

In [19]:
# Clean every saved genre file found in dataframes_path and store the result
# next to it under a 'cleaned_' prefix.
# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for frame in sorted(os.listdir(dataframes_path)):
    frame_path = os.path.join(dataframes_path, frame)

    # skip directories (such as '.ipynb_checkpoints') and hidden entries:
    # they are not saved dataframes and would make read_csv fail
    if frame.startswith('.') or not os.path.isfile(frame_path):
        continue
    # outputs of a previous run live in the same directory; do not clean
    # them a second time
    if frame.startswith('cleaned'):
        continue

    dataframe = pd.read_csv(frame_path, index_col=0)

    # cleanup_genius_lyrics modifies and returns the frame it is given
    df = cleanup_genius_lyrics(dataframe, genius_regex_text, number_removal_regex)
    df.to_csv(os.path.join(dataframes_path, f'cleaned_{frame}'))