In [4]:
import os
import re

import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
def extract_lyrics(file, encoding = 'utf-8'):
    """Parse a pipe-delimited lyrics dump into a DataFrame.

    Every line of `file` is split on '|'. A line is kept only if it does not
    contain the placeholder text 'We do not have the lyrics' and splits into
    exactly four fields.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. Passing it explicitly keeps the
        result independent of the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, with the string columns 'lyrics', 'title_name',
        'artist_name' and 'clean_title' (in that order).
    """
    columns = ['lyrics', 'title_name', 'artist_name', 'clean_title']
    missing_lyrics_marker = 'We do not have the lyrics'

    rows = []
    with open(file, 'r', encoding = encoding) as f:
        # Iterate lazily instead of holding every raw line in memory at once.
        for line in f:
            if missing_lyrics_marker in line:
                continue
            # Drop the line terminator before splitting so that it never ends
            # up inside the last field (clean_title).
            fields = line.rstrip('\n').split('|')
            if len(fields) == len(columns):
                rows.append(fields)

    return pd.DataFrame(rows, columns = columns)

In [6]:
def spacy_lemmatizer(text, nlp):
    """Lemmatize a list of tokens with a spaCy-style pipeline.

    The tokens in `text` are joined with single spaces, the resulting string
    is passed to `nlp`, and the `lemma_` attribute of every token that `nlp`
    yields is collected.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.
    nlp : callable
        Called with the joined string; must return an iterable of objects
        exposing a `lemma_` attribute.

    Returns
    -------
    list
        The `lemma_` value of each token produced by `nlp`, in order.
    """
    sentence = ' '.join(text)

    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)

    return lemmas

In [7]:
# Loading a spaCy pipeline is expensive, so keep one instance per model name
# and reuse it when process_lyrics is called repeatedly (e.g. once per file).
_SPACY_MODELS = {}


def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, loading it only on first request."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def _clean_lyrics_text(lyrics):
    """Run the regex clean-up steps on a Series of raw lyric strings; returns lowercased text."""
    # Remove scraping artifacts.
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    lyrics = lyrics.str.replace(r'^(.+\s{4,})', r' ', regex = True)
    lyrics = lyrics.str.replace(r'\w+:', r' ', regex = True)

    # Remove non-alphabetic characters and remove words with length <= 2
    lyrics = lyrics.str.replace(r'[\W\d]', r' ', regex = True)
    lyrics = lyrics.str.replace(r'\b\w{0,2}\b', r' ', regex = True)

    # Remove header 'Song title Lyrics'
    # NOTE(review): the match is greedy, so everything up to the LAST
    # occurrence of ' Lyrics' (capital L) is removed - confirm this is intended.
    lyrics = lyrics.str.replace(r'^[\w\W]+ Lyrics', r'', regex = True)

    # Convert all lengths of whitespace to single whitespaces, strip outer whitespaces
    lyrics = lyrics.str.replace(r'\s+', r' ', regex = True).str.strip()

    # Lowercase all words
    return lyrics.str.lower()


def process_lyrics(df, min_valid_tokens = 15):
    """Clean, tokenize, filter and lemmatize the `lyrics` column of `df`.

    Steps, in order: regex clean-up and lowercasing of the raw text,
    tokenization with nltk's `word_tokenize`, removal of nltk's English stop
    words, dropping of rows left with fewer than `min_valid_tokens` tokens,
    and lemmatization of the surviving rows with spaCy's 'en_core_web_sm'
    pipeline via `spacy_lemmatizer`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named 'lyrics'. It is not modified.
    min_valid_tokens : int, default 15
        Minimum number of tokens (after stop-word removal) a row needs in
        order to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the kept rows (original index labels preserved)
        whose 'lyrics' column contains lists of lemmas.
    """
    processed_df = df.copy()
    processed_df.lyrics = _clean_lyrics_text(processed_df.lyrics)

    # Reconvert to list
    processed_df.lyrics = processed_df.lyrics.apply(word_tokenize)

    # Remove stopwords; a set makes each membership test O(1) instead of a
    # scan over the whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    processed_df.lyrics = processed_df.lyrics.apply(
        lambda tokens: [word for word in tokens if word not in stop_words])

    # Keep records with a minimum number of tokens. The explicit copy makes the
    # filtered frame independent, so the column assignment below does not
    # trigger pandas' SettingWithCopyWarning.
    has_enough_tokens = processed_df.lyrics.apply(len) >= min_valid_tokens
    processed_df = processed_df.loc[has_enough_tokens].copy()

    # Lemmatization
    nlp = _load_spacy_model('en_core_web_sm')
    processed_df.lyrics = processed_df.lyrics.apply(lambda tokens: spacy_lemmatizer(tokens, nlp))

    return processed_df

In [8]:
def extract_and_process_lyrics(files, min_valid_tokens = 15):
    """Extract and process lyrics from one file or from several files.

    Parameters
    ----------
    files : str, os.PathLike, list or tuple
        A single path, or a list/tuple of paths. Each path is passed to
        `extract_lyrics` and the result to `process_lyrics`.
    min_valid_tokens : int, default 15
        Forwarded to `process_lyrics`.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame for a single path, or the row-wise concatenation
        of the per-file frames for a list/tuple (index labels of the
        individual frames are kept as they are). For any other input type a
        message is printed and None is returned.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses and path objects such as pathlib.Path.
    if isinstance(files, (str, os.PathLike)):
        return process_lyrics(extract_lyrics(files), min_valid_tokens)

    if isinstance(files, (list, tuple)):
        dfs = [process_lyrics(extract_lyrics(file), min_valid_tokens) for file in files]
        return pd.concat(dfs, axis = 0)

    print('Must input a single file string or a list of file strings.')
    return None

In [9]:
def stitch_lyrics_and_metadata_frames(lyrics_df, metadata_df):
    """Inner-join song metadata with lyrics and drop the id / album columns.

    Parameters
    ----------
    lyrics_df : pandas.DataFrame
        Must contain the columns 'title_name' and 'artist_name'.
    metadata_df : pandas.DataFrame
        Must contain 'title_name', 'artist_name' and the columns removed
        below ('title_id', 'genre_id', 'album_id', 'album_name', 'artist_id').

    Returns
    -------
    pandas.DataFrame
        Rows present in both frames (matched on title and artist name), with
        the metadata columns first and the five columns listed above removed.
    """
    join_keys = ['title_name', 'artist_name']
    unwanted_columns = ['title_id', 'genre_id', 'album_id', 'album_name', 'artist_id']

    return (
        metadata_df
        .merge(lyrics_df, how = 'inner', on = join_keys)
        .drop(columns = unwanted_columns)
    )

In [10]:
def process_data(lyric_files, metadata_file, dest_file, min_valid_tokens = 15):
    """Build the merged lyrics + metadata table and write it to parquet.

    Parameters
    ----------
    lyric_files : str or list
        Passed to `extract_and_process_lyrics`.
    metadata_file : str
        Tab-separated file without a header row; its columns are named
        title_id, title_name, genre_id, genre_name, album_id, album_name,
        artist_id, artist_name (in that order).
    dest_file : str
        Destination handed to `DataFrame.to_parquet`.
    min_valid_tokens : int, default 15
        Passed to `extract_and_process_lyrics`.

    Returns
    -------
    pandas.DataFrame
        The merged frame, de-duplicated on ('title_name', 'artist_name'),
        identical to what was written to `dest_file`.
    """
    # Lyrics first, exactly as before: this is the expensive step.
    processed_lyrics = extract_and_process_lyrics(lyric_files, min_valid_tokens)

    metadata_columns = [
        'title_id', 'title_name',
        'genre_id', 'genre_name',
        'album_id', 'album_name',
        'artist_id', 'artist_name',
    ]
    metadata = pd.read_csv(metadata_file, sep = '\t', header = None)
    metadata.columns = metadata_columns

    # Join the two sources, then keep a single row per (title, artist) pair.
    result = stitch_lyrics_and_metadata_frames(processed_lyrics, metadata).drop_duplicates(
        subset = ['title_name', 'artist_name'])

    result.to_parquet(dest_file)

    return result

In [12]:
# Input and output locations (relative paths).
song_lyrics_file = '../data/raw/songlyrics_lyrics_merged.csv'
metadata_file = '../data/raw/music_data.tsv'
dest_file = '../data/cleaned/tokenized_data_complete.parquet'

# Run the whole pipeline with a minimum of 10 tokens per song and keep the
# merged frame for inspection.
merged_data_all_from_source = process_data(
    lyric_files = song_lyrics_file,
    metadata_file = metadata_file,
    dest_file = dest_file,
    min_valid_tokens = 10,
)