In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

In [2]:
def extract_correctly_matched_songs(file):
    """Load a lyrics TSV and keep only rows whose lyrics match their title.

    Every line of ``file`` is expected to contain three tab-separated
    fields: the lyrics (lines delimited by ``|``), the song title and the
    artist. A row is kept when the song title occurs, case-insensitively,
    inside the first ``|``-delimited segment of the lyrics.

    Lines that do not split into exactly three fields (blank lines, rows
    with stray tabs) are skipped instead of raising, because they can be
    neither matched nor placed into the three output columns.

    Args:
        file: Path of the TSV file to read. The file is decoded as UTF-8.

    Returns:
        A ``pandas.DataFrame`` with the columns ``lyrics``, ``song_title``
        and ``artist``. Fields are stored exactly as read, so ``artist``
        still carries the line's trailing newline.
    """
    columns = ['lyrics', 'song_title', 'artist']
    matched_rows = []

    # Stream the file and split each line only once.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) != len(columns):
                # Malformed row: skip it rather than crash on indexing or
                # on DataFrame construction.
                continue

            lyrics, song_title = fields[0], fields[1]
            first_lyrics_line = lyrics.split('|')[0]
            if song_title.lower() in first_lyrics_line.lower():
                matched_rows.append(fields)

    return pd.DataFrame(matched_rows, columns=columns)
    

In [3]:
# helper function to change nltk's part of speech tagging to a wordnet format.
def pos_tagger(nltk_tag):
    """Translate an nltk part-of-speech tag into the matching wordnet constant.

    The tag is classified by its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag yields
    ``None``.
    """
    # (tag prefix, name of the wordnet attribute), checked in this order.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Look the constant up only once a prefix matches.
            return getattr(wordnet, attribute)
    return None

In [42]:
def process_lyrics(df):
    """Clean, tokenize and lemmatize the lyrics of a song DataFrame.

    The input frame is not modified; all work happens on a copy.

    Lyrics pipeline, in order:
      1. replace the ``|`` line delimiters with spaces;
      2. drop the scraping artifacts ``You might also like`` and ``Embed``;
      3. replace non-alphabetic characters (punctuation, digits,
         underscores) with spaces and drop words of length <= 2;
      4. remove the leading ``<song title> Lyrics`` header;
      5. collapse whitespace, strip and lowercase;
      6. tokenize, drop English stopwords, POS-tag and lemmatize. Tokens
         for which ``pos_tagger`` returns ``None`` are dropped.

    The artist column only has newline characters removed.

    Args:
        df: DataFrame with string columns ``lyrics`` and ``artist``.

    Returns:
        A new DataFrame in which ``lyrics`` holds a list of lemmas per row
        and ``artist`` is free of newline characters.
    """
    processed_df = df.copy()

    # Lyrics processing
    cleaned_lyrics = (
        processed_df['lyrics']
        # Remove pipe delineations between lines, convert to spaces
        .str.replace('|', ' ', regex=False)
        # Remove scraping artifacts
        .str.replace(r'You might also like', r' ', regex=True)
        .str.replace(r'Embed', r' ', regex=True)
        # Remove non-alphabetic characters. `\W` does not cover `_`, so it
        # is listed explicitly.
        .str.replace(r'[\W\d_]', r' ', regex=True)
        # Remove words with length <= 2
        .str.replace(r'\b\w{0,2}\b', r' ', regex=True)
        # Remove header 'Song title Lyrics'. The match is non-greedy so it
        # stops at the FIRST ' Lyrics'; a greedy match would also delete
        # every lyric line preceding a later occurrence of the word.
        .str.replace(r'^[\w\W]+? Lyrics', r'', regex=True)
        # Convert all lengths of whitespace to single whitespaces,
        # strip outer whitespaces
        .str.replace(r'\s+', r' ', regex=True)
        .str.strip()
        # Lowercase all words
        .str.lower()
    )

    # A set gives constant-time membership tests for the stopword filter.
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    def lemmatize_text(text):
        # Reconvert to list and remove stopwords
        tokens = [word for word in word_tokenize(text) if word not in stop_words]
        # POS tagging, translated to the wordnet tag set
        tagged = [(token, pos_tagger(tag)) for token, tag in pos_tag(tokens)]
        # Lemmatization; tokens without a wordnet POS are dropped
        return [wnl.lemmatize(token, pos) for token, pos in tagged if pos is not None]

    processed_df['lyrics'] = cleaned_lyrics.apply(lemmatize_text)

    # Artist Processing
    # Remove newline characters from artist names
    processed_df['artist'] = processed_df['artist'].str.replace(r'\n', r'', regex=True)

    return processed_df

In [32]:
# Load the raw black/death metal lyrics TSV, keeping only the rows whose song
# title appears in the first lyrics line (see extract_correctly_matched_songs).
file = '../data/raw/black_death_metal_song_lyrics.tsv'
rock_death_black = extract_correctly_matched_songs(file)

In [43]:
# Clean, tokenize and lemmatize the lyrics. process_lyrics works on a copy,
# so rock_death_black keeps the raw text for comparison below.
rock_death_black_processed = process_lyrics(rock_death_black)

In [48]:
# Preview the first song's processed tokens, joined back into one string.
rock_death_black_processed.lyrics.str.join(' ')[0]

'line spew forth death meat bone march dawnless night bear die procreation sick spawn sicken breed carry forth putrid seed onward die sacrifice life field carnal remains war continue drench blood sewage warrior crawl forth onwards die burn warmachine churn spit forth death disease bring'

In [49]:
# The same song before processing (pipes replaced by spaces), for comparison.
rock_death_black.lyrics.str.split('|').str.join(' ')[0]

'Onward To Die Lyrics Lining up to spew forth death Meat and bone now march Into a dawnless night Born only to die Procreation of the sick A spawn of sickened breed To carry forth their putrid seed Onward to die Sacrificed your life Upon fields of carnal remains The war continues Drenched in blood and sewage The warriors crawl forth Onwards to die and burn The warmachine churns Spitting forth the death And the the disease they bring'

In [50]:
# Display the processed DataFrame (lemma lists, song titles, artists).
rock_death_black_processed

Unnamed: 0,lyrics,song_title,artist
0,"[line, spew, forth, death, meat, bone, march, ...",Onward to Die,Paganizer
1,"[distant, place, know, float, air, midnight, s...",An Eternal Dark Horizon,Throne Of Katarsis
2,"[dead, risen, worship, icon, resist, new, hope...",An Icon for the Damned,Paganizer
3,"[saw, morbid, dream, feel, lust, drift, place,...",Slaughtered Corpse,Amagortis
4,"[flame, knight, meet, severe, thy, moonshine, ...",Heraldic,Autumnblaze
...,...,...,...
5194,"[nocturnal, wish, sin, crave, shadow, thirsty,...",Kingdom of Abyss,Christ Agony
5195,"[spirit, live, creature, different, color, bea...",Eye Sockets Empty,Centinex
5196,"[search, specie, cover, terror, devourer, move...",Devourer of Worlds,Debauchery
5197,"[bag, fly, sea, introduce, plastic, plague, tr...",Immensity,Aeolian
