In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

In [70]:
def extract_correctly_matched_songs(file, clean_titles = False):
    """Load a tab-separated lyrics file and keep rows whose header mentions the title.

    Every line of ``file`` is split on tabs. Field 0 holds the scraped lyrics
    with lines separated by ``|``; its first ``|``-segment is treated as the
    header and must contain the title (case-insensitive substring match).

    Parameters
    ----------
    file : str
        Path to the tab-separated lyrics file.
    clean_titles : bool, default False
        If False, rows must have exactly 3 fields and field 1 (``song_title``)
        is matched against the header. If True, rows must have exactly 4 fields
        and field 3 is matched instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``lyrics``, ``song_title``, ``artist``; when ``clean_titles`` is
        True a fourth column ``clean_title`` is added (name chosen here - the
        file does not name that field, TODO confirm). The last field of every
        row keeps the line's trailing newline, exactly as it was read.
    """
    columns = ['lyrics', 'song_title', 'artist']
    if clean_titles:
        # Four fields per row need four column names; three names made
        # the DataFrame constructor fail as soon as any row matched.
        columns = columns + ['clean_title']
        title_idx = 3
    else:
        title_idx = 1
    n_fields = len(columns)

    with open(file, 'r') as f:
        rows = [line.split('\t') for line in f]

    matched_rows = []
    for fields in rows:
        # Check the field count first so short/malformed lines are skipped
        # instead of raising IndexError when the title field is read.
        if len(fields) != n_fields:
            continue
        header = fields[0].split('|')[0].lower()
        # The title field may be the last one on the line and then still ends
        # with '\n'; drop it for the comparison only, otherwise it never matches.
        title = fields[title_idx].rstrip('\n').lower()
        if title in header:
            matched_rows.append(fields)

    return pd.DataFrame(matched_rows, columns = columns)
#     wrong_lyrics_split = [x.split('\t') for x in genre_raw if x not in matched_lyrics and len(x.split('\t')) == 3]

#     wrong_df = pd.DataFrame(wrong_lyrics_split, columns = ['lyrics', 'song_title', 'artist'])
    
#     return df, (len(correct_length), len(matched_lyrics), len(genre_raw)), wrong_df
    

In [18]:
# helper function to change nltk's part of speech tagging to a wordnet format.
def pos_tagger(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    every other tag yields ``None``.
    """
    # (tag prefix, name of the wordnet attribute), checked in this order.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [68]:
def process_lyrics(df, min_tokens = 15):
    """Clean, tokenize and lemmatize the ``lyrics`` column of a songs frame.

    Steps, in order: join ``|``-separated lines, drop the scraping artifacts
    'You might also like' and 'Embed', blank out non-word characters and
    digits, drop words of length <= 2, strip the leading 'Song title Lyrics'
    header, normalize whitespace, lowercase, tokenize, remove English
    stopwords, POS-tag, and lemmatize. Tokens whose tag has no WordNet
    equivalent (``pos_tagger`` returns None) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``lyrics`` and ``artist``. Not modified.
    min_tokens : int, default 15
        Rows with fewer remaining tokens than this are discarded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``lyrics`` is a list of lemmas, newlines are
        removed from ``artist``, and short records are filtered out.
    """
    processed_df = df.copy()

    # Lyrics processing
    lyrics = processed_df['lyrics']
    # Remove pipe delineations between lines, convert to spaces
    lyrics = lyrics.str.split('|').str.join(' ')
    # Remove scraping artifacts
    lyrics = lyrics.str.replace(r'You might also like', r' ', regex = True)
    lyrics = lyrics.str.replace(r'Embed', r' ', regex = True)
    # Remove non-alphabetic characters and remove words with length <= 2
    lyrics = lyrics.str.replace(r'[\W\d]', r' ', regex = True)
    lyrics = lyrics.str.replace(r'\b\w{0,2}\b', r' ', regex = True)
    # Remove header 'Song title Lyrics'. The quantifier is lazy on purpose:
    # a greedy match runs to the LAST ' Lyrics' in the text and silently
    # deletes real lyrics whenever the song itself contains that word.
    lyrics = lyrics.str.replace(r'^[\w\W]+? Lyrics', r'', regex = True)
    # Convert all lengths of whitespace to single whitespaces, strip outer whitespaces
    lyrics = lyrics.str.replace(r'\s+', r' ', regex = True).str.strip()
    # Lowercase all words
    lyrics = lyrics.str.lower()
    # Reconvert to list
    lyrics = lyrics.apply(word_tokenize)
    # Remove stopwords (set membership instead of scanning a list per token)
    stop_words = set(stopwords.words('english'))
    lyrics = lyrics.apply(lambda tokens: [word for word in tokens if word not in stop_words])

    # POS tagging and lemmatization
    wnl = WordNetLemmatizer()

    def _lemmatize(tokens):
        # Tag, translate tags to WordNet form, lemmatize; untranslatable tags are dropped.
        tagged = [(token, pos_tagger(tag)) for token, tag in pos_tag(tokens)]
        return [wnl.lemmatize(token, pos) for token, pos in tagged if pos is not None]

    processed_df['lyrics'] = lyrics.apply(_lemmatize)

    # Artist Processing
    # Remove newline characters from artist names
    processed_df['artist'] = processed_df['artist'].str.replace(r'\n', r'', regex = True)

    # Keep records with a minimum of `min_tokens` tokens
    return processed_df.loc[processed_df['lyrics'].apply(len) >= min_tokens]

In [9]:
def process_genres(files):
    """Extract and process lyrics from one genre file or a list of genre files.

    Parameters
    ----------
    files : str or list of str
        A single file path, or a list of file paths.

    Returns
    -------
    pandas.DataFrame or None
        For a string, the processed frame of that file. For a list, the
        per-file frames concatenated row-wise (each frame keeps its own row
        labels, so labels can repeat across files). For any other input a
        message is printed and None is returned.
    """
    if isinstance(files, str):
        # The parameter is `files`; the old code passed the undefined name
        # `file` here and raised NameError for every single-path call.
        return process_lyrics(extract_correctly_matched_songs(files))

    if isinstance(files, list):
        dfs = [process_lyrics(extract_correctly_matched_songs(file)) for file in files]
        return pd.concat(dfs, axis = 0)

    print('Must input a single file string or a list of file strings.')
    return None

In [None]:
def process_data(lyric_files, metadata_file, output_path = '../data/cleaned/processed_lyrics.parquet'):
    """Join processed lyrics with song metadata and write the result to parquet.

    Parameters
    ----------
    lyric_files : str or list of str
        Passed straight to ``process_genres``.
    metadata_file : str
        Headerless tab-separated file with the eight columns title_id,
        title_name, genre_id, genre_name, album_id, album_name, artist_id,
        artist_name (in that order).
    output_path : str, default '../data/cleaned/processed_lyrics.parquet'
        Where the merged frame is written.

    Returns
    -------
    None
        The merged frame (every column except the ids, album_name and
        artist_name) is written to ``output_path``.

    Raises
    ------
    TypeError
        If ``process_genres`` rejects ``lyric_files`` and returns None.
    """
    lyrics_df = process_genres(lyric_files)
    if lyrics_df is None:
        # Fail with a clear message before touching the metadata file.
        raise TypeError('lyric_files must be a single file string or a list of file strings.')

    columns = ['title_id', 'title_name', 'genre_id', 'genre_name', 'album_id', 'album_name', 'artist_id', 'artist_name']
    metadata_df = pd.read_csv(metadata_file, sep = '\t', header = None)
    metadata_df.columns = columns

    # The lyrics frame names its keys 'song_title' / 'artist'; align them with
    # the metadata names, otherwise the merge raises KeyError on the `on` keys.
    lyrics_df = lyrics_df.rename(columns = {'song_title': 'title_name', 'artist': 'artist_name'})
    merged_df = metadata_df.merge(lyrics_df, how = 'inner', on = ['title_name', 'artist_name'])

    merged_df = merged_df.drop(['title_id', 'genre_id', 'album_id', 'album_name', 'artist_id', 'artist_name'], axis = 1)

    merged_df.to_parquet(output_path)

    return

In [71]:
# Raw per-genre lyric files (tab-separated), one path per genre.
lyrics_files = ['../data/raw/alternative_song_lyrics.tsv',
                '../data/raw/black_death_metal_song_lyrics.tsv',
                '../data/raw/blues_song_lyrics.tsv',
                '../data/raw/christian_gospel_song_lyrics.tsv',
                '../data/raw/country_song_lyrics.tsv',
                '../data/raw/dance_song_lyrics.tsv',
                '../data/raw/hip_hop_rap_song_lyrics.tsv',
                '../data/raw/jazz_song_lyrics.tsv',
                '../data/raw/pop_song_lyrics.tsv',
                '../data/raw/rb_soul_song_lyrics.tsv',
                '../data/raw/reggae_song_lyrics.tsv',
                '../data/raw/rock_hard_rock_song_lyrics.tsv']
# Extract, clean and concatenate the lyrics of every genre file into one frame.
lyrics = process_genres(lyrics_files)

In [77]:
# Spot-check the processed rows of a single artist.
# NOTE(review): the recorded output lists some songs twice under different row
# labels - nothing in the pipeline removes duplicate songs.
lyrics[lyrics['artist'] == 'Metallica']

Unnamed: 0,lyrics,song_title,artist
968,"[poor, twist, poor, twisted, feast, sympathy, ...",Poor Twisted Me,Metallica
1331,"[tread, say, tread, liberty, death, proudly, h...",Don't Tread On Me,Metallica
1470,"[poor, twist, poor, twisted, feast, sympathy, ...",Poor Twisted Me,Metallica
1996,"[tread, say, tread, liberty, death, proudly, h...",Don't Tread On Me,Metallica
3105,"[blacken, end, winter, send, throw, see, obscu...",Blackened,Metallica


In [48]:
# Scratch copy for the token-count exploration, so `lyrics` itself stays untouched.
temp = lyrics.copy(deep = True)

In [49]:
# Number of lemmas left per song after processing.
temp['num_tokens'] = temp['lyrics'].apply(len)

In [67]:
# Songs with fewer than 15 tokens.
# NOTE(review): process_lyrics already drops rows under 15 tokens, so on a
# fresh run this selection is expected to be empty; the recorded output
# presumably predates that filter - re-run to confirm.
temp[temp['lyrics'].apply(len) < 15]

Unnamed: 0,lyrics,song_title,artist,num_tokens
9,[],Act of Quiet Desperation,Walt Mink,0
29,"[hang, life, feel, pain, feel, nothing, empty,...",Crush My Soul,Death In June,13
45,[],My Only One,Upon Beauty Rests,0
54,[],Bed Ridden,Safe To Say,0
61,[],Undone,Gameface,0
...,...,...,...,...
3012,"[go, style, flame, flame]",The Embers of Fire,Coheed and Cambria,4
3013,"[join, enter, come, join, come, feel, come, ma...",Enter,After Forever,10
3119,"[come, invasion]",Invasion Warning,Gun Barrel,2
3121,"[monkey, darkbuster, come, play, town]",We Are Darkbuster,Darkbuster,5


In [66]:
# Same short-lyrics selection, ordered from fewest to most tokens.
temp[temp['num_tokens'] < 15].sort_values(by = 'num_tokens')

Unnamed: 0,lyrics,song_title,artist,num_tokens
9,[],Act of Quiet Desperation,Walt Mink,0
1221,[],To End It All,Soul Embraced,0
1187,[],Victory,Aeternus,0
1148,[],Dead Alive,Soul Embraced,0
1032,[],Bloodstained Nevada,Soul Embraced,0
...,...,...,...,...
68,"[cold, steel, table, kübler, ross, model, aest...",The Swarming of the Locusts,I Shalt Become,14
3503,"[half, hearted, reference, half, hearted, refe...",Borderline Sarcasm,As the Sun Sets,14
3292,"[slowdown, cheer, pioneer, recall, forget, eve...",No Ordinary Caveman,Head Like A Kite,14
681,"[warm, warm, creep, way, head, warm, warm, get...",Warm,Puppy,14


In [72]:
# Inspect the first record of one artist among the processed songs.
temp[temp['artist'] == 'Romain Virgo'].iloc[0]

lyrics        [exists, solely, purpose, archive, reggae, lyr...
song_title                                            Beautiful
artist                                             Romain Virgo
num_tokens                                                    9
Name: 844, dtype: object

In [14]:
# Print, per genre file, element [1] of the extractor's result.
# NOTE(review): indexing with [1] matches the tuple return that now only exists
# in the commented-out code of extract_correctly_matched_songs
# (df, (len(correct_length), len(matched_lyrics), len(genre_raw)), wrong_df).
# The function as currently defined returns a single DataFrame, so the recorded
# output below presumably comes from that earlier version - re-run to confirm.
for genre in lyrics_files:
    print(genre, extract_correctly_matched_songs(genre)[1])

../data/raw/alternative_song_lyrics.tsv (3701, 3716, 7906)
../data/raw/black_death_metal_song_lyrics.tsv (5199, 5199, 8654)
../data/raw/blues_song_lyrics.tsv (1983, 1995, 7445)
../data/raw/christian_gospel_song_lyrics.tsv (1856, 1869, 7734)
../data/raw/country_song_lyrics.tsv (2326, 2332, 8105)
../data/raw/dance_song_lyrics.tsv (948, 954, 5716)
../data/raw/hip_hop_rap_song_lyrics.tsv (1067, 1073, 6945)
../data/raw/jazz_song_lyrics.tsv (1502, 1508, 6465)
../data/raw/pop_song_lyrics.tsv (1704, 1710, 5871)
../data/raw/rb_soul_song_lyrics.tsv (806, 818, 6588)
../data/raw/reggae_song_lyrics.tsv (2061, 2069, 7567)
../data/raw/rock_hard_rock_song_lyrics.tsv (3140, 3143, 8631)


In [15]:
# extract_correctly_matched_songs returns the matched-songs DataFrame directly,
# so no [0] index is needed (on a DataFrame, [0] is a column lookup and fails).
reggae = extract_correctly_matched_songs('../data/raw/reggae_song_lyrics.tsv')

In [16]:
# Display the matched reggae songs (raw lyrics, before process_lyrics).
reggae

Unnamed: 0,lyrics,song_title,artist
0,Mama Told Me Lyrics|Sugar in your tea|What's a...,Mama Told Me,The Slackers\n
1,"Numbered Lyrics|Your days numbered, num number...",Numbered,RastaMiles\n
2,What is it? Lyrics|Guess I haven't seen the su...,What Is It?,Indios Bravos\n
3,"Dark Forces Lyrics|The wind in the air, and th...",Dark Forces,RastaMiles\n
4,No Fear Lyrics|I first saw you pass me down th...,No Fear,Mouthwash\n
...,...,...,...
2056,Piece of the Pie Lyrics|What is the jungle|Equ...,Piece of the Pie,Jimmy Cliff\n
2057,Road Foggy Lyrics|My way is so long so long|Bu...,Road Foggy,Burning Spear\n
2058,Real and Right Lyrics|Oh what a competition|Bu...,Real and Right,Israel Vibration\n
2059,Marcus garvey Lyrics|Kebra Negast means Glory ...,Marcus Garvey,Tarrus Riley\n


In [26]:
# NOTE(review): element [2] corresponds to `wrong_df` in the commented-out
# tuple return of extract_correctly_matched_songs; the function as currently
# defined returns only one DataFrame, so this cell presumably ran against that
# earlier version - re-run to confirm.
reggae_wrong = extract_correctly_matched_songs('../data/raw/reggae_song_lyrics.tsv')[2]

In [42]:
# Show rows 40-49 (by position) of the songs whose title was not found in the lyrics header.
reggae_wrong.iloc[40:50]

Unnamed: 0,lyrics,song_title,artist
40,Ouais gros on s'retrouve en exclusivité pour l...,Sugarboys,Sugarboys\n
41,The surgeon had to operate on his knee to repl...,Really Gone!,Mom Blaster\n
42,"""All this happened,"" thought Nekhludoff, ""beca...",To Be In Love Under The Rain,The Expos\n
43,Computing Machinery and Intelligence Lyrics|By...,Almighty's Creation,Joshua Alo\n
44,"Grew up in the 80’s, started banging in the ni...",Life Is a Grave,The Blaster Master\n
45,Ancora silenzio pezzi di merda|È il traffico c...,Step By Step,Skunk Allstars\n
46,"In Which Jos Takes Flight, and the War Is Brou...",All This Time (Live),William White\n
47,I will not insist on this point because I am n...,Lay Awake (Live),Passafire\n
48,"Quartermaster Lyrics|Yo, when I step in, man t...",Stand Down,T.U.G.G.\n
49,"When it was the Forty-sixth Night,|""Most beaut...",Forward Home,Levi Myaz\n


In [41]:
# Read the raw lyrics text of one unmatched record (position 32).
reggae_wrong['lyrics'].iloc[32]

"Rock Steady Lyrics|Sometimes I feel like I'd sink like a stone|Some days I feel like a king in a castle|Other days I feel like a man with no home|You know, sometimes I feel like a man on a mission|Sometimes I feel like I'm lost in space|Some days I hold to the law by the letter|Other days I lean on mercy and grace|Welcome to my rollercoaster|These ups and downs don't bother me|You see, God is rock steady|Faithful ever ready|Through the good and the bad|Through the pleasure, the pain|God is rock steady|Faithful ever ready|This worlds keeps changing|He is the same, oh...|He is the same|Sometimes I feel I've got a faith to move mountains|The tough get going, well, you know the rest|Some days I feel like old doubtin' Tommy|Crashing and a-burning with every single test|Welcome to my rollercoaster|These ups and downs don't bother me|You see, God is rock steady|Faithful ever ready|Through the good and the bad|Through the pleasure, the pain|God is rock steady|Faithful ever ready|This worlds k