In [42]:
import os
import re
import json
import pandas as pd
from langdetect import detect

In [43]:
def detect_language(text):
    try:
        language = detect(text)
    except Exception as e:
        print(e)
        language = None
    print(f"Detected lang = {language}")

    return language

INSTRUMENTAL_COMMENT = "This song is an instrumental"

def load_lyric_dataset(input_path):

    rows = list()
    ids = list()

    lyric_files = [os.path.join(input_path, pos_json) for pos_json in os.listdir(input_path) if pos_json.endswith('.json')]

    for file_path in lyric_files:
        with open(file_path) as f:
            song_info = json.load(f)

        try:
            id = song_info['id']
            id = id.replace("ML", "")
            id = int(id)
        except:
            id = None
            print(f"For {file_path} there is no id")

        try:
            mood = song_info['mood']
        except:
            mood = None
            print(f"For {file_path} there is no mood")

        try:
            title = song_info['title']
        except:
            title = None
            print(f"For {file_path} there is no title")

        try:
            lyric = song_info['song']['lyrics']
            if lyric == '': 
                print(f"For {file_path} lyric is empty")
        except:
            lyric = None
            print(f"For {file_path} there is no lyrics")

        try:
            language = song_info['song']['language']
            if language == None: language = detect_language(lyric)
        except:
            print(f"For {file_path} there is no language info in dataset")
            language = detect_language(lyric)


        try:
            comment = song_info['song']['//coment']
            if comment == INSTRUMENTAL_COMMENT:
                instrumental = True
                print(f"For {file_path} is instrumental\n")
            else:
                instrumental = False
        except:
            instrumental = False

        row = (mood, title, lyric, language, instrumental)

        rows.append(row)
        ids.append(id)

    df = pd.DataFrame(rows, columns=['mood', 'title', 'lyric', 'language', 'instrumental'], index=ids)

    return df


def load_en_dataset(path):

    dataset = load_lyric_dataset(path) 

    dataset = dataset.loc[dataset['language'] == "en"]
    en_dataset = dataset.loc[dataset['instrumental'] == False]

    return en_dataset

def clean_lyric(lyric, title):

    #remove title and genius annotation
    lyric = re.sub(".+Lyrics.+\]", '',  lyric)

    #removing title (exception detected)
    lyric = re.sub(f"{title}.+Lyrics", '', lyric)

    #remove exery anotation like [Verse 1], [Chorus], [Bridge], [Part 1] etc.
    lyric = re.sub('\[.+\]', '', lyric)

    #remove every ********* in the lyric
    lyric = re.sub('\*.+\*', '', lyric)

    #remove Genius anotation "You might also like"
    lyric = re.sub('You might also like', '', lyric)

    #remove Embed exist in every lyric in the end
    if lyric[-5:] == 'Embed':
        lyric = re.sub('Embed', '', lyric)
        if lyric[-1:].isdigit():
            lyric = re.sub('\d', '', lyric)

    #remove punctuation
    lyric = re.sub('[^\w\s]', '', lyric)

    #split by lines
    temp_lines = lyric.split('\n')

    # Delete empty lines
    lines = [ln for ln in temp_lines if ln != '']

    return lyric, lines

In [10]:
input_path = os.path.join('..', '..', 'database', 'lyrics_cleaned')
en_dataset = load_en_dataset(input_path)

Detected lang = so
Detected lang = it
Detected lang = en
For ..\..\database\lyrics_cleaned\ML1159.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1159.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1159.json is instrumental

For ..\..\database\lyrics_cleaned\ML1230.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1230.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1230.json is instrumental

For ..\..\database\lyrics_cleaned\ML1336.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1336.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1336.json is instrumental

For ..\..\database\lyrics_cleaned\ML1349.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1349.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\

In [28]:
en_dataset.describe()

Unnamed: 0,mood,title,lyric,language,instrumental
count,1882,1882,1882,1882,1882
unique,4,1863,1882,1,1
top,angry,Fire,(\nIn the past there were arguments for and ag...,en,False
freq,490,3,1,1882,1882


In [29]:
en_dataset['lyric'].describe()

count                                                  1882
unique                                                 1882
top       (\nIn the past there were arguments for and ag...
freq                                                      1
Name: lyric, dtype: object

In [39]:
for index, row in en_dataset.iterrows():
    lyric = ''.join([str(x) for x in row['lyric']])
    lyric, lines = clean_lyric(lyric, row['title'])
    en_dataset.at[index, 'lyric'] = lines

In [40]:
en_dataset.head()

Unnamed: 0,mood,title,lyric,language,instrumental
1,happy,I Want Your Sex,[In the past there were arguments for and agai...,en,False
10,happy,Heart of Glass,"[Once I had a love and it was a gas, Soon turn...",en,False
100,happy,Crazy Little Thing Called Love,"[This thing called love, I just cant handle it...",en,False
1000,happy,Almost,"[I almost got drunk at school at , Where I alm...",en,False
1001,happy,Glow,"[I never thought that youd find out I did it, ...",en,False


In [41]:
en_dataset.at[1, 'lyric']

['In the past there were arguments for and against casual sex Then it became a question of morality These days it can be a question of life or death Its as simple as that And this song is not about casual sex',
 'Theres things that you guess',
 'And athings athat you know',
 'Theres boys you can trust',
 'And agirls that you dont',
 'Theres little things you hide',
 'And little things that you show',
 'Sometimes you think youre gonna get it',
 'But you dont and thats just the way it goes',
 'I swear I wont tease you',
 'Wont tell you no lies Yeah',
 'Dont need no Bible',
 'Just look in my eyes',
 'Ive awaited so long baby',
 'Now that were friends',
 'Every mans got his patience',
 'And heres where mine ends',
 'I want your sex',
 'I want your love',
 'I want your sex',
 'I want yoursex',
 'Its playin on my mind',
 'Its dancin on my soul',
 'Its taken so much time',
 'So why dont you just let me go',
 'Id really like to try',
 'Oh Id really love to know',
 'When you tell me youre gonna

In [45]:
test = en_dataset.at[1, 'lyric']
test

['In the past there were arguments for and against casual sex Then it became a question of morality These days it can be a question of life or death Its as simple as that And this song is not about casual sex',
 'Theres things that you guess',
 'And athings athat you know',
 'Theres boys you can trust',
 'And agirls that you dont',
 'Theres little things you hide',
 'And little things that you show',
 'Sometimes you think youre gonna get it',
 'But you dont and thats just the way it goes',
 'I swear I wont tease you',
 'Wont tell you no lies Yeah',
 'Dont need no Bible',
 'Just look in my eyes',
 'Ive awaited so long baby',
 'Now that were friends',
 'Every mans got his patience',
 'And heres where mine ends',
 'I want your sex',
 'I want your love',
 'I want your sex',
 'I want yoursex',
 'Its playin on my mind',
 'Its dancin on my soul',
 'Its taken so much time',
 'So why dont you just let me go',
 'Id really like to try',
 'Oh Id really love to know',
 'When you tell me youre gonna