In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import spacy
from langdetect import detect
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC


In [2]:
def detect_language(text):
    """Guess the language code of ``text`` using langdetect's ``detect``.

    Detection is best-effort: if the detector raises for any reason, the
    error is printed and the language is reported as ``None``. The outcome
    is always echoed to stdout.

    Returns:
        The detected language code, or ``None`` when detection failed.
    """
    detected = None
    try:
        detected = detect(text)
    except Exception as err:
        # Deliberately broad: a failed detection must not abort the load.
        print(err)

    print(f"Detected lang = {detected}")
    return detected

# Value of the song-level comment field that marks a track as instrumental.
INSTRUMENTAL_COMMENT = "This song is an instrumental"

def load_lyric_dataset(input_path):
    """Load every ``*.json`` song file found in ``input_path`` into a DataFrame.

    Loading is best-effort: a missing or malformed field is reported on
    stdout and stored as ``None`` instead of aborting the whole load.

    Args:
        input_path: Directory containing one JSON file per song.

    Returns:
        A DataFrame with the columns ``mood``, ``title``, ``lyric``,
        ``language`` and ``instrumental``. The index is the numeric part of
        each song's ``id`` field ("ML" removed), or ``None`` when the id is
        missing or cannot be parsed. Rows are ordered by file path.
    """
    # Errors that mean "this field is absent or the structure is not what
    # we expect". Anything else (e.g. KeyboardInterrupt) must propagate.
    missing_field = (LookupError, TypeError)

    rows = []
    ids = []

    # os.listdir() returns entries in arbitrary order; sort so that the row
    # order of the resulting frame is reproducible between runs and machines.
    lyric_files = sorted(
        os.path.join(input_path, file_name)
        for file_name in os.listdir(input_path)
        if file_name.endswith('.json')
    )

    for file_path in lyric_files:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, encoding='utf-8') as f:
            song_info = json.load(f)

        try:
            song_id = int(song_info['id'].replace("ML", ""))
        except (LookupError, TypeError, AttributeError, ValueError):
            song_id = None
            print(f"For {file_path} there is no id")

        try:
            mood = song_info['mood']
        except missing_field:
            mood = None
            print(f"For {file_path} there is no mood")

        try:
            title = song_info['title']
        except missing_field:
            title = None
            print(f"For {file_path} there is no title")

        try:
            lyric = song_info['song']['lyrics']
        except missing_field:
            lyric = None
            print(f"For {file_path} there is no lyrics")
        else:
            if lyric == '':
                print(f"For {file_path} lyric is empty")

        try:
            language = song_info['song']['language']
        except missing_field:
            language = None
            print(f"For {file_path} there is no language info in dataset")
        if language is None:
            # Fall back to automatic detection when the file gives no language.
            language = detect_language(lyric)

        try:
            # NOTE(review): the key is spelled '//coment' in the original
            # code; presumably it matches the data files - confirm.
            comment = song_info['song']['//coment']
        except missing_field:
            comment = None
        instrumental = comment == INSTRUMENTAL_COMMENT
        if instrumental:
            print(f"For {file_path} is instrumental\n")

        rows.append((mood, title, lyric, language, instrumental))
        ids.append(song_id)

    df = pd.DataFrame(rows, columns=['mood', 'title', 'lyric', 'language', 'instrumental'], index=ids)

    return df

In [3]:
def load_en_dataset(path):
    """Load the lyric dataset and keep only English, non-instrumental songs.

    Args:
        path: Directory passed straight to ``load_lyric_dataset``.

    Returns:
        An independent DataFrame (explicit copy) holding the rows whose
        ``language`` is ``"en"`` and whose ``instrumental`` flag is False.
        Returning a copy lets callers modify the result in place without
        pandas warning about writing to a slice of the full dataset.
    """
    dataset = load_lyric_dataset(path)

    is_english = dataset['language'] == "en"
    # Element-wise comparison on purpose; `not series` would be ambiguous.
    has_vocals = dataset['instrumental'] == False  # noqa: E712

    return dataset.loc[is_english & has_vocals].copy()

In [4]:
# Directory with the cleaned lyric JSON files, relative to the working directory.
input_path = os.path.join('..', '..', 'database', 'lyrics_cleaned')

# Load the songs and keep only the English, non-instrumental ones.
en_dataset = load_en_dataset(input_path)

Detected lang = so
Detected lang = it
Detected lang = en
For ..\..\database\lyrics_cleaned\ML1159.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1159.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1159.json is instrumental

For ..\..\database\lyrics_cleaned\ML1230.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1230.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1230.json is instrumental

For ..\..\database\lyrics_cleaned\ML1336.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1336.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1336.json is instrumental

For ..\..\database\lyrics_cleaned\ML1349.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1349.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\

In [5]:
# Per-column summary (count / unique / top / freq) of the English subset.
en_dataset.describe()

Unnamed: 0,mood,title,lyric,language,instrumental
count,1882,1882,1882,1882,1882
unique,4,1863,1882,1,1
top,angry,Fire,I Want Your Sex Lyrics[From a PSA recorded for...,en,False
freq,490,3,1,1882,1882


In [6]:
# Duplicated English lyrics: rows whose lyric text repeats an earlier row's lyric.
df2 = en_dataset[en_dataset['lyric'].duplicated()]
df2

Unnamed: 0,mood,title,lyric,language,instrumental


There are no duplicated rows - good news!

In [7]:
def clean_lyric(lyric, title):
    """Strip Genius page boilerplate from a scraped lyric.

    Removes, in order: the "<title> Lyrics[...]" header, a header without a
    bracketed annotation, bracketed section markers such as [Verse 1] or
    [Chorus], text enclosed in asterisks, the "You might also like" insert,
    and the trailing "<digits>Embed" marker.

    Args:
        lyric: Raw lyric text.
        title: Song title, matched literally; ``None`` skips the
            title-based header removal.

    Returns:
        The cleaned lyric. Line breaks are preserved.
    """
    # Remove title and Genius annotation, e.g. "Some Title Lyrics[From ...]".
    lyric = re.sub(r".+Lyrics.+\]", '', lyric)

    # Remove a "<title> ... Lyrics" header that has no bracketed annotation.
    # The title is escaped so characters such as ?, (, ) or * are matched
    # literally instead of being read as regex syntax (which either fails
    # to match or raises re.error for e.g. an unbalanced parenthesis).
    if title is not None:
        lyric = re.sub(re.escape(str(title)) + r'.+Lyrics', '', lyric)

    # Remove every annotation like [Verse 1], [Chorus], [Bridge], [Part 1] etc.
    lyric = re.sub(r'\[.+\]', '', lyric)

    # Remove every ********* in the lyric.
    lyric = re.sub(r'\*.+\*', '', lyric)

    # Remove the Genius insert "You might also like".
    lyric = lyric.replace('You might also like', '')

    # Remove the "Embed" marker at the very end, plus the digits directly in
    # front of it. Anchored to the end of the text so that digits and the
    # word "Embed" elsewhere in the lyric are left alone.
    lyric = re.sub(r'\d*Embed\Z', '', lyric)

    return lyric

In [8]:
# Clean the lyric of every song in the dataset.
#
# The cleaned column is built in one positional pass and bound to a new
# frame. Writing row by row with iterrows()/.at addresses rows by index
# label, which mis-targets rows when labels repeat (ids can be None) and
# writes into a frame that was filtered out of the full dataset.
en_dataset = en_dataset.assign(
    lyric=[
        clean_lyric(lyric, title)
        for lyric, title in zip(en_dataset['lyric'], en_dataset['title'])
    ]
)


In [9]:


# Integer class label used for each mood string.
emotion_labels_dict = {'happy': 0, 'sad': 1, 'relaxed': 2, 'angry': 3}

# Load the spaCy pipeline used to vectorise the lyrics.
nlp = spacy.load('en_core_web_lg')

# The language model we are using has some issues with stop words.
# Basically we need to grab stopwords from the 'en' language model
# and add them back to the model we are using.
# https://github.com/explosion/spaCy/issues/922
# NOTE(review): this workaround targets an old spaCy issue; confirm it is
# still needed (and still supported) by the installed spaCy version.
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

def remove_stopwords(doc):
    """Return a new Doc built from the texts of ``doc``'s non-stop-word tokens.

    The new Doc shares the vocabulary of the module-level ``nlp`` pipeline.
    """
    kept_words = [token.text for token in doc if not token.is_stop]
    return spacy.tokens.Doc(nlp.vocab, words=kept_words)


def load_dataset(df):
    """Vectorise every lyric of ``df`` and pair it with its mood label.

    Each lyric is run through the ``nlp`` pipeline, stripped of stop words,
    and represented by the resulting document vector. Rows whose vector does
    not have 300 components are skipped.

    Args:
        df: DataFrame with a ``mood`` column (keys of ``emotion_labels_dict``)
            and a ``lyric`` column.

    Returns:
        A DataFrame with the columns ``Mood`` (integer label), ``Vector``
        and ``Vector_Norm``.
    """
    records = []

    for mood_name, lyric in zip(df['mood'], df['lyric']):
        mood = emotion_labels_dict[mood_name]

        doc = remove_stopwords(nlp(lyric))
        vector = doc.vector

        # Keep only documents that produced a full-size vector.
        if len(vector) == 300:
            records.append((mood, vector, doc.vector_norm))

    return pd.DataFrame(records, columns=['Mood', 'Vector', 'Vector_Norm'])

In [10]:
# Build the (mood label, document vector, vector norm) table from the English songs.
dataset = load_dataset(en_dataset)

In [11]:
# Stack the per-song vectors (stored as one object per row) into a 2-D
# feature matrix, and pull out the integer mood labels as the target.
vector_column = dataset['Vector'].to_numpy()
X_vect = np.array([np.array(vector) for vector in vector_column])

y = dataset['Mood'].to_numpy()

In [12]:
# Baseline model: linear SVM with a small C (strong regularisation),
# evaluated with k-fold cross-validation. SVC and cross_val_score are
# imported in the notebook's import cell.
clf = SVC(kernel='linear', C=0.01)

# Number of cross-validation folds; reused by the later evaluation cells.
k = 10
scores = cross_val_score(clf, X_vect, y, cv=k)
print(f"Accuracy for k={k}: {round(scores.mean(), 2)} (+/- {round((scores.std() * 1.96), 2)})")

Accuracy for k=10: 0.56 (+/- 0.07)


In [13]:
# Define the set of parameters we want to test on
params = [
    { 'kernel': ['linear'], 'C': [ 0.01, 0.05, 1, 10, 100 ]},
    { 'kernel': ['rbf', 'sigmoid'], 'C': [ 0.01, 0.05, 0.1, 0.3, 0.8, 1, 3, 10, 50, 100, 150, 200 ] }
]

# Exhaustive search over the grid. The fold count is the same `k` used for
# the baseline so that the scores are comparable. GridSearchCV and SVC are
# imported in the notebook's import cell.
gs = GridSearchCV(SVC(), params, cv=k, n_jobs=-1, verbose=False)
gs.fit(X_vect, y)

svm_best = gs.best_estimator_
best_params = gs.best_params_
print('Best parameters:', best_params)

Best parameters: {'C': 0.01, 'kernel': 'linear'}


In [14]:
# Re-score the best estimator found by the grid search. The fold count is
# taken from `k` so the printed "k=" always matches what was actually run.
scores = cross_val_score(svm_best, X_vect, y, cv=k)
print(f"Accuracy for k={k}: {round(scores.mean(), 2)} (+/- {round((scores.std() * 1.96), 2)})")

Accuracy for k=10: 0.56 (+/- 0.07)


Considered features:
according to: https://github.com/sgiammy/emotion-patterns-in-music-playlists

<ul>
    <li>**Title_vector**</li>
    <li>**Lyric_vector**</li>
    <li>**%Rhymes**:<br> defined as the percentage of the number of rhymes over the total number of lines. A rhyme is counted when two consecutive lines rhyme with each other.</li>
    <li>**%Past_tense_verbs**:<br> defined as the percentage of the number of past tense verbs over the total number of verbs.</li>
    <li>**%Present_tense_verbs**:<br>  defined as the percentage of the number of present tense verbs over the total number of verbs.</li>
    <li>**%Future_tense_verbs**:<br>  defined as the percentage of the number of future tense verbs over the total number of verbs, where future is just will + base form.</li>
    <li>**%ADJ**:<br> Percentage of adjectives over the total number of words.</li>
    <li>**%ADP**:<br> Percentage of adpositions (e.g. in, to, during) over the total number of words.</li>
    <li>**%ADV**:<br> Percentage of adverbs (e.g. very, tomorrow, down, where, there) over the total number of words.</li>
    <li>**%AUX**:<br> Percentage of auxiliaries (e.g. is, has (done), will (do), should (do)) over the total number of words.</li>
    <li>**%INTJ**:<br> Percentage of interjections (e.g. psst, ouch, bravo, hello) over the total number of words.</li>
    <li>**%NOUN**:<br> Percentage of nouns over the total number of words.</li>
    <li>**%NUM**:<br> Percentage of numerals over the total number of words.</li>
    <li>**%PRON**:<br> Percentage of pronouns (e.g. I, you, he, she, myself, themselves, somebody,...) over the total number of words.</li> 
    <li>**%PROPN**:<br> Percentage of proper nouns (e.g. Mary, John) over the total number of words.</li>
    <li>**%PUNCT**:<br> Percentage of punctuation (e.g. ., (, ), ?) over the total number of words.</li>
    <li>**%VERB**:<br> Percentage of verbs over the total number of words.</li>
    <li>**Selfish_degree**:<br> Percentage of 'I' pronouns over the total number of pronouns</li>
    <li>**%Echoism**:<br> Percentage of echoism over the total number of words, where an echoism is either a sequence of two subsequent repeated words or the repetition of a vowel in a word. </li>
    <li>**%Duplicates**:<br> Percentage of duplicate words over the total number of words</li>
    <li>**isTitleInLyric**:<br> Boolean, true if the title string is also a substring of the lyric</li>
    <li>**sentiment**:<br> Sentiment between -1 and 1</li>
    <li>**subjectivity degree**:<br> Degree of subjectivity of the text</li>
</ul>

In [15]:
# Reload the English, non-instrumental songs and clean their lyrics, so the
# feature-engineering section starts from a freshly prepared frame.
input_path = os.path.join('..', '..', 'database', 'lyrics_cleaned')

en_dataset = load_en_dataset(input_path)

# Build the cleaned column in one positional pass and bind it to a new
# frame. Writing row by row with iterrows()/.at addresses rows by index
# label, which mis-targets rows when labels repeat (ids can be None).
en_dataset = en_dataset.assign(
    lyric=[
        clean_lyric(lyric, title)
        for lyric, title in zip(en_dataset['lyric'], en_dataset['title'])
    ]
)
    

Detected lang = so
Detected lang = it
Detected lang = en
For ..\..\database\lyrics_cleaned\ML1159.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1159.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1159.json is instrumental

For ..\..\database\lyrics_cleaned\ML1230.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1230.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1230.json is instrumental

For ..\..\database\lyrics_cleaned\ML1336.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1336.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\database\lyrics_cleaned\ML1336.json is instrumental

For ..\..\database\lyrics_cleaned\ML1349.json lyric is empty
For ..\..\database\lyrics_cleaned\ML1349.json there is no language info in dataset
No features in text.
Detected lang = None
For ..\..\