In [74]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import random
import nltk
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
import pickle
from datetime import datetime

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Import the data

In [75]:
df = pd.read_csv('/content/drive/MyDrive/translated.csv')
df.head()

Unnamed: 0,Artist,Song,Translated_Lyrics
0,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՀԱ ԵՎ ՎԵՐՋ,Here is an end\n\r\nEnd of paragraphs and last...
1,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՄԵՆԸ ՁԵԶ,Large-scale and wonderful temples\n\r\nAnd the...
2,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՆԱՌԱԿ ՈՐԴՈՒ ՎԵՐԱԴԱՐՁԸ,When like a prodigal son\n\r\nBecome your own ...
3,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՇԽԱՐՀԻ ՑԱՎՈՎ,My heart with Kariy's world pain\n\r\nHow much...
4,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՌԱՋ ԳԻՏԵԻ,I was known before\n\r\nWhat people would be a...


### Preprocessing

**Check missing values**

In [76]:
missing_values = df['Translated_Lyrics'].isnull().sum()
df.dropna(inplace = True)

**Check duplicated values**

In [77]:
df[df["Translated_Lyrics"].duplicated()]

Unnamed: 0,Artist,Song,Translated_Lyrics
265,Silva Hakobyan,Տնից փախել եմ (Tynits pakhel em),I wore the green clothes\nI have literal to Ho...


In [78]:
df.drop(265, inplace =True)

**Remove some symbols**

In [79]:
df['Translated_Lyrics'] = df['Translated_Lyrics'].str.replace(r'\n|\r|\t', ' ', regex=True)

**Brackets**

Round brackets

In [80]:
text_in_round_brackets = sum(list(df['Translated_Lyrics'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(text_in_round_brackets)))

Number of round brackets: 0


Square brackets

In [81]:
text_in_square_brackets = sum(list(df['Translated_Lyrics'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
print('Number of square brackets: {}'.format(len(text_in_square_brackets)))

Number of square brackets: 0


Curly brackets

In [82]:
text_in_curly_brackets = sum(list(df['Translated_Lyrics'].map(lambda s: re.findall(r'\{(.*?)\}',s))), [])
print('Number of square brackets: {}'.format(len(text_in_curly_brackets)))

Number of square brackets: 0


**Tokenization**

A very common way to analyse text is to seperate it into a list of words which makes it much easier to do further analysis. I'm using nltk.tokenize to do that. Furthermore, all punctuations are removed as well. Below is an example how the text is converted.

In [83]:
song_df = df

In [84]:
song_df.rename(columns={'Artist': 'artist', 'Translated_Lyrics': 'text', 'Song': 'song' }, inplace=True)

In [85]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
song_df['tokens'] = song_df['text'].map(tokenizer.tokenize)

print('Text:')
print(song_df['text'].iloc[0])

print('Tokens:')
print(song_df['tokens'].iloc[0])

Text:
Here is an end   End of paragraphs and lasting   As a last closed page   And no return already   Oh native bugs   You my last Last Prayer   Suburbs of prison psyche   Chain and solo   Here. And the end   In the heart of a powerless cry   Oh unfinished my song   You my last capital   Now we believe   Where is the Spirit's Church?   We won the war   But it turns out that we lost   Here. And the end   Tell me a voice from the depth   Is the mask of the mask now speaks   Which puppet is the crowd?   The song is the arrow of freedom   And the spear of myth   Bel Joschin Hayk won   Or the opposite of probably   And the pages close the myth tired   Stone Statue History Forgotten   There is no start and finish for this eternal struggle   Your Type of Elimination is delivered to Bell   Here. And the end   Hayk lost to Bell   Here. And the end   I lost in a terrible battle   The song was hit by my unfinished   But why not end   Oh native bugs   You my last Last Prayer   Printed Hoku suburb

**Stemming**

Stemming is a simple method to convert words to a common base form. For example converting plural to singular or past tense to present tense. This helps to treat words with the same meaning in the same way.

In contrast to lemmatization it uses a very simple method and does not always find the grammatically correct from.

In [86]:
# initialise stemmer
stemmer = nltk.stem.porter.PorterStemmer()

In [87]:
# create dictionary to map tokens their stem
token_to_stem = {}
# initialise word count
token_count = 0
# iterate through all songs
for lst in song_df['tokens']:
    # iterate through all tokens of song
    for token in lst:
        token_count += 1
        # check if token is in dictionary
        if token not in token_to_stem:
            # add token to dictionary
            token_to_stem[token] = stemmer.stem(token)
            
song_df['stems'] = song_df['tokens'].map(lambda lst: [token_to_stem[token] for token in lst])

print('Number of tokens: {}'.format(token_count))
print('Number of unique tokens: {}'.format(len(token_to_stem.keys())))
print('Number of unique stems: {}'.format(len(set(token_to_stem.values()))))

Number of tokens: 89371
Number of unique tokens: 8837
Number of unique stems: 5833


In [88]:
song_df['n_stems'] = song_df['stems'].map(len)

This ratio typically varies from 0.2 to 0.8 among songs of the same artist. Nevertheless, there are some artists who repeat their words more often than others. For example Lilit Hovhannisyan and Aram MP3.

**Word frequency (TFIDF)**

We use TFIDF to analyse word frequencies in more detail. TFIDF means term frequency - inverse document frequency. Term frequency means how often a word appears in a specific text (in this case song lyrics). Inverse document frequency is the inverse frequency of the same word in the whole document (in this case all song lyrics).

The idea of TFIDF is to find out if a specific word appears unusually often in the text. If this is the case, this word gets a high value, if the word appears more often in other texts, its value will be low.

This method is very common to cluster texts with similar topics.

In [89]:
song_df

Unnamed: 0,artist,song,text,tokens,stems,n_stems
0,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՀԱ ԵՎ ՎԵՐՋ,Here is an end End of paragraphs and lasting...,"[Here, is, an, end, End, of, paragraphs, and, ...","[here, is, an, end, end, of, paragraph, and, l...",263
1,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՄԵՆԸ ՁԵԶ,Large-scale and wonderful temples And the pr...,"[Large, scale, and, wonderful, temples, And, t...","[larg, scale, and, wonder, templ, and, the, pr...",191
2,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՆԱՌԱԿ ՈՐԴՈՒ ՎԵՐԱԴԱՐՁԸ,When like a prodigal son Become your own cit...,"[When, like, a, prodigal, son, Become, your, o...","[when, like, a, prodig, son, becom, your, own,...",152
3,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՇԽԱՐՀԻ ՑԱՎՈՎ,My heart with Kariy's world pain How much su...,"[My, heart, with, Kariy, s, world, pain, How, ...","[my, heart, with, kariy, s, world, pain, how, ...",134
4,ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ,ԱՌԱՋ ԳԻՏԵԻ,I was known before What people would be a wo...,"[I, was, known, before, What, people, would, b...","[i, wa, known, befor, what, peopl, would, be, ...",149
...,...,...,...,...,...,...
537,Mi Qani Hogi,Chanapar,Several people WHAT WILL THIS WAY TO YOU? ...,"[Several, people, WHAT, WILL, THIS, WAY, TO, Y...","[sever, peopl, what, will, thi, way, to, you, ...",205
538,Mi Qani Hogi,Es Pahin,It seems to us seems to be That one is watchi...,"[It, seems, to, us, seems, to, be, That, one, ...","[it, seem, to, us, seem, to, be, that, one, is...",199
539,Mi Qani Hogi,Rhyme,Several people ICF: Did you find a wave o...,"[Several, people, ICF, Did, you, find, a, wave...","[sever, peopl, icf, did, you, find, a, wave, o...",185
541,Mi Qani Hogi,Yes Qo Nmanei,one Like: One one all ...,"[one, Like, One, one, all, One, one, holiday, ...","[one, like, one, one, all, one, one, holiday, ...",63


In [90]:
song_df['stems_str'] = song_df['stems'].map(lambda lst: ' '.join(lst))

In [91]:
# initialise count vectorizer
cv = CountVectorizer()

# generate word counts
stem_count_vector = cv.fit_transform(song_df['stems_str'])

# compute idf
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(stem_count_vector)

Next we calculate an TFIDF vector for each song. Every element of this vector represents one word. Its value is calculated by multiplying its frequency with the corresponding weight which was calculated above. This value is normalized by the total number of words of the song. Words which don't appear in the text get a value of 0. The TFIDF score is the sum of the TFIDF vector elements.

In [92]:
# assign tf idf scores to each song
tf_idf_vector = tfidf_transformer.transform(stem_count_vector)

# attach count vectors to dataframe
tf_idf_vector_lst = [-1] * len(song_df)
for i in range(len(song_df)):
    tf_idf_vector_lst[i] = tf_idf_vector[i]
song_df['tf_idf_vector'] = tf_idf_vector_lst    

song_df['tf_idf_score'] = song_df['tf_idf_vector'].map(lambda vec: np.sum(vec.todense()))

In [94]:
fig = px.box(song_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

The distributions are quite different.The artist using the most unusual words is HT Hayko.
Another way to quantify differences in word selections is to calculate the similarity of TFIDF vectors. This can be done by calculating the angle between two vectors. For example if v1 is (1, 0) and v2 is (0, 1) these vectors point in orthogonal directions and have an angle of 90 degree. The cosine of these vectors would be 0 (this is the lowest possible similarity we can get as all TFIDF vectors cannot have negative values). If we change v1 to (1, 1) the angle between the vectors would be smaller wich increases the cosine value and therewith their similarity. When two vectors are parallel their angle is 0 which means that the cosine would be 1 which is the maximum similarity.
We will apply this metric to TFIDF vectors to compare their similarity. 

First we will calculate an TFIDF vector for each artist which is calculated by taking the average of all TFIDF vectors of the artist's songs.

In [95]:
# caclculate mean vector
def get_mean_vector(vec_lst):
    return csr_matrix(vstack(vec_lst).mean(axis=0))

In [96]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song': len}).reset_index()\
                   .rename(columns={'song': 'songs'})

In [97]:
similarity_matrix = cosine_similarity(vstack(artist_df['tf_idf_vector']), 
                                      vstack(artist_df['tf_idf_vector']))
artist_names = artist_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))
fig.show()

This matrix visualises the similarity between TFIDF artist vectors. Again, a value of 1 (red) means that the vectors are identical which only appears when comparing vectors of the same artist. The lowest similarities are 0.4 between Louis Jordan and Vengaboys.

The matrix shows that ՍԱՅԱԹ-ՆՈՎԱ and ԿՈՄԻՏԱՍ seem to use very different words than any other artists. It is obvious. We can also see that some artists use very similar words like Arsen Safaryan and Հայկ Հակոբյան, ԱՐԹՈՒՐ ՄԵՍՉՅԱՆ and ՌՈՒԲԵՆ ՀԱԽՎԵՐԴՅԱՆ.

We want to analyse how different TFIDF vectors of songs from the same artist are. Therefore, we calculate the similarity of the artist's TFIDF vector with the TFIDF vector of each song. The problem hereby is that the artist vector was averaged over all song vectors including the one we want to compare against. To avoid any bias from that we calculate the artist vector over all song vectors except the one we are comparing it against.

For example if an artist has three songs: A, B, C. In order to compare how similar song A is with all songs of the artist we calculate the artist vector only from song B and C. To compare song B, the artist vector only consists of song A and C, and so on.

In [98]:
artist_song_df = pd.merge(artist_df[['artist', 'tf_idf_vector', 'songs']].assign(key = 0), 
                                 song_df[['artist', 'tf_idf_vector', 'song']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)
artist_song_df['same_artist'] = artist_song_df['artist_artist'] == artist_song_df['artist_song']

In [99]:
# calculate similarity of artist tf idf vector and song vector
def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
    # check if song is from same artist
    if same_artist:
        # deduct song vector from artist vector
        artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
    # calculate similarity
    return cosine_similarity(artist_vector, song_vector)[0][0]

In [100]:
artist_song_df['vector_similarity'] = \
    artist_song_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_artist'], 
                                                                     row['tf_idf_vector_song'], 
                                                                     row['songs'], row['same_artist']), axis=1)

In [101]:
df = artist_song_df

The distributions show the similarity of song vectors from the same artist with the artist's vector (blue) and the similarity of song vectors from other artists with the artist vector (red).

**Sentiment analysis**

The last features we engineer are polarity and subjectivity of songs. The TextBlob library has a function to get these values with respect to a given text.

In [102]:
polarity_lst = [-1] * len(song_df)
subjectivity_lst = [-1] * len(song_df)
for i, text in enumerate(song_df['text']):
    sentiment = TextBlob(text)
    polarity_lst[i] = sentiment.polarity
    subjectivity_lst[i] = sentiment.subjectivity
    
song_df['polarity'] = polarity_lst
song_df['subjectivity'] = subjectivity_lst

In [103]:
fig = px.scatter(song_df, x='polarity', y='subjectivity', color='artist', hover_data=['song'], 
                 title='Polarity and Subjectivity of Songs')
fig.show()

Every point of this plot represents one song. The x value is the polarity and the y value the subjectivity. Although there is a high variance in both features.

In [104]:
fig = px.box(song_df, x='artist', y='polarity', title='Polarity by artist')
fig.show()

The diagram above shows the polarity distribution of songs from the same artist.

## Logistic Regression

In this part we create a model match song lyrics to their artist. Therefore, we use all song features which were presented above. The artist features are calculated by averaging the features of all songs from this artist.

First, we match all songs with all aritsts. Then use logistic regression to estimate the probability that a song belongs to an artist for each song artist pair. After that we choose for each song the artist with the highest probability.

We use following variables to create the datasets for training and validating the model:

n_set: number of sets for training and validation
n_artist: number of artists per set
n_song_min: minimum number of songs an artist must have to be selected
n_song_artist_max: maximum number of song - artist pairs per artist set

The artsits are randomly assigned to sets. It is possible that the same artist is assigned to several sets, but it is not possible that there are two sets with identical artists.

In [105]:
# number of sets
n_set = {'train': 20, 'val': 20}
# number of artists per set
n_artist = 3
# minimum number of songs of one artist
n_song_min = 5
# maximum number of song - artist pairs per artist set
n_song_artist_max = 100

In [106]:
def select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max):
    song_count_df = song_df.groupby('artist')[['artist']].count().rename(columns={'artist': 'count'})
    artist_lst = list(song_count_df.loc[song_count_df['count'] >= n_song_min].index.values)

    n_set_total = sum(n_set.values())

    artist_set = []
    while len(artist_set) < n_set_total:
        new_artist = tuple(np.random.choice(artist_lst, size=n_artist, replace=False))
        if new_artist not in artist_set:
            artist_set.append(new_artist)

    # split artist sets
    artist_select = {}
    for field, n in n_set.items():
        i_select = np.random.choice(range(len(artist_set)), size=n, replace=False)
        artist_list = list(artist_set)
        artist_select[field] = [artist_list[i] for i in i_select]
        artist_set = [s for s in artist_set if s not in artist_select[field]]
    # create dataframe with all features
    feature_dict = {}
    # dictionary to map artist set id to list of artists
    set_id_to_artist_tp = {}

    i = 0
    for field, artist_set in artist_select.items():
        df_lst = []
        for artist_tp in artist_set:
            i += 1
            df = song_df.loc[song_df['artist'].isin(artist_tp), 
                             ['artist', 'song', 'n_stems', 'unique_stems_ratio', 'tf_idf_vector', 
                              'tf_idf_score', 'polarity']]
            # check if number of songs is too high
            if len(df) * n_artist > n_song_artist_max:
                df = df.sample(int(n_song_artist_max / n_artist), random_state=0)
            df['artist_set_id'] = i
            set_id_to_artist_tp[i] = artist_tp
            df_lst.append(df)
        feature_dict[field] = pd.concat(df_lst)  
        print('Number of songs in {}: {}'.format(field, len(feature_dict[field])))

    # get all selected artists
    artist_select_set = set.union(*[set(sum(tp_lst, ())) for tp_lst in artist_select.values()])

    # create artist dataframe from training data
    df_lst = []
    for artist, df in song_df.loc[song_df['artist'].isin(artist_select_set)].groupby('artist'):
        dic = {'artist': artist}
        # calculate averages and standard diviations
        for field in ['n_stems', 'unique_stems_ratio', 'tf_idf_score', 'polarity']:
            dic[field + '_mean'] = df[field].mean()
            dic[field + '_std'] = df[field].std()
        # number of songs
        dic['songs'] = len(df)

        # calculate average tf idf vector
        dic['tf_idf_vector_mean'] = get_mean_vector(df['tf_idf_vector'])

        df_lst.append(pd.DataFrame(dic, index=[0]))
    artist_feature_df = pd.concat(df_lst)

    def get_features(df):
        # get artist set id
        artist_set_id = df['artist_set_id'].iloc[0]
        
        # get all artists
        artist_feature_select_df = artist_feature_df.loc[artist_feature_df['artist']\
                                                         .isin(set_id_to_artist_tp[artist_set_id])]
        # merge dataframes
        artist_song_feature_df = pd.merge(artist_feature_select_df.assign(key=0), df.assign(key=0), on='key', 
                                          suffixes=['_artist', '_song']).drop('key', axis=1)    
        artist_song_feature_df['same_artist'] = \
            artist_song_feature_df['artist_artist'] == artist_song_feature_df['artist_song']

        # calculate features
        for feature in ['n_stems', 'unique_stems_ratio', 'tf_idf_score', 'polarity']:
            artist_song_feature_df[feature + '_diff'] = \
                artist_song_feature_df[feature] - artist_song_feature_df[feature + '_mean']
            artist_song_feature_df[feature + '_diff_std'] = \
                artist_song_feature_df[feature + '_diff'] / artist_song_feature_df[feature + '_std']

        # calculate vector similarity between artist and song
        artist_song_feature_df['vector_similarity'] = \
            artist_song_feature_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_mean'], 
                                                      row['tf_idf_vector'], row['songs'], row['same_artist']), 
                                         axis=1) 
        return artist_song_feature_df

    artist_song_feature = {}
    for field in feature_dict:
        artist_song_feature[field] = feature_dict[field].groupby('artist_set_id').apply(get_features)\
                                                        .reset_index(drop=True)
        
    return artist_song_feature

In [109]:
# number of unique stems
song_df['n_unique_stems'] = song_df['stems'].map(lambda lst: len(set(lst)))
# ratio of unique stems
song_df['unique_stems_ratio'] = song_df['n_unique_stems'] / song_df['n_stems']

In [110]:
np.random.seed(0)
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

Number of songs in train: 660
Number of songs in val: 660


The above function creates a dictionary with the fields specified in n_set with the number of defined artist sets. An artist set is a set of songs from n_artist number of randomly selected artists.

The values of this dictionary are dataframes with all artist sets, e.g. if n_set['train'] = 20 it contains 20 artist sets. Each row of the data set contains a pair of artist - song matches. Thereby, the song is from one of the randomly selected artists of this set.

In [111]:
artist_song_feature['train'].iloc[0]

artist_artist                                                           HT Hayko
n_stems_mean                                                             451.125
n_stems_std                                                           141.850567
unique_stems_ratio_mean                                                 0.391701
unique_stems_ratio_std                                                  0.045657
tf_idf_score_mean                                                       9.409263
tf_idf_score_std                                                        2.028583
polarity_mean                                                            0.02805
polarity_std                                                            0.107685
songs                                                                         16
tf_idf_vector_mean               (0, 4)\t0.002311604255428804\n  (0, 6)\t0.00...
artist_song                                                   Լիլիթ Հովհաննիսյան
song                        

Above are all columns with values of the first row of the dataframe. Every artist set has an id which is contained in artist_set_id. The column artist_artist contains the name of the artist from whom all artist related features were taken. aritst_song is the name of the artist whoes song was matched to the artist. In this case the artist features are from "Little Mix" and the song features are from the song "Secret Love" from "Little Mix". Thus, the prediction algorithm is expected to return a high probability that the artist and song match (the target variable is same_artist).

Following features are added to each artist and song: n_stems, unique_stems_ratio, stems_per_line, tf_idf_score, and polarity. The artist features contain the mean (_mean) and standard deviation (_std). The difference of the song and artist features (song feature - artist feature) have the suffix _diff. Additionally these features are divided by the standard deviation to get a normalised measure for the difference (_diff_std). The dataframe also contains the TFIDF vector of all artist songs and the matched song (tf_idf_vector_mean and tf_idf_vector), the similarity of the vectors contains the feature vector_similarity.

In [112]:
feature = ['n_stems_diff', 'n_stems_diff_std',
       'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std',
       'tf_idf_score_diff',
       'tf_idf_score_diff_std', 'polarity_diff', 'polarity_diff_std',
       'vector_similarity']
df_lst = []
for f in feature:
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
feature_df = pd.concat(df_lst)
feature_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,same_artist,feature,value
0,False,n_stems_diff,-258.125
1,False,n_stems_diff,-201.125
2,False,n_stems_diff,-113.125
3,True,n_stems_diff,33.875
4,False,n_stems_diff,-402.125


In [113]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [114]:
fig = violine_feature_plot(feature_df, ['n_stems_diff_std', 'unique_stems_ratio_diff_std',
                                        'tf_idf_score_diff_std', 'polarity_diff_std'])
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

The violine plot above shows the distribution of the normalised difference features n_stems, unique_stems_ratio, tf_idf_score, and polarity. There is one distribution created which only contains artist - song pairs of the same artist (blue) and one distribution for the case of different artists (red).

In [115]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_xaxes(range=[-1, 1])
fig.show()

This plot shows the distribution for the similarity of TFIDF vectors. Here we can see a difference in the distributions for songs from the same artist or different artists. So this seems to be the best feature. However, the distributions overlap a lot, therefore it will be difficult in general to distinguish songs from the same or different artist.

In [116]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, 
                                pipeline):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline

Above we created one function to convert the extracted feature values to a matrix and get a target vector for prediction. The other function combines the creation of the feature dataframe, convertion of the features to a matrix and training a machine learning pipeline.

In [117]:
#prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 100}
n_song_artist_max = 100

feature_org = ['n_stems', 'unique_stems_ratio', 'tf_idf_score', 'polarity', 'vector_similarity']
feature_abs = ['n_stems_diff', 'n_stems_diff_std', 'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std', 
               'tf_idf_score_diff', 'tf_idf_score_diff_std', 
               'polarity_diff', 'polarity_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline)

Number of songs in train: 3295


We selected all features which were introduced in the feature engineering part, created a pipeline which normalises all features and uses logistic regression to estimate the probabilities. In the definition of the classifier we use class weights: positive samples (song is from matched artist) are weithted with (n_artist - 1)/n_artist (n_artist is the number of artists per set) while negative samples are weithed with 1/n_artist. The higher the weight the stronger the impact of a wrong prediction on the loss function. For example in case of 5 artists predicting a true sample as false has a five times higher impact than predicting a false sample as true. This is done to prevent the classifier from predicting every sample as false.

In [118]:
feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

The plot shows the coefficient values of the logistic regression model. The higher the absolute value of the coefficient, the more important the feature. Thus, the features at the edge of the plot are the most important ones.

The more positive a coeffiecient the stronger the corresponding feature (all features were normalised and have positive values) causes a positive prediction. Thus, if the features on the left side of the plot have high values a negative prediction (different artists) is more likely, while the higher the features on the right the more likely a positive prediction (same artists) becomes.

The most negative coefficient is for tf_idf_score this means the more unusual the words of the song the more likely the song is from a different artist. Although this affects the probability, this feature only depends on the song and therefore does not affect the selection of artist - song pair.

On the other end of the scale is vector_similarity. This feature describes how similar the TF IDF vectors of the artist and the song are. The more similar the more likely it is that the artist and song match.

### Validation

In [119]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

The function above makes the matching prediction and validates the prediction accuracy. The variable top_n specifies how many top predictions are considered as correct. For example if there are four artists (A, B, C, D) matched to one song which belongs to artist C, the model orders the artist with repspect to the probability that they match to the respective song. The result could be B, C, A, D.

If top_n is set to one, the prediction is only considered as correct if the artist with the highest probability (in this case B) matches to the song. As this is not the case the prediction would be considered as wrong. If top_n is set to 2 or higher the prediction would be correct as C is the artist with the second highest probability. Hence, the higher top_n the more likely a prediction is considered as correct.

In [120]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.88


In [121]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.98


To validate the performance of the model we randomly create 100 training and 100 valdiation sets of 2, 4, 8 artists per set. The accuracy of the model is calculated for values of top_n from 1 until 4.

The graph above shows how the accrucy decreased with respect to the number of artists for different values of top_n.

In [122]:
n_artist_lst = [2, 4, 8]
top_n_lst = [1, 2, 4]
n_song_artist_max = 8
np.random.seed(2)

n_set = {'train': 100, 'val': 100}

feature_org = ['n_stems', 'unique_stems_ratio', 'tf_idf_score', 'polarity', 'vector_similarity']
feature_abs = ['n_stems_diff', 'n_stems_diff_std', 'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std', 
               'tf_idf_score_diff', 'tf_idf_score_diff_std', 
               'polarity_diff', 'polarity_diff_std']

pipeline = Pipeline([('scale', StandardScaler()), 
                     ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
                                                class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

result_lst = []
for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2023-05-29 14:33:25.518432
n_artist: 2
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.9

2023-05-29 14:33:30.377857
n_artist: 4
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.79
top_n: 2
Accuracy: 0.92

2023-05-29 14:33:36.336032
n_artist: 8
Number of songs in train: 100
Number of songs in val: 100
top_n: 1
Accuracy: 0.58
top_n: 2
Accuracy: 0.76
top_n: 4
Accuracy: 0.9



In [123]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'})\
        .update_traces(mode='lines+markers')
fig.show()

**Potential improvements**

I expect that the most likely way to improve the model's accuracy would be to add more an information about words per line, number of line. Also it will be better to collect more data. Probably the most important feature is the vector similarity which is based on the TFIDF vectors of each song lyric. However, a way to extract more compact information could be to do PCA (princile component analysis) and select the eigenvalues of the most important components.

Another possiblity to create more features would be to apply doc2vec, word2vec or glove which are neural networks based approach to convert a text to a vector.