# Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [2]:
def read(feature, h=None):
    """Load the tab-separated table for one feature from the ``data`` folder.

    Parameters
    ----------
    feature : str
        Middle part of the file name; the file that gets read is
        ``data/id_<feature>_mmsr.tsv`` (relative to the working directory).
    h : int or None, optional
        Passed to ``pandas.read_csv`` as ``header``. With the default
        ``None`` every line is treated as data; ``0`` takes the column
        names from the first line.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    tsv_path = f'data/id_{feature}_mmsr.tsv'
    return pd.read_csv(tsv_path, sep='\t', header=h)

def embed_and_merge(df1, df2, col_name):
    """Pack the feature columns of ``df2`` into one vector column and left-join it onto ``df1``.

    Every column of ``df2`` except ``'id'`` is treated as one dimension of a
    per-row vector. The vectors are stored as 1-D float ``numpy`` arrays in a
    single column named ``col_name``, and that column is merged onto ``df1``
    by ``'id'``.

    ``df2`` is not modified, so calling this function again with the same
    frame (e.g. when a notebook cell is re-run) gives the same result.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame; must have an ``'id'`` column. All of its rows are kept.
    df2 : pandas.DataFrame
        Frame with an ``'id'`` column plus numeric feature columns.
    col_name : str
        Name of the new column that holds the packed vectors.

    Returns
    -------
    pandas.DataFrame
        ``df1`` with ``col_name`` appended. Rows whose ``'id'`` has no match
        in ``df2`` get a missing value in that column.

    Raises
    ------
    KeyError
        If ``'id'`` is missing from either frame.
    ValueError
        If a feature column cannot be converted to float.
    """
    embedding = df2.columns.difference(['id'], sort=False)

    # copy=True: the row arrays below must not be views into df2's own data.
    matrix = df2[embedding].to_numpy(dtype=float, copy=True)

    # Fill an object array element by element so that each cell holds one
    # 1-D array instead of numpy expanding the rows into a 2-D block.
    vectors = np.empty(len(matrix), dtype=object)
    for row_number, row in enumerate(matrix):
        vectors[row_number] = row

    # Build the right-hand side as a new frame; the caller's df2 stays intact.
    packed = df2[['id']].copy()
    packed[col_name] = vectors

    return pd.merge(df1, packed, on='id', how='left')

In [3]:
# base table that the feature columns get merged onto; first line of the file is the header
df = read('information', h=0)

In [4]:
# lyrics BERT features: load the table, then attach one array per id as 'bert_embedding'
bert = read('lyrics_bert', h=0)
df = embed_and_merge(df1=df, df2=bert, col_name='bert_embedding')

In [5]:
# lyrics word2vec features: load the table, then attach one array per id as 'word2vec_embedding'
word2vec = read('lyrics_word2vec', h=0)
df = embed_and_merge(df1=df, df2=word2vec, col_name='word2vec_embedding')

In [6]:
# lyrics tf-idf term weights: load the table, then attach one array per id as 'tf-idf'
# (the hyphen means this column is only reachable as df['tf-idf'], not as an attribute)
tfidf_weighting = read('lyrics_tf-idf', h=0)
df = embed_and_merge(df1=df, df2=tfidf_weighting, col_name='tf-idf')

In [9]:
# Print a summary of the merged frame (column names, non-null counts, dtypes),
# then leave `df` as the last expression so the notebook renders it below.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10095 entries, 0 to 10094
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  10095 non-null  object
 1   artist              10095 non-null  object
 2   song                10095 non-null  object
 3   album_name          10095 non-null  object
 4   bert_embedding      10095 non-null  object
 5   word2vec_embedding  10095 non-null  object
 6   tf-idf              10095 non-null  object
dtypes: object(7)
memory usage: 552.2+ KB


Unnamed: 0,id,artist,song,album_name,bert_embedding,word2vec_embedding,tf-idf
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"[0.0302475523203611, 0.0352500043809413, 0.010...","[0.0193592727054678, 0.0232394714425702, 0.028...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ..."
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"[0.0084422621876001, 0.0302564185112714, 0.009...","[0.018537292381979, 0.0113115924403394, 0.0107...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"[0.0490818135440349, 0.0148476688191294, 0.001...","[0.0227837218553759, 0.0231641749730655, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"[0.0445394963026046, 0.0214906893670558, 0.013...","[0.0381116103401342, 0.0278804157207017, 0.016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"[0.0514551289379596, 0.0297695714980363, -0.01...","[0.0182936789026777, -0.0064870788035669, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ..."
...,...,...,...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men,"[0.006713552866131, 0.0480893477797508, -0.001...","[0.0195101330379449, 0.0236336907562543, 0.011...","[0.0, 0.0, 0.079623055470056, 0.0, 0.0, 0.0, 0..."
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas,"[0.0098905526101589, 0.0401467233896255, -0.02...","[0.0268563718791583, 0.0082648759004199, 0.011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk,"[0.0101165119558572, 0.0388841480016708, -0.01...","[0.0051499218912795, 0.0028818239457905, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue,"[-0.0166116580367088, 0.0266939438879489, -0.0...","[0.0370260450523346, 0.0159991827379498, -0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
