In [3]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from utils import read_lastfm
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.dummy import DummyClassifier

# Song2vec loading

In [4]:
def build_vocab(model):
    emb_vectors = {}
    for n in model.wv.index_to_key:
        emb_vectors[n] = model.wv[n]
    return emb_vectors

def load_model(filename):
    model = Word2Vec.load(filename)
    emb_vectors = build_vocab(model)
    return emb_vectors, model

In [71]:
emb_vectors, model = load_model("data/word2vec/word2vec.model")
s2v_df = pd.DataFrame(data=emb_vectors.values(), index=emb_vectors.keys())
s2v_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
4616724870,0.050536,0.700072,1.502735,0.799808,1.421927,-4.074349,-0.018887,1.857755,-2.008438,-0.825392,...,1.975366,1.545878,-1.979463,-0.711350,1.658343,0.450126,-1.190073,-2.496233,-0.692815,0.132445
670684783,-0.885090,3.123382,0.424417,1.275812,1.166315,-1.786570,-1.911305,2.292119,-1.925104,1.388782,...,1.104928,-0.950362,-1.193954,1.250412,1.085047,2.192419,0.481333,-1.549847,-1.455523,2.833608
3583077772,0.813355,1.055248,-0.999099,2.750681,-0.311156,-0.440768,1.241517,1.853461,-3.593800,-0.130656,...,2.394939,0.184182,-2.865525,-2.697645,0.648006,1.149723,0.454278,-2.541497,-1.710745,-0.740748
1166923991,0.159427,2.187680,2.613667,-0.696208,3.051698,-1.320268,-2.026290,1.033977,-0.464190,0.272858,...,-0.062268,-0.733349,-3.590141,-1.995496,2.511527,0.161250,-0.368792,-2.641105,2.493495,0.988131
3090101702,-0.521306,2.235044,-0.104581,2.320469,1.239244,-0.035983,1.323957,1.696940,-0.670083,1.638341,...,2.428933,0.192384,-3.330963,-0.695708,-1.183517,0.338650,-1.489349,-1.831049,-1.780782,1.766505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132745488902,0.030249,0.102062,0.061410,-0.002107,0.153210,-0.021721,-0.082210,0.031395,-0.086457,0.070993,...,0.164100,-0.024325,-0.194986,-0.007661,-0.025965,-0.034655,-0.043044,-0.052290,-0.073300,-0.022071
1386951220,0.021542,0.099487,0.068809,-0.015410,0.146718,-0.011853,-0.072963,0.009805,-0.091299,0.081219,...,0.163627,-0.011587,-0.203534,-0.016010,-0.040406,-0.030845,-0.038733,-0.049052,-0.071852,-0.017132
703180912,-0.012214,0.004268,-0.010293,0.000748,-0.008205,-0.005735,0.002840,-0.019010,0.006980,0.020210,...,0.017696,-0.007379,-0.013066,0.007165,-0.010918,-0.006580,-0.003090,-0.000482,-0.012478,0.008878
40207629099,0.001758,0.083488,0.045756,-0.010171,0.121356,-0.039251,-0.067801,0.029198,-0.090288,0.050345,...,0.123563,-0.004082,-0.143560,-0.032883,-0.030630,-0.013487,-0.015993,-0.041197,-0.037022,-0.017184


# TF-IDF initialization

In [209]:
songs, users = read_lastfm(zip_name="data/lastfm-dataset-1K.zip")

In [210]:
songs["song_id"]= songs.artist_name.cat.codes.astype("int64") * songs.track_name.nunique() \
                            + songs.track_name.cat.codes

In [6]:
corpus_df = songs.sort_values(["user_id", "timestamp"]).groupby("user_id")\
                .agg(sequence=("song_id", list))

In [7]:
corpus = corpus_df.values[:,0]

In [8]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(corpus)

In [9]:
X.shape

(992, 1498727)

In [10]:
def get_song_vector(song_id, X, vectorizer):
    idx = vectorizer.vocabulary_[song_id]
    return X[:, idx].toarray()[:,0]

In [11]:
get_song_vector(3583077562, X, vectorizer)[:10]

array([0.01709776, 0.        , 0.        , 0.02194015, 0.        ,
       0.00360557, 0.        , 0.        , 0.        , 0.        ])

In [12]:
song_ids = list(vectorizer.vocabulary_.keys())
indices = list(vectorizer.vocabulary_.values())

In [13]:
song_ids = [x for _,x in sorted(zip(indices,song_ids))]

In [14]:
tfidf_df = pd.DataFrame(data=X.T.todense(), index=song_ids)

# Classification Tasks

* Predict if two songs appear in the same context
* Predict if two songs are from the same artist
* Predict the tag of a song

In [45]:
SEED = 42

In [46]:
sorted_songs = songs.sort_values(["user_id", "timestamp"])

In [47]:
def compute_metrics(y_test, y_pred, y_proba):
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    return acc, f1, auc

In [48]:
def predict_RF(X_train, X_test, y_train, y_test, 
               param_grid = {'n_estimators': [2, 50 ,100, 150],
                      'criterion': ['gini', 'entropy']}):    
    model = RandomForestClassifier(random_state=SEED)
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    print("Starting Grid search")
    grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', verbose=4)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print(f"Best parameters are: {grid.best_params_}")
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:,1]
    return compute_metrics(y_test, y_pred, y_proba)

In [49]:
def predict_random(X_train, y_train, X_test, y_test):
    random = DummyClassifier(strategy='uniform', random_state=SEED)
    random.fit(X_train, y_train)
    y_pred = random.predict(X_test)
    y_proba = random.predict_proba(X_test)[:, 1]
    return compute_metrics(y_test, y_pred, y_proba)

def predict_majority(X_train, y_train, X_test, y_test):
    majority = DummyClassifier(strategy='most_frequent', random_state=SEED)
    majority.fit(X_train, y_train)
    y_pred = majority.predict(X_test)
    y_proba = majority.predict_proba(X_test)[:, 1]
    return compute_metrics(y_test, y_pred, y_proba)

## Same context classification

In [78]:
def pick_song_in_same_context(songs, half_n):
    """Simply chooses random songs pair them with the previous or following one"""
    np.random.seed(SEED)
    idx1 = np.random.randint(1, len(songs) - 1, half_n) #Exclude first and last song
    idx2 = idx1.copy() #Offset the first index by -1 or 1 
    idx2[::2] += 1
    idx2[1::2] -= 1
    songs1 = songs.iloc[idx1].song_id.values
    songs2 = songs.iloc[idx2].song_id.values
    X = np.c_[songs1, songs2]
    y = np.ones(half_n)
    return X, y
    
def pick_songs_in_diff_context(songs, half_n):
    songs1 = songs.sample(half_n, random_state=SEED).song_id.values
    songs2 = songs.sample(half_n, random_state=SEED + 1).song_id.values
    X = np.c_[songs1, songs2]
    y = np.zeros(half_n)
    return X, y

def create_context_dataset(sorted_songs, n=20000):
    """Create a dataset of song pairs that either appeared in the same context or not""" 
    X_pos, y_pos = pick_song_in_same_context(sorted_songs, n//2)
    X_neg, y_neg = pick_songs_in_diff_context(sorted_songs, n//2)
    old_X = np.r_[X_pos, X_neg]
    old_y = np.r_[y_pos, y_neg]
    dataset = list(zip(old_X, old_y))
    np.random.seed(SEED)
    np.random.shuffle(dataset)
    X, y = zip(*dataset)
    return pd.DataFrame(X, columns=["song1", "song2"]), np.array(y)

In [115]:
song_pairs, labels = create_context_dataset(sorted_songs, n= 10000)

In [116]:
#Only keep song with s2v embeddings
song_pairs = song_pairs.copy()
song_pairs['labels'] = labels
song_pairs = song_pairs[song_pairs.song1.isin(s2v_df.index) & song_pairs.song2.isin(s2v_df.index)]
labels = song_pairs.labels
song_pairs.drop('labels', axis=1, inplace=True)

In [99]:
def create_vectors_pairs(vectors, song_pairs):
    vector_pairs = vectors.merge(song_pairs, right_on="song1", left_index=True)\
            .merge(vectors, left_on="song2", right_index=True)\
            .drop(['song1', 'song2'], axis=1)

    print(len(vector_pairs))
    print(len(song_pairs))
    assert len(vector_pairs) == len(song_pairs)
    assert vector_pairs.shape[1] == vectors.shape[1] * 2
    return vector_pairs

### TF-IDF

In [118]:
tfidf_pairs = create_vectors_pairs(tfidf_df, song_pairs)

In [119]:
tfidf_pairs.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,982_y,983_y,984_y,985_y,986_y,987_y,988_y,989_y,990_y,991_y
1382,0.0,0.0,0.0,0.009143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003974,0.0,0.0,0.002027,0.0,0.0,0.0,0.0,0.0,0.0
6851,0.0,0.0,0.0,0.005735,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3106,0.0,0.000804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9648,0.0,0.00079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
X_train, X_test, y_train, y_test = \
                train_test_split(tfidf_pairs, labels, train_size=0.8, random_state=SEED)
# acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

In [121]:
print("TF-IDF:", acc, f1, auc)
print("Random:", predict_random(X_train, y_train, X_test, y_test))
print("Majority:",predict_majority(X_train, y_train, X_test, y_test))

TF-IDF: 0.52 0.07692307692307693 0.40077110389610393
Random: (0.49616971125515613, 0.48955223880597015, 0.5)
Majority: (0.4926340601060695, 0.6600868535333597, 0.5)


### song2vec

In [122]:
s2v_pairs = create_vectors_pairs(s2v_df, song_pairs)

In [123]:
s2v_pairs.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
2416,0.050536,0.700072,1.502735,0.799808,1.421927,-4.074349,-0.018887,1.857755,-2.008438,-0.825392,...,2.049515,1.042242,-2.304445,-0.64561,1.034146,0.522461,-0.801188,-2.008663,-0.396661,0.024584
3249,-0.932494,1.68211,-0.600307,1.201052,1.746724,0.416302,-0.009236,0.224624,0.71416,1.807721,...,2.049515,1.042242,-2.304445,-0.64561,1.034146,0.522461,-0.801188,-2.008663,-0.396661,0.024584
2532,0.050536,0.700072,1.502735,0.799808,1.421927,-4.074349,-0.018887,1.857755,-2.008438,-0.825392,...,1.072537,-0.068162,-1.121787,-0.130818,-0.168337,-0.234738,-0.270714,-0.263553,-0.45397,-0.143811
1651,-0.88509,3.123382,0.424417,1.275812,1.166315,-1.78657,-1.911305,2.292119,-1.925104,1.388782,...,0.514146,-0.03214,-0.602289,-0.052574,-0.092585,-0.073549,-0.129805,-0.187154,-0.227991,-0.06
4873,-0.88509,3.123382,0.424417,1.275812,1.166315,-1.78657,-1.911305,2.292119,-1.925104,1.388782,...,1.174746,0.092594,-3.189829,0.784431,-0.239225,1.520638,0.963496,-1.337876,-1.011305,0.964345


In [124]:
X_train, X_test, y_train, y_test = \
                train_test_split(s2v_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   4.1s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   4.0s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   3.9s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   3.9s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   3.7s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=   7.3s
[CV 2/5] END ...............

In [125]:
print("Song2Vec:", acc, f1, auc)

Song2Vec: 0.5085444902769594 0.5250569476082005 0.49960822232966007


## Same artist classification

In [211]:
discography = songs[["artist_name", "track_name", "song_id"]].drop_duplicates()

In [127]:
def pick_song_from_same_artist(discography, half_n):
    """Simply chooses random songs pair them with the previous or following one"""
    artist_with_multiple_songs = discography.groupby("artist_name")\
                                .agg(count=("track_name", "count")).reset_index()
    discography = discography.merge(artist_with_multiple_songs)
    song_pairs = discography.groupby("artist_name").agg(song1=("song_id", "first"), 
                                                        song2=("song_id", "last"))
    X = song_pairs.sample(half_n, random_state=SEED).values
    y = np.ones(half_n)
    return X, y
    
def pick_song_from_diff_artist(discography, half_n):
    discography = discography.drop_duplicates("artist_name")
    songs = discography.sample(2*half_n, random_state=SEED).song_id.values
    songs1 = songs[:half_n]
    songs2 = songs[half_n:]
    X = np.c_[songs1, songs2]
    y = np.zeros(half_n)
    return X, y

def create_artist_dataset(discography, n=20000):
    """Create a dataset of song pairs that either appeared in the same context or not""" 
    discography = discography[discography.song_id.isin(s2v_df.index)]
    X_pos, y_pos = pick_song_in_same_context(discography, n//2)
    X_neg, y_neg = pick_songs_in_diff_context(discography, n//2)
    old_X = np.r_[X_pos, X_neg]
    old_y = np.r_[y_pos, y_neg]
    dataset = list(zip(old_X, old_y))
    np.random.seed(SEED)
    np.random.shuffle(dataset)
    X, y = zip(*dataset)
    return pd.DataFrame(X, columns=["song1", "song2"]), np.array(y)

In [128]:
song_pairs, labels = create_artist_dataset(discography, 10000)

### TF-IDF

In [129]:
tfidf_pairs = create_vectors_pairs(tfidf_df, song_pairs)

In [130]:
X_train.shape

(6788, 200)

In [131]:
X_train, X_test, y_train, y_test = \
                train_test_split(tfidf_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.6s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.4s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.4s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.5s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.5s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   7.8s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   8.3s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   8.8s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   8.6s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   8.0s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=  15.6s
[CV 2/5] END ...............

In [132]:
print("TF-IDF:", acc, f1, auc)

TF-IDF: 0.4845 0.33867864015394483 0.49308083066038155


### Song2vec

In [133]:
s2v_pairs = create_vectors_pairs(s2v_df, song_pairs)

In [134]:
X_train, X_test, y_train, y_test = \
                train_test_split(s2v_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   4.8s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   4.7s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   4.9s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   4.6s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   4.5s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=   9.4s
[CV 2/5] END ...............

In [135]:
print("Song2Vec:", acc, f1, auc)

Song2Vec: 0.5125 0.4981986618630983 0.4989103158433776


In [136]:
print("Random:", predict_random(X_train, y_train, X_test, y_test))
print("Majority:",predict_majority(X_train, y_train, X_test, y_test))

Random: (0.4945, 0.4937406109163745, 0.5)
Majority: (0.4935, 0.0, 0.5)


## Tag Prediction

### TF-IDF initialization (with song_tags)

In [250]:
song_tags = pd.read_pickle("data/pickle/song_tags_top1")

In [259]:
artist_name_cc_map = {k: v for k, v in zip(songs['artist_name'], songs['artist_name'].cat.codes.astype("int64"))}
track_name_cc_map = {k: v for k, v in zip(songs['track_name'], songs['track_name'].cat.codes.astype("int64"))}

In [260]:
song_tags['artist_name_cc'] = song_tags['artist_name'].map(artist_name_cc_map)
song_tags['track_name_cc'] = song_tags['track_name'].map(track_name_cc_map)

In [298]:
song_tags["song_id"]= song_tags.artist_name_cc * songs.track_name.nunique() \
                            + song_tags.track_name_cc

In [299]:
corpus_df = song_tags.sort_values(["user_id", "timestamp"]).groupby("user_id")\
                .agg(sequence=("song_id", list))

In [300]:
corpus = corpus_df.values[:,0]

In [301]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(corpus)

In [302]:
X.shape

(992, 1498676)

In [304]:
song_ids = list(vectorizer.vocabulary_.keys())
indices = list(vectorizer.vocabulary_.values())

In [305]:
song_ids = [x for _,x in sorted(zip(indices,song_ids))]

In [306]:
tfidf_df = pd.DataFrame(data=X.T.todense(), index=song_ids)

### Prediction

In [307]:
discography = song_tags[["artist_name", "track_name", "top_tag", "song_id"]].drop_duplicates()

In [308]:
def pick_song_with_same_tag(discography, half_n):
    """Simply chooses random songs pair them with the previous or following one"""
    tags_with_multiple_songs = discography.groupby("top_tag")\
                                .agg(count=("track_name", "count")).reset_index()
    discography = discography.merge(tags_with_multiple_songs)
    song_pairs = discography.groupby("top_tag").agg(song1=("song_id", "first"), 
                                                        song2=("song_id", "last"))
    X = song_pairs.sample(half_n, random_state=SEED).values
    y = np.ones(half_n)
    return X, y

In [309]:
def pick_song_from_diff_tag(discography, half_n):
    discography = discography.drop_duplicates("top_tag")
    songs = discography.sample(2*half_n, random_state=SEED).song_id.values
    songs1 = songs[:half_n]
    songs2 = songs[half_n:]
    X = np.c_[songs1, songs2]
    y = np.zeros(half_n)
    return X, y

In [310]:
def create_tag_dataset(discography, n=20000):
    """Create a dataset of song pairs that either appeared in the same context or not""" 
    discography = discography[discography.song_id.isin(s2v_df.index)]
    X_pos, y_pos = pick_song_with_same_tag(discography, n//2)
    X_neg, y_neg = pick_song_from_diff_tag(discography, n//2)
    old_X = np.r_[X_pos, X_neg]
    old_y = np.r_[y_pos, y_neg]
    dataset = list(zip(old_X, old_y))
    np.random.seed(SEED)
    np.random.shuffle(dataset)
    X, y = zip(*dataset)
    return pd.DataFrame(X, columns=["song1", "song2"]), np.array(y)

In [311]:
song_pairs, labels = create_tag_dataset(discography, 10000)

(527552, 4)


### TF-IDF

In [312]:
tfidf_pairs = create_vectors_pairs(tfidf_df, song_pairs)

        0         1    2        3    4    5    6    7    8    9  ...  984  \
204   0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
9861  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
5486  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
9888  0.0  0.006878  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
2567  0.0  0.000000  0.0  0.00233  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
...   ...       ...  ...      ...  ...  ...  ...  ...  ...  ...  ...  ...   
4853  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
7931  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
9745  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
9108  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
4072  0.0  0.000000  0.0  0.00000  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   

           985  986  987  988  989  990  991         song1         song2  


In [313]:
X_train, X_test, y_train, y_test = \
                train_test_split(tfidf_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   1.0s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.6s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.7s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.6s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.6s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=  12.0s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=  10.8s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=  11.8s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=  14.2s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=  13.2s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=  23.0s
[CV 2/5] END ...............

In [314]:
print("TF-IDF:", acc, f1, auc)

TF-IDF: 0.497 0.3813038130381304 0.5028854876474124


### Song2vec

In [315]:
def create_vectors_pairs(vectors, song_pairs):
    vector_pairs = vectors.merge(song_pairs, right_on="song1", left_index=True)
    print(vector_pairs)
    vector_pairs = vector_pairs.merge(vectors, left_on="song2", right_index=True)
    vector_pairs = vector_pairs.drop(['song1', 'song2'], axis=1)

    #discography.merge(s2v_df, right_index=True, left_on="song_id") 
    print(len(vectors))
    print(len(song_pairs))
    assert len(vector_pairs) == len(song_pairs)
    assert vector_pairs.shape[1] == vectors.shape[1] * 2
    return vector_pairs

In [316]:
s2v_pairs = create_vectors_pairs(s2v_df, song_pairs)

             0         1         2         3         4         5         6  \
1184  0.321568  0.725977  1.296536 -0.161113  1.502496 -0.727384 -1.279780   
5689  0.340931  1.017123  1.508705  0.012276  1.510483  0.028978 -1.349701   
8226  0.340931  1.017123  1.508705  0.012276  1.510483  0.028978 -1.349701   
4415 -0.162334  1.294950  0.845603  1.302312  1.833844 -0.362208 -0.085894   
8107 -0.162334  1.294950  0.845603  1.302312  1.833844 -0.362208 -0.085894   
...        ...       ...       ...       ...       ...       ...       ...   
454   0.013267  0.028717  0.038180  0.011470  0.052610  0.004531 -0.016271   
2363  0.032708  0.140178  0.104537 -0.023315  0.218134 -0.030840 -0.106504   
758   0.021439  0.029057  0.042824 -0.029588  0.072754 -0.025689 -0.032623   
762   0.025023  0.109973  0.053499 -0.009783  0.158154 -0.031726 -0.097743   
780   0.010644  0.037680  0.007588 -0.010716  0.038968 -0.000019 -0.021869   

             7         8         9  ...        92        93    

In [317]:
X_train, X_test, y_train, y_test = \
                train_test_split(s2v_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   4.9s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   4.5s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   5.0s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   5.2s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   5.9s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=  10.4s
[CV 2/5] END ...............

In [318]:
print("Song2Vec:", acc, f1, auc)

Song2Vec: 0.502 0.48178980228928203 0.5065381049397348


In [319]:
print("Random:", predict_random(X_train, y_train, X_test, y_test))
print("Majority:",predict_majority(X_train, y_train, X_test, y_test))

Random: (0.4945, 0.4937406109163745, 0.5)
Majority: (0.4935, 0.0, 0.5)
