In [1]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from utils import read_lastfm
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.dummy import DummyClassifier

# Song2vec loading

In [2]:
def build_vocab(model):
    emb_vectors = {}
    for n in model.wv.index_to_key:
        emb_vectors[n] = model.wv[n]
    return emb_vectors

def load_model(filename):
    model = Word2Vec.load(filename)
    emb_vectors = build_vocab(model)
    return emb_vectors, model

In [3]:
emb_vectors, model = load_model("data/word2vec/word2vec.model")
s2v_df = pd.DataFrame(data=emb_vectors.values(), index=emb_vectors.keys())

# TF-IDF initialization

In [4]:
songs, users = read_lastfm(zip_name="data/lastfm-dataset-1K.zip")

In [5]:
songs["song_id"]= songs.artist_name.cat.codes.astype("int64") * songs.track_name.nunique() \
                            + songs.track_name.cat.codes

In [6]:
corpus_df = songs.sort_values(["user_id", "timestamp"]).groupby("user_id")\
                .agg(sequence=("song_id", list))

In [7]:
corpus = corpus_df.values[:,0]

In [8]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(corpus)

In [9]:
X.shape

(992, 1498727)

In [10]:
def get_song_vector(song_id, X, vectorizer):
    idx = vectorizer.vocabulary_[song_id]
    return X[:, idx].toarray()[:,0]

In [11]:
get_song_vector(3583077562, X, vectorizer)[:10]

array([0.01709776, 0.        , 0.        , 0.02194015, 0.        ,
       0.00360557, 0.        , 0.        , 0.        , 0.        ])

In [12]:
song_ids = list(vectorizer.vocabulary_.keys())
indices = list(vectorizer.vocabulary_.values())

In [13]:
song_ids = [x for _,x in sorted(zip(indices,song_ids))]

In [14]:
tfidf_df = pd.DataFrame(data=X.T.todense(), index=song_ids)

# Classification Tasks

* Predict if two songs appear in the same context
* Predict if two songs are from the same artist
* Predict the tag of a song

In [15]:
SEED = 42

In [16]:
sorted_songs = songs.sort_values(["user_id", "timestamp"])

In [17]:
def compute_metrics(y_test, y_pred, y_proba):
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    return acc, f1, auc

In [112]:
def predict_RF(X_train, X_test, y_train, y_test, 
               param_grid = {'n_estimators': [2, 50 ,100, 150],
                      'criterion': ['gini', 'entropy']}):    
    model = RandomForestClassifier(random_state=SEED)
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    print("Starting Grid search")
    grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', verbose=4)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print(f"Best parameters are: {grid.best_params_}")
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:,1]
    return compute_metrics(y_test, y_pred, y_proba)

In [19]:
def predict_random(X_train, y_train, X_test, y_test):
    random = DummyClassifier(strategy='uniform', random_state=SEED)
    random.fit(X_train, y_train)
    y_pred = random.predict(X_test)
    y_proba = random.predict_proba(X_test)[:, 1]
    return compute_metrics(y_test, y_pred, y_proba)

def predict_majority(X_train, y_train, X_test, y_test):
    majority = DummyClassifier(strategy='most_frequent', random_state=SEED)
    majority.fit(X_train, y_train)
    y_pred = majority.predict(X_test)
    y_proba = majority.predict_proba(X_test)[:, 1]
    return compute_metrics(y_test, y_pred, y_proba)

## Same context classification

In [114]:
def pick_song_in_same_context(songs, half_n):
    """Simply chooses random songs pair them with the previous or following one"""
    np.random.seed(SEED)
    idx1 = np.random.randint(1, len(songs) - 1, half_n) #Exclude first and last song
    idx2 = idx1.copy() #Offset the first index by -1 or 1 
    idx2[::2] += 1
    idx2[1::2] -= 1
    songs1 = songs.iloc[idx1].song_id.values
    songs2 = songs.iloc[idx2].song_id.values
    X = np.c_[songs1, songs2]
    y = np.ones(half_n)
    return X, y
    
def pick_songs_in_diff_context(songs, half_n):
    songs1 = songs.sample(half_n, random_state=SEED).song_id.values
    songs2 = songs.sample(half_n, random_state=SEED + 1).song_id.values
    X = np.c_[songs1, songs2]
    y = np.zeros(half_n)
    return X, y

def create_context_dataset(sorted_songs, n=20000):
    """Create a dataset of song pairs that either appeared in the same context or not""" 
    X_pos, y_pos = pick_song_in_same_context(sorted_songs, n//2)
    X_neg, y_neg = pick_songs_in_diff_context(sorted_songs, n//2)
    old_X = np.r_[X_pos, X_neg]
    old_y = np.r_[y_pos, y_neg]
    dataset = list(zip(old_X, old_y))
    np.random.seed(SEED)
    np.random.shuffle(dataset)
    X, y = zip(*dataset)
    return pd.DataFrame(X, columns=["song1", "song2"]), np.array(y)

In [115]:
song_pairs, labels = create_context_dataset(sorted_songs, n= 10000)

In [116]:
#Only keep song with s2v embeddings
song_pairs = song_pairs.copy()
song_pairs['labels'] = labels
song_pairs = song_pairs[song_pairs.song1.isin(s2v_df.index) & song_pairs.song2.isin(s2v_df.index)]
labels = song_pairs.labels
song_pairs.drop('labels', axis=1, inplace=True)

In [117]:
def create_vectors_pairs(vectors, song_pairs):
    vector_pairs = vectors.merge(song_pairs, right_on="song1", left_index=True)\
            .merge(vectors, left_on="song2", right_index=True)\
            .drop(['song1', 'song2'], axis=1)

    assert len(vector_pairs) == len(song_pairs)
    assert vector_pairs.shape[1] == vectors.shape[1] * 2
    return vector_pairs

### TF-IDF

In [118]:
tfidf_pairs = create_vectors_pairs(tfidf_df, song_pairs)

In [119]:
tfidf_pairs.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,982_y,983_y,984_y,985_y,986_y,987_y,988_y,989_y,990_y,991_y
1382,0.0,0.0,0.0,0.009143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003974,0.0,0.0,0.002027,0.0,0.0,0.0,0.0,0.0,0.0
6851,0.0,0.0,0.0,0.005735,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3106,0.0,0.000804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9648,0.0,0.00079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
X_train, X_test, y_train, y_test = \
                train_test_split(tfidf_pairs, labels, train_size=0.8, random_state=SEED)
# acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

In [121]:
print("TF-IDF:", acc, f1, auc)
print("Random:", predict_random(X_train, y_train, X_test, y_test))
print("Majority:",predict_majority(X_train, y_train, X_test, y_test))

TF-IDF: 0.52 0.07692307692307693 0.40077110389610393
Random: (0.49616971125515613, 0.48955223880597015, 0.5)
Majority: (0.4926340601060695, 0.6600868535333597, 0.5)


### song2vec

In [122]:
s2v_pairs = create_vectors_pairs(s2v_df, song_pairs)

In [123]:
s2v_pairs.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
2416,0.050536,0.700072,1.502735,0.799808,1.421927,-4.074349,-0.018887,1.857755,-2.008438,-0.825392,...,2.049515,1.042242,-2.304445,-0.64561,1.034146,0.522461,-0.801188,-2.008663,-0.396661,0.024584
3249,-0.932494,1.68211,-0.600307,1.201052,1.746724,0.416302,-0.009236,0.224624,0.71416,1.807721,...,2.049515,1.042242,-2.304445,-0.64561,1.034146,0.522461,-0.801188,-2.008663,-0.396661,0.024584
2532,0.050536,0.700072,1.502735,0.799808,1.421927,-4.074349,-0.018887,1.857755,-2.008438,-0.825392,...,1.072537,-0.068162,-1.121787,-0.130818,-0.168337,-0.234738,-0.270714,-0.263553,-0.45397,-0.143811
1651,-0.88509,3.123382,0.424417,1.275812,1.166315,-1.78657,-1.911305,2.292119,-1.925104,1.388782,...,0.514146,-0.03214,-0.602289,-0.052574,-0.092585,-0.073549,-0.129805,-0.187154,-0.227991,-0.06
4873,-0.88509,3.123382,0.424417,1.275812,1.166315,-1.78657,-1.911305,2.292119,-1.925104,1.388782,...,1.174746,0.092594,-3.189829,0.784431,-0.239225,1.520638,0.963496,-1.337876,-1.011305,0.964345


In [124]:
X_train, X_test, y_train, y_test = \
                train_test_split(s2v_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   4.1s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   4.0s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   3.9s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   3.9s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   3.7s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=   7.3s
[CV 2/5] END ...............

In [125]:
print("Song2Vec:", acc, f1, auc)

Song2Vec: 0.5085444902769594 0.5250569476082005 0.49960822232966007


## Same artist classification

In [126]:
discography = songs[["artist_name", "track_name", "song_id"]].drop_duplicates()

In [127]:
def pick_song_from_same_artist(discography, half_n):
    """Simply chooses random songs pair them with the previous or following one"""
    artist_with_multiple_songs = discography.groupby("artist_name")\
                                .agg(count=("track_name", "count")).reset_index()
    discography = discography.merge(artist_with_multiple_songs)
    song_pairs = discography.groupby("artist_name").agg(song1=("song_id", "first"), 
                                                        song2=("song_id", "last"))
    X = song_pairs.sample(half_n, random_state=SEED).values
    y = np.ones(half_n)
    return X, y
    
def pick_song_from_diff_artist(discography, half_n):
    discography = discography.drop_duplicates("artist_name")
    songs = discography.sample(2*half_n, random_state=SEED).song_id.values
    songs1 = songs[:half_n]
    songs2 = songs[half_n:]
    X = np.c_[songs1, songs2]
    y = np.zeros(half_n)
    return X, y

def create_artist_dataset(discography, n=20000):
    """Create a dataset of song pairs that either appeared in the same context or not""" 
    discography = discography[discography.song_id.isin(s2v_df.index)]
    X_pos, y_pos = pick_song_in_same_context(discography, n//2)
    X_neg, y_neg = pick_songs_in_diff_context(discography, n//2)
    old_X = np.r_[X_pos, X_neg]
    old_y = np.r_[y_pos, y_neg]
    dataset = list(zip(old_X, old_y))
    np.random.seed(SEED)
    np.random.shuffle(dataset)
    X, y = zip(*dataset)
    return pd.DataFrame(X, columns=["song1", "song2"]), np.array(y)

In [128]:
song_pairs, labels = create_artist_dataset(discography, 10000)

### TF-IDF

In [129]:
tfidf_pairs = create_vectors_pairs(tfidf_df, song_pairs)

In [130]:
X_train.shape

(6788, 200)

In [131]:
X_train, X_test, y_train, y_test = \
                train_test_split(tfidf_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.6s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.4s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.4s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.5s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.5s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   7.8s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   8.3s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   8.8s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   8.6s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   8.0s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=  15.6s
[CV 2/5] END ...............

In [132]:
print("TF-IDF:", acc, f1, auc)

TF-IDF: 0.4845 0.33867864015394483 0.49308083066038155


### Song2vec

In [133]:
s2v_pairs = create_vectors_pairs(s2v_df, song_pairs)

In [134]:
X_train, X_test, y_train, y_test = \
                train_test_split(s2v_pairs, labels, train_size=0.8, random_state=SEED)
acc, f1, auc = predict_RF(X_train, X_test, y_train, y_test)

Starting Grid search
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .................criterion=gini, n_estimators=2; total time=   0.3s
[CV 2/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 3/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 4/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 5/5] END .................criterion=gini, n_estimators=2; total time=   0.2s
[CV 1/5] END ................criterion=gini, n_estimators=50; total time=   4.8s
[CV 2/5] END ................criterion=gini, n_estimators=50; total time=   4.7s
[CV 3/5] END ................criterion=gini, n_estimators=50; total time=   4.9s
[CV 4/5] END ................criterion=gini, n_estimators=50; total time=   4.6s
[CV 5/5] END ................criterion=gini, n_estimators=50; total time=   4.5s
[CV 1/5] END ...............criterion=gini, n_estimators=100; total time=   9.4s
[CV 2/5] END ...............

In [135]:
print("Song2Vec:", acc, f1, auc)

Song2Vec: 0.5125 0.4981986618630983 0.4989103158433776


In [136]:
print("Random:", predict_random(X_train, y_train, X_test, y_test))
print("Majority:",predict_majority(X_train, y_train, X_test, y_test))

Random: (0.4945, 0.4937406109163745, 0.5)
Majority: (0.4935, 0.0, 0.5)
