In [121]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import scipy.sparse as sp
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np

from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import datetime
import time 
import math
import os
%config Completer.use_jedi = False

In [2]:
# Load the dataset
df = pd.read_table('lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv', lineterminator='\n', warn_bad_lines=True, names=['user', 'timestamp', 'artist-id', 'artist', 'song-id', 'song'])
df_profile = pd.read_csv('lastfm-dataset-1K/userid-profile.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=True, skiprows=1, names=['user', 'gender', 'age', 'country', 'signup'])

In [3]:
# Check if some songs share a common ID
grouped = df[['song-id', 'song']].groupby(['song-id']).nunique()

In [4]:
# Get rows with song names containing multiple rows
duplicated = df.loc[df['song-id'].isin(grouped[grouped['song'] > 1].index)]
duplicated = duplicated.drop(columns=['user', 'timestamp', 'artist', 'artist-id'])
containis_extra_rows = duplicated.apply(lambda x: pd.Series({'id': x[0], 'song': x[1], 'flag':'\n' in x[1]}), axis=1)
containis_extra_rows = containis_extra_rows.loc[containis_extra_rows['flag']]

In [5]:
# Iterate over those 5k rows to get the extra rows and correct the song name
for index, row in containis_extra_rows.iterrows():
    row_break = row['song'].split('\n', 1)
    song_name = row_break[0]
    df.loc[df['song-id'] == row['id'], 'song'] = song_name
    
    tsv = StringIO(row_break[1])
    df_extra = pd.read_csv(tsv, sep="\t", warn_bad_lines=True, names=['user', 'timestamp', 'artist-id', 'artist', 'song-id', 'song'])
    df = df.append(df_extra, ignore_index=True)

### Fix names

In [6]:
# Check if some songs share a common ID
grouped = df[['song-id', 'song']].groupby(['song-id']).nunique()

In [7]:
# Check how many ids has more than 1 song name
grouped[grouped['song'] > 1].shape[0]

200

In [8]:
ids_to_be_fixed = grouped[grouped['song'] > 1].index

In [9]:
df = df.set_index('song-id')

In [10]:
map_to_fix = df.groupby(df.index)['song'].head(1)

In [11]:
df.loc[ids_to_be_fixed, 'song'] = df.loc[ids_to_be_fixed].reset_index()['song-id'].map(lambda x: map_to_fix[x])
df = df.reset_index()

### Create matrix

In [75]:
matrix_init = df.groupby(['user', 'song']).count()

In [76]:
matrix_init = matrix_init['timestamp']

In [77]:
matrix_init = matrix_init.reset_index()

In [78]:
def rating_scaler(row):
    row_array = np.array(row)
    a, new_range= 1, 4
    min_, max_ = row_array.min(), row_array.max()
    old_range = max_ - min_
    
    
    scaled_row = (new_range * (row_array - min_)) / (old_range + 1e-6)  + a
    return pd.Series(scaled_row)

In [79]:
matrix_init_scaled = matrix_init.groupby('user')['timestamp'].apply(rating_scaler)

In [80]:
matrix_init['timestamp'] = matrix_init_scaled.reset_index()['timestamp']

### Sparse matrix

In [62]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [63]:
person_c = CategoricalDtype(sorted(matrix_init.user.unique()), ordered=True)
thing_c = CategoricalDtype(sorted(matrix_init.song.unique()), ordered=True)

In [64]:
row = matrix_init.user.astype(person_c).cat.codes
col = matrix_init.song.astype(thing_c).cat.codes

In [65]:
sparse_matrix = csr_matrix((matrix_init["timestamp"], (row, col)), \
                           shape=(person_c.categories.size, thing_c.categories.size))

In [66]:
dfs = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=person_c.categories, columns=thing_c.categories)

In [84]:
matrix_init.head()

Unnamed: 0,user,song,rating,user_id,item_id
4106948,user_000927,And Some Ya Lose,1.0,919,72494
4119740,user_000930,Everloving,1.20339,922,291465
920321,user_000210,That'S A Touch I Like,1.0,209,881354
955355,user_000219,Jagger '67,1.971429,218,467505
2142830,user_000503,Creatures,1.0,496,203968


### Another format

In [81]:
matrix_init['user_id'] = matrix_init['user'].astype('category').cat.codes
matrix_init['item_id'] = matrix_init['song'].astype('category').cat.codes

In [82]:
matrix_init = matrix_init.sample(frac=1)

In [83]:
matrix_init = matrix_init.rename({'timestamp':'rating'}, axis=1)

In [102]:
X = matrix_init[['user_id', 'item_id']]
y = matrix_init['rating']
groups = matrix_init['user_id']

In [103]:
# Change splitting to be by index using https://stackoverflow.com/questions/53490497/getting-validation-set-from-train-set-by-using-percentage-from-groupby-in-pand
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=groups)

In [107]:
no_users, no_items, no_factors = matrix_init['user_id'].nunique(), matrix_init['item_id'].nunique(), 100

### Models

In [122]:
def create_shallow_model(no_factors, no_users, no_items):
    # User branch
    user_id = tf.keras.layers.Input(shape=[1], name='user_id')
    user_matrix = tf.keras.layers.Embedding(no_users+1, no_factors, name='user_matrix')(user_id)
    user_vector = tf.keras.layers.Flatten(name='user_vector')(user_matrix)
    # Item branch
    item_id = tf.keras.layers.Input(shape=[1], name='item_id')
    item_matrix = tf.keras.layers.Embedding(no_items+1, no_factors, name='item_matrix')(item_id)
    item_vector = tf.keras.layers.Flatten(name='item_vector')(item_matrix)
    # Dot product 
    vectors_product = tf.keras.layers.dot([user_vector, item_vector], axes=1, normalize=False)
    # Model definition
    model = tf.keras.models.Model(inputs=[user_id, item_id], outputs=[vectors_product], name='shallow_model')
    return model

In [123]:
model = create_shallow_model(no_factors, no_users, no_items)

In [110]:
model.summary()

Model: "shallow_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_matrix (Embedding)         (None, 1, 100)       99300       user_id[0][0]                    
__________________________________________________________________________________________________
item_matrix (Embedding)         (None, 1, 100)       108392400   item_id[0][0]                    
______________________________________________________________________________________

In [111]:
def create_deep_model(no_factors, no_users, no_items):
    # User branch
    user_id = tf.keras.layers.Input(shape=[1], name='user_id')
    user_matrix = tf.keras.layers.Embedding(no_users+1, no_factors, name='user_matrix')(user_id)
    user_vector = tf.keras.layers.Flatten(name='user_vector')(user_matrix)
    # Item branch
    item_id = tf.keras.layers.Input(shape=[1], name='item_id')
    item_matrix = tf.keras.layers.Embedding(no_items+1, no_factors, name='item_matrix')(item_id)
    item_vector = tf.keras.layers.Flatten(name='item_vector')(item_matrix)
    # Concantenation
    vectors_concat = tf.keras.layers.Concatenate()([user_vector, item_vector])
    vectors_concat_dropout = tf.keras.layers.Dropout(0.2)(vectors_concat)
    # Backbone 
    dense_1 = tf.keras.layers.Dense(16,name='fc3')(vectors_concat_dropout)
    dropout_1 = tf.keras.layers.Dropout(0.2,name='d3')(dense_1)
    dense_2 = tf.keras.layers.Dense(8,name='fc4', activation='relu')(dropout_1)
    dense_2_output = tf.keras.layers.Dense(1, activation='relu', name='activation')(dense_2)
    # Model definition
    model = tf.keras.models.Model(inputs=[user_id, item_id], outputs=[dense_2_output], name='deep_model')
    return model

In [112]:
model = create_deep_model(no_factors, no_users, no_items)

### Model training

In [129]:
y_train.head()

1615633    1.000000
2202736    1.358621
262981     1.000000
1212865    1.000000
2661981    1.000000
Name: rating, dtype: float64

In [130]:
# Input-output data definition
X_train = [X_train.user_id, X_train.item_id]

# Model creation
model = create_shallow_model(no_factors, no_users, no_items)

# Model compiling 
model.compile(loss=tf.keras.losses.MeanSquaredError())

# Model training
model.fit(X_train, y_train, epochs=10, batch_size=2048, shuffle=True)

Train on 3527750 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f99d542b410>

### Prediction

In [132]:
# Predictions in the training set
# X_train = [X_train.user_id, X_train.item_id]
y_train_pred = model.predict(X_train, batch_size=2048)

In [133]:
# Predictions in the test set
X_test = [X_test.user_id, X_test.item_id]
y_test_pred = model.predict(X_test, batch_size=2048)

### Evaluation

In [134]:
from sklearn.metrics import mean_squared_error
print('Train RMSE:', mean_squared_error(y_train.values, y_train_pred, squared=False))
print('Test RMSE:', mean_squared_error(y_test.values, y_test_pred, squared=False))

Train RMSE: 0.27254927709385485
Test RMSE: 0.5599868428679718


In [135]:
def predict_from_latent(model, uid, pids, train_ratings=None):
    user_vector = model.get_layer('user_matrix').get_weights()[0][uid]
    item_vectors = model.get_layer('item_matrix').get_weights()[0][pids]
    scores = (np.dot(user_vector, item_vectors.T))
    return scores

In [136]:
def precision_at_k(model, pred_func, train_ratings, test_ratings, no_users, no_items, k=10):
    pid_array = np.arange(no_items, dtype=np.int32)
    precisions = []
    # For each user
    for user_id, user_test_rating in tqdm(test_ratings.groupby('user_id')):
        # Retrieve already-seen items
        train_pids = train_ratings[train_ratings['user_id'] == user_id]['item_id'].values
        # Retrieve the unseen items
        test_pids = set(user_test_rating['item_id'].values)
        # Make rating predictions for all items for that user
        predictions = pred_func(model, user_id, pid_array, train_ratings)
        # Force a low rating to already-seen items
        predictions[train_pids] = - math.inf
        # Sort the items and het the top k
        top_k = set(np.argsort(-predictions)[:k])
        # Compute precision as per definition
        precisions.append(len(top_k & test_pids) / float(k))
    return precisions

In [142]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [143]:
train_ratings = pd.DataFrame({'user_id': X_train[0, :], 'item_id':X_train[1, :], 'rating': y_train})
test_ratings = pd.DataFrame({'user_id': X_test[0, :], 'item_id':X_test[1, :], 'rating': y_test})

In [144]:
precisions = precision_at_k(model, predict_from_latent, train_ratings, test_ratings, no_users, no_items, k=10)

100%|██████████| 991/991 [08:23<00:00,  1.97it/s]


In [145]:
np.mean(precisions), np.std(precisions)

(0.0009081735620585267, 0.009486441745303166)

In [146]:
### EXERCISE CELL ###
def recall_at_k(model, pred_func, train_ratings, test_ratings, no_users, no_items, k=10):
    pid_array = np.arange(no_items, dtype=np.int32)
    recalls = []
    for user_id, user_test_rating in tqdm(test_ratings.groupby('user_id')):
        train_pids = train_ratings[train_ratings['user_id'] == user_id]['item_id'].values
        test_pids = set(user_test_rating['item_id'].values)
        predictions = pred_func(model, user_id, pid_array, train_ratings)
        predictions[train_pids] = - math.inf
        top_k = set(np.argsort(-predictions)[:k])
        recalls.append(len(top_k & test_pids) / len(test_pids))
    return recalls

In [148]:
recalls = recall_at_k(model, predict_from_latent, train_ratings, test_ratings, no_users, no_items, k=10)

100%|██████████| 991/991 [08:04<00:00,  2.05it/s]


In [149]:
np.mean(recalls), np.std(recalls)

(9.66411002895107e-06, 0.00013890930996086155)

In [150]:
### EXERCISE CELL ###
def map_at_k(model, pred_func, train_ratings, test_ratings, no_users, no_items, k=10):
    pid_array = np.arange(no_items, dtype=np.int32)
    maps = []
    for user_id, user_test_rating in tqdm(test_ratings.groupby('user_id')):
        train_pids = train_ratings[train_ratings['user_id'] == user_id]['item_id'].values
        test_pids = set(user_test_rating['item_id'].values)
        predictions = pred_func(model, user_id, pid_array, train_ratings)
        predictions[train_pids] = - math.inf
        partial_maps = []
        top_k = list(np.argsort(-predictions)[:k])
        for rank, item_id in enumerate(top_k):
            if item_id in test_pids:
                partial_maps.append(len(set(top_k[:rank+1]) & test_pids) / float(rank+1))
        maps.append(.0 if len(partial_maps) == 0 else np.sum(partial_maps) / float(k))
    return maps

In [151]:
maps = map_at_k(model, predict_from_latent, train_ratings, test_ratings, no_users, no_items, k=10)

100%|██████████| 991/991 [07:17<00:00,  2.27it/s]


In [152]:
np.mean(maps), np.std(maps)

(0.000279299408966412, 0.0040065568794788375)