In [55]:
import mysql.connector
from mysql.connector import errorcode

def query_db(query):
    try:
        cnx = mysql.connector.connect(user='root', password='x155564py',
                                  host='127.0.0.1', port=3307,
                                  database='smashup')
        
        if cnx and cnx.is_connected():
            with cnx.cursor() as cursor:
                cursor.execute(query)
                rows = cursor.fetchall()
        
        cnx.close()
        return rows
    
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with the username or password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)
    return None


<h1>Content data preprocessing</h1>

In [56]:
from sklearn.preprocessing import StandardScaler
import numpy as np

lim = 10**10
def multifeature_encoder(values, ids, num_unique_values=None, num_unique_ids=None):
    # inputs - vectors, not ndarrays
    values_enum = dict([(i[1],i[0]) for i in enumerate(np.unique(values))])
    if num_unique_values is None:
        num_unique_values = len(values_enum)
    if num_unique_ids is None:
        num_unique_ids = len(np.unique(ids))
        
    result = np.zeros((num_unique_ids, num_unique_values))
    prev_id, cur_ind = 0, -1
    for v, i in zip(values, ids):
        if i!=prev_id:
            prev_id = i
            cur_ind += 1
        result[cur_ind][values_enum[v]] = 1
    
    return result

In [57]:
print(query_db('SELECT statuses, id FROM mashups LIMIT 10'))
print(query_db('SELECT duration, id FROM mashups LIMIT 10'))
print(query_db('SELECT genre, mashup_id FROM mashups JOIN mashups_to_genres ON mashups.id=mashups_to_genres.mashup_id LIMIT 10'))

[(0, 1), (1, 2), (0, 3), (0, 4), (1, 5), (0, 6), (1, 7), (1, 8), (0, 9), (0, 10)]
[(70507, 1), (213655, 2), (284319, 3), (189495, 4), (49896, 5), (220238, 6), (93413, 7), (99291, 8), (180009, 9), (168000, 10)]
[('поп', 1), ('рок', 1), ('рэп', 2), ('электро', 2), ('поп', 3), ('электро', 3), ('поп', 4), ('электро', 4), ('morph', 5), ('поп', 5)]


In [58]:
# vector for each feature (maybe this should be a transaction for reading consistency!)
ids = np.array(query_db(f'SELECT id FROM mashups LIMIT {lim}'))
id_lim = max(ids)[0]

statuses = np.array(query_db(f'SELECT statuses FROM mashups WHERE id<{id_lim}'))
durations = np.array(query_db(f'SELECT duration FROM mashups WHERE id<{id_lim}'))
genres_raw = np.array(query_db(f'SELECT genre, mashup_id FROM mashups JOIN mashups_to_genres ON mashups.id=mashups_to_genres.mashup_id WHERE mashup_id<{id_lim}'))

print(np.shape(statuses),np.shape(durations),np.shape(genres_raw))

n_unique_values = int(query_db('SELECT COUNT(DISTINCT genre) FROM mashups_to_genres')[0][0])
n_unique_ids = int(query_db(f'SELECT COUNT(id) FROM mashups WHERE id<{id_lim}')[0][0])
genres = multifeature_encoder(genres_raw[:,0],genres_raw[:,1],n_unique_values,n_unique_ids)
print(np.shape(genres))

features = (statuses, durations, genres)
for f in features:
    f[np.isnan(f)] = 0

X = np.hstack(features)
print(np.shape(X))
print(X)

(778, 1) (778, 1) (1296, 2)
(778, 14)
(778, 16)
[[0.00000e+00 7.05070e+04 0.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.00000e+00 2.13655e+05 0.00000e+00 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 [0.00000e+00 2.84319e+05 0.00000e+00 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 ...
 [0.00000e+00 1.71075e+05 0.00000e+00 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 [1.00000e+00 1.51405e+05 0.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [0.00000e+00 2.76662e+05 0.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]


In [59]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-0.96712118 -1.1728117  -0.13035898 ... -0.13035898 -0.15821801
  -0.75402148]
 [ 1.03399658  0.7447916  -0.13035898 ... -0.13035898 -0.15821801
   1.32622217]
 [-0.96712118  1.69140294 -0.13035898 ... -0.13035898 -0.15821801
   1.32622217]
 ...
 [-0.96712118  0.1743921  -0.13035898 ... -0.13035898 -0.15821801
   1.32622217]
 [ 1.03399658 -0.08910622 -0.13035898 ... -0.13035898 -0.15821801
  -0.75402148]
 [-0.96712118  1.58883016 -0.13035898 ... -0.13035898 -0.15821801
  -0.75402148]]


<h1>Content clustering (optional)</h1> 
for better scalability: enables search over just one corresponding cluster instead of all the data

In [60]:
# from sklearn.cluster import KMeans, DBSCAN
# 
# # estimate the number of clusters
# dbscan = DBSCAN(eps=0.35, min_samples=5, metric='cosine').fit(X_scaled)
# labels = dbscan.labels_
# 
# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# n_noise_ = list(labels).count(-1)
# 
# print("Estimated number of clusters: %d" % n_clusters_)
# print("Estimated number of noise points: %d" % n_noise_)

In [61]:
# # using estimated K, apply K-means clustering for convenient prediction of cluster for new data
# kmeans = KMeans(n_clusters=n_clusters_).fit(X_scaled)
# labels = kmeans.labels_
# centers = kmeans.cluster_centers_
# print(np.shape(labels))
# print(labels)

In [62]:
# new_mashup = np.array([ 1.03528185, -0.08797488, -0.13527991, -0.13027386, 
#                         -0.15811388, -0.75326252, 0.08797488, - 1.03528185,
#                         1.03528185, -0.08797488, -0.13527991, -0.13027386, 
#                         -0.15811388, -0.75326252, 0.08797488, - 1.03528185],ndmin=2)
# new_mashup = scaler.transform(new_mashup)
# print(kmeans.predict(new_mashup))

In [63]:
# duplicate_mashup = np.array(X_scaled[8,:], ndmin=2)
# print(np.shape(duplicate_mashup))
# print(kmeans.predict(duplicate_mashup))

For further application, additional datastructures or table in the database are needed for storage of pairs "mashup id - cluster label"

<h1>Content Filtering: Candidate selection</h1>

In [64]:
def filter_already_liked(user_id, mashup_ids):
    likes = [i[0] for i in query_db(f'SELECT mashup_id FROM mashups_likes WHERE user_id={user_id}')]
    filtered_ids = []
    for i in mashup_ids:
        if i not in likes:
            filtered_ids.append(i)
    return filtered_ids


In [65]:
def get_content_data_point(mashup_id):
    status = np.array(query_db(f'SELECT statuses FROM mashups WHERE id={mashup_id}'))
    duration = np.array(query_db(f'SELECT duration FROM mashups WHERE id={mashup_id}'))
    genre_raw = np.array(query_db(f'SELECT genre FROM mashups JOIN mashups_to_genres ON mashups.id=mashups_to_genres.mashup_id WHERE mashup_id={mashup_id}'))
    
    n_unique_values = int(query_db('SELECT COUNT(DISTINCT genre) FROM mashups_to_genres')[0][0])
    genre = multifeature_encoder(genre_raw.reshape(1,-1)[0], np.array([mashup_id]*len(genre_raw)), n_unique_values)
    features = (status, duration, genre)
    for f in features:
        f[np.isnan(f)] = 0
    
    # each datapoint is of shape (1,16)
    x = np.hstack(features)
    return scaler.transform(x)

def fill_random_rem_mashups(cur_list, required_size, user_id=None):
    if user_id is not None:
        filter_already_liked(user_id, cur_list)
    cur_list = list(set(cur_list))
    while len(cur_list) < required_size:
        remainder = required_size-len(cur_list)
        cur_list += query_db(f'SELECT mashup_id FROM mashups ORDER BY RAND() LIMIT {remainder}')
        cur_list = list(set(cur_list))

def get_pop_ids(user_id, liked_population_size, most_listened_population_size, recently_listened_population_size):
    '''
    Always returns list with exactly l_pop_size+m_pop_size+r_pop_size distinct db mashup ids
    '''
    liked_population = query_db(f'SELECT mashup_id FROM mashups_likes WHERE user_id={user_id} ORDER BY RAND() LIMIT {liked_population_size}')
    most_listened_population = query_db(f"SELECT mashup_id FROM mashups_likes WHERE user_id={user_id} GROUP BY mashup_id ORDER BY COUNT(`time`) DESC, RAND() LIMIT {most_listened_population_size}")
    recently_listened_population = query_db(f"SELECT mashup_id FROM mashups_likes WHERE user_id={user_id} ORDER BY `time` DESC, RAND() LIMIT {recently_listened_population_size}")
    
    fill_random_rem_mashups(liked_population, liked_population_size)
    fill_random_rem_mashups(most_listened_population, most_listened_population_size)
    fill_random_rem_mashups(recently_listened_population, recently_listened_population_size)
        
    return liked_population, most_listened_population, recently_listened_population

def get_pop_data(liked_population_ids, most_listened_population_ids, recently_listened_population_ids):
    liked_population = np.zeros((len(liked_population_ids),16))
    most_listened_population = np.zeros((len(most_listened_population_ids),16))
    recently_listened_population = np.zeros((len(recently_listened_population_ids), 16))
    
    # each datapoint is of shape (1,16)
    for i in range(len(liked_population_ids)):
        liked_population[i] = get_content_data_point(liked_population_ids[i][0])

    for i in range(len(most_listened_population_ids)):
        most_listened_population[i] = get_content_data_point(most_listened_population_ids[i][0])

    for i in range(len(recently_listened_population_ids)):
        recently_listened_population[i] = get_content_data_point(recently_listened_population_ids[i][0])
    
    return liked_population, most_listened_population, recently_listened_population


In [82]:
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
knn = NearestNeighbors().fit(X_scaled)

def select_base_candidates(liked_population, most_listened_population, recently_listened_population, l_neighbors, m_neighbors, r_neighbors):
    # neighbor search (candidates are returned as indices stored in ids and corresponding to elements of X provided to the KNN)
    l_dist, l_candidates = knn.kneighbors(liked_population, l_neighbors)
    m_dist, m_candidates = knn.kneighbors(most_listened_population, m_neighbors)
    r_dist, r_candidates = knn.kneighbors(recently_listened_population, r_neighbors)

    print(np.shape(l_dist), np.shape(m_dist), np.shape(r_dist)) # (n_population, n_neighbors)
    # return l_dist, m_dist, r_dist, l_candidates, m_candidates, r_candidates
    return np.concatenate((l_candidates.flatten(),m_candidates.flatten(),r_candidates.flatten()), axis=None)

def select_playlist_candidates(user_id, liked_population_ids, most_listened_population_ids, recently_listened_population_ids, n_neighbors, lim):
    # neighbor search based on playlist data (mashup-candidates are returned as indices stored in ids and corresponding to elements of X provided to the KNN)
    playlist_ids = set()
    # playlists including songs from the populations
    for pop in [liked_population_ids, most_listened_population_ids, recently_listened_population_ids]:
        for i in tqdm(pop, desc='Playlist ids from pop'):
            for j in query_db(f'SELECT playlist_id FROM playlists_to_mashups WHERE mashup_id = {i[0]} ORDER BY RAND() LIMIT {lim}'):
                playlist_ids.add(j[0])
    
    # liked playlists
    for i in query_db(f'SELECT playlist_id FROM playlists_likes WHERE user_id = {user_id} ORDER BY RAND() LIMIT {lim}'):
        playlist_ids.add(i[0])
    
    candidates = []
    for pi in tqdm(playlist_ids, desc='Playlist based recs'):
        mashup_ids = query_db(f'SELECT mashup_id FROM playlists_to_mashups WHERE playlist_id = {pi} ORDER BY RAND() LIMIT {lim}')
        for mi in mashup_ids:
            mashup_data = get_content_data_point(mi[0])
            dist, neigh_inds = knn.kneighbors(mashup_data, n_neighbors)
            for i in range(n_neighbors):
                candidates.append((dist[0][i], neigh_inds[0][i]))
                
    return set([pair[1] for pair in sorted(candidates, key=lambda pair: pair[0])][:lim])

def select_author_candidates(liked_population_ids, most_listened_population_ids, recently_listened_population_ids, n_neighbors, lim):
    # neighbor search based on mashup author data (mashup-candidates are returned as indices stored in ids and corresponding to elements of X provided to the KNN)
    author_ids = set()
    # authors of mashups from the populations
    for pop in [liked_population_ids, most_listened_population_ids, recently_listened_population_ids]:
        for i in tqdm(pop, desc='Author ids from pop'):
            for j in query_db(f'SELECT user_id FROM mashups_to_authors WHERE mashup_id={i[0]} ORDER BY RAND() LIMIT {lim}'):
                author_ids.add(j[0])
            
    # liked mashups - already in liked_population
            
    candidates = []
    for ai in tqdm(author_ids, desc='Author based recs'):
        mashup_ids = query_db(f'SELECT mashup_id FROM mashups_to_authors WHERE user_id={ai} ORDER BY RAND() LIMIT {lim}')
        for mi in mashup_ids:
            mashup_data = get_content_data_point(mi[0])
            dist, neigh_inds = knn.kneighbors(mashup_data, n_neighbors)
            for i in range(n_neighbors):
                candidates.append((dist[0][i], neigh_inds[0][i]))
                
    return set([pair[1] for pair in sorted(candidates, key=lambda pair: pair[0])][:lim])
        
def select_track_candidates(liked_population_ids, most_listened_population_ids, recently_listened_population_ids, n_neighbors, lim):
    # neighbor search based on data about tracks that the mashup consists of (mashup-candidates are returned as indices stored in ids and corresponding to elements of X provided to the KNN)
    track_ids = set()
    # get tracks from mashups from the populations (liked mashups - already in liked_population)
    for pop in [liked_population_ids, most_listened_population_ids, recently_listened_population_ids]:
        for i in tqdm(pop, desc='Track ids from pop'):
            for j in query_db(f'SELECT track_id FROM mashups_to_tracks WHERE mashup_id={i[0]} ORDER BY RAND() LIMIT {lim}'):
                track_ids.add(j[0])
            
    # get the tracks' authors
    author_ids = set()
    for i in track_ids:
        for j in query_db(f'SELECT author_id FROM tracks_to_authors WHERE track_id={i} ORDER BY RAND() LIMIT {lim}'):
            author_ids.add(j[0])
        
    # get other tracks of the authors
    for i in author_ids:
        for j in query_db(f'SELECT track_id FROM tracks_to_authors WHERE author_id={i} ORDER BY RAND() LIMIT {lim}'):
            track_ids.add(j[0])
    
    # recommend other mashups that include these tracks
    candidates = []
    for ti in tqdm(track_ids, desc='Track based recs'):
        mashup_ids = query_db(f'SELECT mashup_id FROM mashups_to_tracks WHERE track_id={ti} ORDER BY RAND() LIMIT {lim}')
        for mi in mashup_ids:
            mashup_data = get_content_data_point(mi[0])
            dist, neigh_inds = knn.kneighbors(mashup_data, n_neighbors)
            for i in range(n_neighbors):
                candidates.append((dist[0][i], neigh_inds[0][i]))\
            
    return set([pair[1] for pair in sorted(candidates, key=lambda pair: pair[0])][:lim])

In [85]:
params_default = {
    'liked_pop_size': 3,
    'most_listened_pop_size': 3,
    'recently_listened_pop_size': 3,
    'base_l_neighb': 5,
    'base_m_neighb': 5,
    'base_r_neighb': 5,
    'playlist_neighb': 3,
    'playlist_cand_lim': 5,
    'author_neighb': 3,
    'author_cand_lim': 5,
    'track_neighb': 3,
    'track_cand_lim': 5
    } 

def get_rec_list(user_id, **kwargs):
    '''
    :param user_id: id of a user in the db
    :param kwargs: parameter values (if any one is not stated, it is set by default)
    :return: list of recommended mashup db ids. number of elements: base_l_neighb + base_m_neighb + base_r_neighb + additional,
    where additional <= playlist_cand_lim+author_cand_lim+track_cand_lim
    '''
    params = params_default
    if kwargs != {}:
        for key, value in kwargs.items():
            params[key] = value
           
    population_ids = get_pop_ids(user_id, params['liked_pop_size'], params['most_listened_pop_size'], params['recently_listened_pop_size'])
    populations = get_pop_data(*population_ids)

    # get base (already filtered from already liked)
    base_cand = select_base_candidates(*populations, params['base_l_neighb'], params['base_m_neighb'], params['base_r_neighb'])
    
    # get additional without already liked
    playlist_cand = filter_already_liked(user_id, select_playlist_candidates(user_id, *population_ids, params['playlist_neighb'], params['playlist_cand_lim']))
    author_cand = filter_already_liked(user_id, select_author_candidates(*population_ids, params['author_neighb'], params['author_cand_lim']))
    track_cand = filter_already_liked(user_id, select_track_candidates(*population_ids, params['track_neighb'], params['track_cand_lim']))
    
    total_cand = list(base_cand) + playlist_cand + author_cand + track_cand
    res = []
    for ind in total_cand:
        if ind >= len(ids):
            print(f'Index {ind} out of bounds.')
        else:
            res.append(ids[ind][0])
    return res

In [86]:
get_rec_list(2)

(3, 5) (3, 5) (3, 5)


Playlist ids from pop: 100%|██████████| 3/3 [00:00<00:00, 40.86it/s]
Playlist ids from pop: 100%|██████████| 3/3 [00:00<00:00, 48.19it/s]
Playlist ids from pop: 100%|██████████| 3/3 [00:00<00:00, 50.24it/s]
Playlist based recs: 100%|██████████| 19/19 [00:07<00:00,  2.59it/s]
Author ids from pop: 100%|██████████| 3/3 [00:00<00:00, 46.15it/s]
Author ids from pop: 100%|██████████| 3/3 [00:00<00:00, 49.03it/s]
Author ids from pop: 100%|██████████| 3/3 [00:00<00:00, 51.08it/s]
Author based recs: 100%|██████████| 9/9 [00:03<00:00,  2.35it/s]
Track ids from pop: 100%|██████████| 3/3 [00:00<00:00, 40.65it/s]
Track ids from pop: 100%|██████████| 3/3 [00:00<00:00, 49.78it/s]
Track ids from pop: 100%|██████████| 3/3 [00:00<00:00, 50.91it/s]
Track based recs: 100%|██████████| 76/76 [00:05<00:00, 13.29it/s]


[681,
 673,
 715,
 709,
 677,
 718,
 677,
 709,
 715,
 681,
 709,
 677,
 718,
 715,
 681,
 709,
 677,
 715,
 718,
 681,
 718,
 677,
 709,
 715,
 681,
 718,
 677,
 709,
 715,
 681,
 709,
 715,
 677,
 681,
 718,
 718,
 677,
 709,
 715,
 681,
 718,
 677,
 709,
 715,
 681,
 718,
 681,
 681,
 718]

<h1>Collaborative Filtering (TODO)</h1>
Find users that liked the songs among total_candidates.<br>
If there is not enough such users ( < num_users), then find users that listened these songs more than once.<br>
If still not enough, select randomly.<br>
In fact, this is an heuristic instead of KNN, which would include measuring the similarity of a given user and each other user.

In [70]:
# hyperparameters
num_collab_users = 10
num_collab_recs = 15

In [71]:
collab_users = []
for ind in total_candidates:
    candidate_id = ids[ind][0]
    response = query_db(f'SELECT user_id FROM mashups_likes WHERE mashup_id={candidate_id} LIMIT {num_collab_users}')
    print(response)
    break
    
...

TypeError: 'ellipsis' object is not iterable

Suggest to the given user such songs that they haven't listened yet, but similar users liked.

In [None]:
...