In [42]:
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
from sklearn.cluster import KMeans
import math
import pprint

# Shared pretty-printer used by the inspection cells further down.
pp = pprint.PrettyPrinter()
# Connect to a MongoDB server on localhost, default port.
client = MongoClient('localhost', 27017)
# Alternative: connect to the hosted cluster instead of localhost.
# SECURITY: the connection string that used to be pasted here embedded a username and
# password in plain text. It has been redacted; treat those credentials as leaked
# (rotate them) and load the URI from the environment instead (requires `import os`):
#client = MongoClient(os.environ['MONGODB_URI'])
# Handle to the `server` database; every query in this notebook goes through `db`.
db = client.server

def euclidian_distance(pointA, pointB):
    """Return the Euclidean (L2) distance between two points.

    Coordinates are paired positionally with ``zip``, so when the two
    points differ in length the extra coordinates of the longer one are
    ignored.
    """
    squared_total = 0
    for coord_a, coord_b in zip(pointA, pointB):
        squared_total = squared_total + (coord_a - coord_b) ** 2

    return math.sqrt(squared_total)


def sum_distances(centroids, labels, values):
    """Return the sum of squared distances from each value to its assigned centroid.

    ``labels[i]`` is used as the index into ``centroids`` for ``values[i]``.
    Values and labels are paired with ``zip``, so surplus entries in the
    longer of the two are ignored.
    """
    accumulated = 0

    for point, cluster_idx in zip(values, labels):
        assigned_centroid = centroids[cluster_idx]
        accumulated = accumulated + euclidian_distance(point, assigned_centroid) ** 2

    return accumulated

def k_means(values, k, random_state=None):
    """Cluster ``values`` into ``k`` groups with scikit-learn's KMeans.

    Parameters
    ----------
    values : sequence of feature vectors, passed straight to ``KMeans.fit``.
    k : number of clusters.
    random_state : optional seed forwarded to ``KMeans``. The default
        (``None``) keeps the previous unseeded behaviour; pass an int to make
        the clustering reproducible across runs.

    Returns
    -------
    tuple ``(centroids, labels, total_distance)`` where ``centroids`` is
    ``clf.cluster_centers_``, ``labels`` is ``clf.labels_`` and
    ``total_distance`` is the sum of squared distances of every value to its
    assigned centroid, as computed by ``sum_distances``.
    """
    clf = KMeans(n_clusters=k, random_state=random_state)
    clf.fit(values)

    centroids = clf.cluster_centers_
    labels = clf.labels_

    total_distance = sum_distances(centroids, labels, values)

    return (centroids, labels, total_distance)

# def standart_deviation(mean, values):
#   sd_vector = []
#   for i in range(len(mean)):
#       d_sum = 0
#       for value in values:
#           d_sum += (mean[i] - value[i]) ** 2
#       sd = math.sqrt(d_sum / (len(values)-1))
#       sd_vector.append(sd)
#   return sd_vector

# Audio features used for clustering, in the fixed order in which feature
# vectors are built.
_FEATURE_NAMES = (
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'speechiness',
    'valence',
)

# Projection shared by both track queries: the track id plus the features above.
_TRACK_PROJECTION = {'_id': 0, 'id': 1}
_TRACK_PROJECTION.update({'features.' + name: 1 for name in _FEATURE_NAMES})


def _feature_vector(item):
    """Return the track document's feature values in ``_FEATURE_NAMES`` order."""
    features = item['features']
    return [features[name] for name in _FEATURE_NAMES]


def generate_recommendation(user, tracks_ids, k=8, min_popularity = 50):
    """Score candidate tracks against clusters built from a user's own tracks.

    Parameters
    ----------
    user : mapping with a ``'tracks_ids'`` entry listing the ids of the
        tracks used to build the clusters.
    tracks_ids : ids of the candidate tracks to score.
    k : number of clusters for k-means.
    min_popularity : accepted for interface compatibility; it is currently
        not used by this function.

    Returns
    -------
    list of ``(performance, track_id)`` tuples as produced by
    ``evaluate_tracks`` (sorted by descending performance).

    Raises
    ------
    KeyError if a fetched track document lacks ``'features'`` or one of the
    features listed in ``_FEATURE_NAMES``.
    """
    # Get user tracks
    user_tracks = list(db.tracks.find({'id': {'$in': user['tracks_ids']}}, _TRACK_PROJECTION))

    # Extract values and run kmeans. Features are read by name, in a fixed
    # order, rather than via dict.values(): the order of values() follows the
    # key order of each returned document, which would silently misalign
    # columns if two documents listed their features in a different order.
    values = [_feature_vector(item) for item in user_tracks]
    (centroids, labels, total_distance) = k_means(values, k)

    # Get all candidate tracks as (id, feature vector) pairs
    results = db.tracks.find({'id': {'$in': tracks_ids}}, _TRACK_PROJECTION)
    tracks = [(item['id'], _feature_vector(item)) for item in results]

    # Evaluate tracks
    evaluated_tracks = evaluate_tracks(centroids, labels, tracks)

    return evaluated_tracks


def evaluate_tracks(centroids, labels, tracks):
    """Score every track and return the scores ranked from best to worst.

    Each element of ``tracks`` is passed to ``recommendation_performance``;
    its first item (``track[0]``) is kept as the identifier. The result is a
    list of ``(performance, identifier)`` tuples sorted in descending order.
    """
    scored = [
        (recommendation_performance(candidate, centroids, labels), candidate[0])
        for candidate in tracks
    ]
    return sorted(scored, reverse=True)

def recommendation_performance(track, centroids, labels):
    """Score how well a track fits the clusters described by ``centroids``/``labels``.

    The score is ``cluster_size_ratio * (1 - distance)`` where ``distance`` is
    the Euclidean distance from the track's feature vector (``track[1]``) to
    its closest centroid, and ``cluster_size_ratio`` is the number of labels
    assigned to that closest cluster divided by the size of the largest
    cluster.

    Parameters
    ----------
    track : sequence whose second item (``track[1]``) is the feature vector.
    centroids : sequence of centroid vectors; a centroid's position in the
        sequence is its cluster index.
    labels : iterable of cluster indices, one per clustered point.

    Raises
    ------
    ZeroDivisionError if no label matches any centroid index (for example
    when ``labels`` or ``centroids`` is empty).
    """
    # Count the members of every cluster once, instead of converting and
    # scanning the whole label array again for each centroid.
    cluster_sizes = {}
    for label in labels:
        cluster_sizes[label] = cluster_sizes.get(label, 0) + 1

    # Find the closest centroid and the biggest cluster size. Start from
    # infinity (not an arbitrary large number) so the closest centroid is
    # found even when every distance is large, e.g. with unnormalised features.
    lesser_distance = math.inf
    greatest_count = 0
    cluster_index = 0
    for i, c in enumerate(centroids):
        distance = euclidian_distance(c, track[1])
        # Strict comparison: on ties the first (lowest-index) centroid wins.
        if distance < lesser_distance:
            lesser_distance = distance
            cluster_index = i
        greatest_count = max(greatest_count, cluster_sizes.get(i, 0))

    # Ratio of the chosen cluster's size in relation to the biggest cluster
    cluster_size_ratio = cluster_sizes.get(cluster_index, 0) / greatest_count

    # NOTE: an earlier, never-enabled draft also weighted the score by the
    # cluster's average deviation; that idea is not implemented here.
    return cluster_size_ratio * (1 - lesser_distance)


In [69]:
# Hold-out evaluation of the recommender for a single user: split the user's
# tracks into N_FOLDS consecutive chunks, hide one chunk at a time, build the
# clusters from the remaining tracks and count how many hidden tracks still
# score at least SCORE_THRESHOLD.
N_FOLDS = 10
SCORE_THRESHOLD = 0.3

user = db.users.find_one({'id': '12152580425'})
user_tracks = list(user['tracks_ids'])
chunk_size = len(user_tracks) // N_FOLDS
if chunk_size == 0:
    # range()/slicing with a zero-sized chunk cannot form any fold.
    raise ValueError('user needs at least %d tracks for a %d-fold evaluation' % (N_FOLDS, N_FOLDS))

hit_rate_sum = 0
# Iterate over exactly N_FOLDS folds so the final average always divides by
# the number of folds actually evaluated; tracks left over after the last
# full chunk are never held out.
for fold in range(N_FOLDS):
    start = fold * chunk_size
    test_data = user_tracks[start:start + chunk_size]
    # Set for O(1) membership tests; assumes track ids are hashable
    # (presumably strings; verify against the stored documents).
    held_out = set(test_data)

    # Train on a shallow copy so the fetched `user` document is not mutated.
    train_user = dict(user, tracks_ids=[item for item in user_tracks if item not in held_out])
    evaluated_tracks = generate_recommendation(train_user, user_tracks)

    # Scores of the hidden tracks, then those reaching the threshold.
    held_out_scored = [item for item in evaluated_tracks if item[1] in held_out]
    held_out_hits = [item for item in held_out_scored if item[0] >= SCORE_THRESHOLD]

    # NOTE(review): the printed ratio divides by chunk_size - 5 while the
    # running average below divides by the number of scored hidden tracks;
    # the reason for the "- 5" is not visible here. Kept as-is so the printed
    # values stay comparable with earlier runs; confirm which denominator is intended.
    print(len(held_out_hits), (chunk_size)-5, (len(held_out_hits)/((chunk_size)-5)))
    hit_rate_sum += len(held_out_hits)/len(held_out_scored)
print(hit_rate_sum/N_FOLDS)

101 142 0.7112676056338029
97 142 0.6830985915492958
105 142 0.7394366197183099
89 142 0.6267605633802817
124 142 0.8732394366197183
108 142 0.7605633802816901
112 142 0.7887323943661971
124 142 0.8732394366197183
113 142 0.795774647887324
127 142 0.8943661971830986
0.7482993197278912


In [20]:
# Total number of track documents. Collection.count() is deprecated (and
# removed in PyMongo 4); count_documents({}) is the supported exact count.
db.tracks.count_documents({})

  """Entry point for launching an IPython kernel.


90713

In [44]:
# Pretty-print the `id` field of every document in the users collection.
users = db.users.find()
user_ids = [doc['id'] for doc in users]
pp.pprint(user_ids)

['12152580425', '12151527942']


In [None]:
# NOTE(review): unexecuted scratch cell holding a bare numeric literal; where the
# value came from is not shown in this notebook. Consider deleting the cell.
5.870748299319728