# KNN approach parameters analysis

To get the best out of the KNN model, different features are tested to get the best feature subset that should be used. Also, different K values and different distance metrics are evaluated.

In [82]:
# import dependencies

import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split


In [83]:
# Loading in songs data

song_data_df = pd.read_csv('../output.csv', header=0)
song_data_df = song_data_df.drop_duplicates(subset=['song_id', 'hotttness', 'familiarity', 'loudness', 'tempo', 'key', 'key_confidence', 'mode', 'mode_confidence'])
song_data_df = song_data_df.reset_index(drop=True)

In [73]:
def get_song_feature_array(user_songs, features, scaler):
    user_songs_data = song_data_df[song_data_df['song_id'].isin(user_songs)]
    rel_data = user_songs_data[features]
    s_feat_arr = scaler.transform(rel_data)
    return s_feat_arr

def content_evaluate(expected, actual):
    '''
        Function that calculates precision and recall based on expected and actual recommendations.
        Precision = TP / (TP + FP)
        Recall = TP / (TP + FN)
    '''
    # intersect is true positives
    intersect_list = [value for value in expected if value in set(actual)]

    # len(actual) is TP + FP
    # len(expected) is TP + FN
    score = len(intersect_list)
    return score/len(actual), score / len(expected)

def run_model(model, feature_array, test_set, n):
    similar_song_ids = model.kneighbors(feature_array, return_distance=False).flatten()
    
    start_ind = len(feature_array)
    end_ind = int(start_ind+n)
    
    topNsongs = similar_song_ids[start_ind:end_ind]
    nearest_n = song_data_df.iloc[topNsongs,:][['song_id']]
    nearest_n_list = nearest_n['song_id'].tolist()
    
    precision, recall = content_evaluate(test_set, nearest_n_list)
    return precision, recall

## Feature subset analysis

Analysis of what features of songs should be selected as relevant to describe each song.
Following groups of features are tested:

```
relevant_features1 = ['hotttness']
relevant_features2 = ['hotttness','familiarity','loudness']
relevant_features3 = ['hotttness','familiarity','loudness', 'duration','tempo','key','mode']
relevant_features4 = ['hotttness','familiarity','key','mode']
relevant_features5 = ['hotttness','familiarity']
relevant_features6 = ['familiarity']
```

In [93]:
relevant_features_subsets = [['hotttness'], ['hotttness','familiarity','loudness'], ['hotttness','familiarity','loudness', 'duration','tempo','key','mode'], ['hotttness','familiarity','key','mode'],  ['hotttness','familiarity'], ['familiarity']]

def create_scalers_and_models_for_different_features(features_list):
    models, scalers = [], []
    for feature_subset in features_list:
        train_data = song_data_df[feature_subset]
        
        scaler = preprocessing.RobustScaler()
        train_data_normalized = scaler.fit_transform(train_data)
        
        KNN = NearestNeighbors(n_neighbors=1000)
        KNN.fit(train_data)
        
        models.append(KNN)
        scalers.append(scaler)
        
    return scalers, models

scalers, models = create_scalers_and_models_for_different_features(relevant_features_subsets)

In [72]:
#Loading in the user data

user_data_df = pd.read_csv('../output_plays.csv')
user_data_df.columns = ['userID', 'songID', 'playCount']

user_data_df = user_data_df[user_data_df['songID'].isin(song_data_df['song_id'].unique().tolist())]

user_id_list = user_data_df['userID'].unique().tolist()


In [75]:
user_data_df.shape

(1253243, 3)

In [79]:
np.random.seed(seed=42)
samples = np.random.choice(user_id_list, 10000)

In [95]:
def calculate_precision_and_recall_for_different_features(user_id):
    user_listens_data = user_data_df.loc[user_data_df['userID'] == user_id]
    
    # all unique songs this specific user listens
    song_id_list = user_listens_data['songID'].unique().tolist()
    
    if len(song_id_list) < 4:
        return [], []
    
    # split songs user listens in 75:25
    songs_i_train, songs_i_test = train_test_split(song_id_list, test_size=0.25, random_state=42)
    
    precisions, recalls = [], []
    
    for features, scaler, model in zip(relevant_features_subsets, scalers, models):

        # get normalized song feature array
        song_feature_arr = get_song_feature_array(songs_i_train, features, scaler)
        p, r = run_model(model, song_feature_arr, songs_i_test, 30)
        precisions.append(p)
        recalls.append(r)
        
    return precisions, recalls

counter = 1

precisions, recalls = [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]
for user_id in samples:
    user_precisions, user_recalls = calculate_precision_and_recall_for_different_features(user_id)
    if not user_precisions or not user_recalls:
        continue
    
    precisions = [precision + user_precisions[idx] for idx, precision in enumerate(precisions)]
    recalls = [recall + user_recalls[idx] for idx, recall in enumerate(recalls)]
    
    counter += 1
    
    if counter >= 1000:
        break

for precision, recall, features in zip(precisions, recalls, relevant_features_subsets):
    print(f"For {features} features, average precision: {precision / 1000}, average recall: {recall / 1000}")

For ['hotttness'] features, average precision: 0.0014333333333333342, average recall: 0.02615
For ['hotttness', 'familiarity', 'loudness'] features, average precision: 0.0004999999999999999, average recall: 0.009416666666666667
For ['hotttness', 'familiarity', 'loudness', 'duration', 'tempo', 'key', 'mode'] features, average precision: 6.666666666666667e-05, average recall: 0.001
For ['hotttness', 'familiarity', 'key', 'mode'] features, average precision: 0.0005666666666666666, average recall: 0.01103333333333333
For ['hotttness', 'familiarity'] features, average precision: 0.0014333333333333342, average recall: 0.025709523809523807
For ['familiarity'] features, average precision: 0.001333333333333334, average recall: 0.022


## Number K analysis

Analysis of best K number for KNN algorithm is given below, where following values are taken in consideration: `[10, 25, 50, 100, 200, 400]`

In [96]:
possible_K_values = [10, 25, 50, 100, 200, 400]

features = relevant_features_subsets[4]
scaler = scalers[4]
model = models[4]

def calculate_precision_and_recall_for_different_K(user_id):
    user_listens_data = user_data_df.loc[user_data_df['userID'] == user_id]
    
    # all unique songs this specific user listens
    song_id_list = user_listens_data['songID'].unique().tolist()
    
    if len(song_id_list) < 4:
        return [], []
    
    # split songs user listens in 75:25
    songs_i_train, songs_i_test = train_test_split(song_id_list, test_size=0.25, random_state=42)
    
    precisions, recalls = [], []
    
    song_feature_arr = get_song_feature_array(songs_i_train, features, scaler)
    
    for K in possible_K_values:
        
        p, r = run_model(model, song_feature_arr, songs_i_test, K)
        precisions.append(p)
        recalls.append(r)
        
    return precisions, recalls


counter = 1

precisions, recalls = [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]
for user_id in samples:
    user_precisions, user_recalls = calculate_precision_and_recall_for_different_K(user_id)
    if not user_precisions or not user_recalls:
        continue
    
    precisions = [precision + user_precisions[idx] for idx, precision in enumerate(precisions)]
    recalls = [recall + user_recalls[idx] for idx, recall in enumerate(recalls)]
    
    counter += 1
    
    if counter >= 1000:
        break

for precision, recall, K in zip(precisions, recalls, possible_K_values):
    print(f"For K = {K}, average precision: {precision / 1000}, average recall: {recall / 1000}")

For K = 10, average precision: 0.0019000000000000006, average recall: 0.011833333333333333
For K = 25, average precision: 0.0014800000000000006, average recall: 0.022376190476190473
For K = 50, average precision: 0.0011800000000000007, average recall: 0.0347095238095238
For K = 100, average precision: 0.001250000000000001, average recall: 0.0706857142857143
For K = 200, average precision: 0.001114999999999999, average recall: 0.12993571428571427
For K = 400, average precision: 0.0007974999999999959, average recall: 0.1871357142857143


## Distance metric analysis

Analysis to find best distance metric to use for KNN model. Possible values that are taken into consideration are: `['chebyshev', 'minkowski', 'manhattan', 'euclidean']`

In [97]:
distances = ['chebyshev', 'minkowski', 'manhattan', 'euclidean']
features = relevant_features_subsets[4]

train_data = song_data_df[features]

scaler = preprocessing.RobustScaler()
train_data_normalized = scaler.fit_transform(train_data)

KNN1 = NearestNeighbors(n_neighbors=1000,metric='chebyshev')
KNN1.fit(train_data_normalized)

KNN2 = NearestNeighbors(n_neighbors=1000, metric='minkowski', p=4)
KNN2.fit(train_data_normalized)

KNN3 = NearestNeighbors(n_neighbors=1000, metric='manhattan')
KNN3.fit(train_data_normalized)

KNN4 = NearestNeighbors(n_neighbors=1000)
KNN4.fit(train_data_normalized) # euclidean

models = [KNN1, KNN2, KNN3, KNN4]


def calculate_precision_and_recall_for_different_models(user_id):
    user_listens_data = user_data_df.loc[user_data_df['userID'] == user_id]
    
    # all unique songs this specific user listens
    song_id_list = user_listens_data['songID'].unique().tolist()
    
    if len(song_id_list) < 4:
        return [], []
    
    # split songs user listens in 75:25
    songs_i_train, songs_i_test = train_test_split(song_id_list, test_size=0.25, random_state=42)
    
    precisions, recalls = [], []
    
    song_feature_arr = get_song_feature_array(songs_i_train, features, scaler)
    
    for model in models:
        p, r = run_model(model, song_feature_arr, songs_i_test, 100)
        precisions.append(p)
        recalls.append(r)
        
    return precisions, recalls


counter = 1

precisions, recalls = [0, 0, 0, 0], [0, 0, 0, 0]
for user_id in samples:
    user_precisions, user_recalls = calculate_precision_and_recall_for_different_models(user_id)
    if not user_precisions or not user_recalls:
        continue
    
    precisions = [precision + user_precisions[idx] for idx, precision in enumerate(precisions)]
    recalls = [recall + user_recalls[idx] for idx, recall in enumerate(recalls)]
    
    counter += 1
    
    if counter >= 1000:
        break

for precision, recall, distance in zip(precisions, recalls, distances):
    print(f"For distance {distance}, average precision: {precision / 1000}, average recall: {recall / 1000}")



For distance chebyshev, average precision: 0.0008100000000000006, average recall: 0.04491666666666667
For distance minkowski, average precision: 0.0008000000000000005, average recall: 0.04391666666666667
For distance manhattan, average precision: 0.0008200000000000006, average recall: 0.04675000000000001
For distance euclidean, average precision: 0.0008100000000000006, average recall: 0.044916666666666674
