### 1. Spark消费Kafka中实时采集的音乐数据

In [281]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

kafka_topic_name = "songTopic"
kafka_bootstrap_servers = 'node120:9092'
spark = SparkSession \
        .builder \
        .appName("streaming") \
        .master("local[4]") \
        .getOrCreate()
#设置日志级别
spark.sparkContext.setLogLevel("ERROR")

In [282]:
#加载Kafka中的数据
songs_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", kafka_topic_name) \
    .option("startingOffsets", "latest") \
    .load()

In [283]:
songs_df1 = songs_df.selectExpr("CAST(value AS STRING)", "timestamp")

songs_schema_string = "order_id INT,id STRING, name STRING,popularity INT, " \
                      "duration_ms DOUBLE, explicit INT, " \
                      + "artists STRING, id_artists STRING, release_date STRING, " \
                      + "danceability DOUBLE," \
                      + "energy DOUBLE, key INT, loudness DOUBLE, " \
                      + "mode INT," \
                      + "speechiness DOUBLE," \
                      + "acousticness DOUBLE, instrumentalness DOUBLE, liveness DOUBLE, " \
                      + "valence DOUBLE, tempo DOUBLE, time_signature DOUBLE"

songs_df2 = songs_df1 \
    .select(from_csv(col("value"), songs_schema_string) \
     .alias("song"), "timestamp")

songs_df3 = songs_df2.select("song.*", "timestamp")
songs_df3.createOrReplaceTempView("song_find")

In [284]:
#获取音乐数据
song_find_text = spark.sql("SELECT * FROM song_find")

In [285]:
#输出到新的表中【testedTable5】
songs_agg_write_stream = song_find_text \
        .writeStream \
        .trigger(processingTime='5 seconds') \
        .outputMode("append") \
        .option("truncate", "false") \
        .format("memory") \
        .queryName("testedTable1") \
        .start()
songs_agg_write_stream.awaitTermination(1)

IllegalArgumentException: Cannot start query with name testedTable1 as a query with that name is already active in this SparkSession

In [None]:
# 测试是否将采集的数据存储到表中
result = spark.sql("SELECT * FROM testedTable1")
result.show()

### 2. 数据预处理
#### 2.1 消费数据并把喜欢的歌曲加入采集的数据集中

In [286]:
from spotifyRec import spotify_api
import random
# get song data
song_data = spotify_api.get_song_data()
song_data = song_data.drop(['id', 'added_at', 'time_signature','duration_s'], axis='columns')

SpotifyOauthError: error: invalid_grant, error_description: Refresh token revoked

In [None]:
# 从我们的 Spotify 喜欢的歌曲播放列表中检索一首随机歌曲
rand_n = random.randint(0,len(song_data)-1)
# get DataFrame object
add_df = song_data.iloc[rand_n: rand_n+1, :]
print(add_df.columns)
add_df

In [None]:
df = spark.sql("SELECT * FROM testedTable1")
df = df.sort(df.release_date.desc())
# 从采集的数据集中删除不需要的列
df = df.drop('order_id', 'id', 'explicit', 'release_date',
                 'id_artists','time_signature','duration_ms', 'timestamp')
df

In [None]:
df_sp = spark.createDataFrame(add_df)
df_sp

知识点：
* union: 两个df合并，但是不按列名进行合并，而是位置,列名以前表为准(a.union(b) 列名顺序以a为准)
* unionAll:同union方法
* unionByName:合并时按照列名进行合并，而不是位置

In [287]:
# 将我们喜欢的音乐加入到这个数据集中  由于两者列名顺序不同，需要使用unionByName(类型按照为df原先的类型)
union_df = df.unionByName(df_sp)
union_df

AnalysisException: Cannot resolve column name "artistsIndex" among (name, artists, popularity, energy, danceability, speechiness, tempo, valence, liveness, key, acousticness, loudness, mode, instrumentalness)

In [None]:
# 测试效果 ，正常显示popularity（因为df和df_sp第二个类名不同）
union_df.filter('''name = "Starlight (Taylor's Version)"''').select("name", "popularity").show()

#### 2.2 特征向量化

In [288]:
union_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- popularity: long (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: long (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)



In [289]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol = 'artists' , outputCol = 'artistsIndex')
model = indexer.fit(union_df)
idexed = model.transform(union_df)
idexed.show(1)

+--------------------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+------+------------+
|                name|popularity|     artists|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence| tempo|artistsIndex|
+--------------------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+------+------------+
|I've Got You Unde...|        68|FrankSinatra|       0.585| 0.247|  1| -12.612|   1|       0.04|       0.452|         8.85E-6|   0.107|  0.591|127.15|         1.0|
+--------------------+----------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+------+------------+
only showing top 1 row



In [290]:
# features "name" and " artists" not add
input_cols = idexed.columns[1:]
input_cols.remove("artists")
input_cols

['popularity',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'artistsIndex']

In [291]:
from pyspark.ml.feature import VectorAssembler
# VectorAssembler是将给定列列表组合成单个向量列的转换器。为了训练逻辑回归和决策树等ML模型，将原始特征和不同特征转换器生成的特征组合成一个特征向量是很有用的。VectorAssembler接受以下输入列类型:所有数值类型、布尔类型和向量类型。在每一行中，输入列的值将按照指定的顺序连接到一个向量中。
assembler=VectorAssembler(inputCols=input_cols, outputCol='features')
# 模型训练
assembled_data=assembler.setHandleInvalid("skip").transform(idexed)
assembled_data

DataFrame[name: string, popularity: bigint, artists: string, danceability: double, energy: double, key: bigint, loudness: double, mode: bigint, speechiness: double, acousticness: double, instrumentalness: double, liveness: double, valence: double, tempo: double, artistsIndex: double, features: vector]

In [292]:
assembled_data.head()

Row(name="I've Got You Under My Skin - Remastered 1998", popularity=68, artists='FrankSinatra', danceability=0.585, energy=0.247, key=1, loudness=-12.612, mode=1, speechiness=0.04, acousticness=0.452, instrumentalness=8.85e-06, liveness=0.107, valence=0.591, tempo=127.15, artistsIndex=1.0, features=DenseVector([68.0, 0.585, 0.247, 1.0, -12.612, 1.0, 0.04, 0.452, 0.0, 0.107, 0.591, 127.15, 1.0]))

#### 2.3 数据标准化

In [293]:
from pyspark.ml.feature import StandardScaler
scale = StandardScaler(inputCol='features',outputCol='standardized')
data_scale = scale.fit(assembled_data)
df = data_scale.transform(assembled_data)
df.head()

Row(name="I've Got You Under My Skin - Remastered 1998", popularity=68, artists='FrankSinatra', danceability=0.585, energy=0.247, key=1, loudness=-12.612, mode=1, speechiness=0.04, acousticness=0.452, instrumentalness=8.85e-06, liveness=0.107, valence=0.591, tempo=127.15, artistsIndex=1.0, features=DenseVector([68.0, 0.585, 0.247, 1.0, -12.612, 1.0, 0.04, 0.452, 0.0, 0.107, 0.591, 127.15, 1.0]), standardized=DenseVector([12.5339, 3.7642, 1.1948, 0.2554, -2.0109, 2.0456, 0.5834, 1.8475, 0.0, 1.7815, 2.4613, 3.9792, 0.1361]))

#### 2.4 KMeans聚类

* 轮廓系数（Silhouette Coefficient），是聚类效果好坏的一种评价方式。最早由 Peter J. Rousseeuw 在 1986 提出。它结合内聚度和分离度两种因素。可以用来在相同原始数据的基础上用来评价不同算法、或者算法不同运行方式对聚类结果所产生的影响。
* 平方欧几里得（squaredEuclidean）

In [294]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized',metricName='silhouette', distanceMeasure='squaredEuclidean')

KMeans_algo = KMeans(featuresCol='standardized', k=3)
KMeans_fit = KMeans_algo.fit(df)
output_df = KMeans_fit.transform(df)
output_df

DataFrame[name: string, popularity: bigint, artists: string, danceability: double, energy: double, key: bigint, loudness: double, mode: bigint, speechiness: double, acousticness: double, instrumentalness: double, liveness: double, valence: double, tempo: double, artistsIndex: double, features: vector, standardized: vector, prediction: int]

#### 2.5 推荐系统代码前的数据准备

In [295]:
# 抛弃那个feature vector
columns_to_drop  = ["features", "standardized"]
# spark sql DataFrame
data_sdf = output_df.drop(*columns_to_drop)

print(len(data_sdf.columns))
# python DataFrame
data_df = data_sdf.toPandas()
data_df.drop_duplicates(inplace=True)

16


In [330]:
from spotifyRec.recommender import SpotifyRecommender
import pandas as pd

recommend_num = 10
liked_song_name = add_df.iloc[-1]['name'] # 获取用户收藏音乐的名称
value_cate = data_df.iloc[-1]['prediction'] # 用户收藏音乐对应的类别（k-means预测的）
print(liked_song_name, value_cate)

filtered_data = data_df[data_df['prediction'] == 1]
recommender = SpotifyRecommender(filtered_data)
rec_song = recommender.spotify_recommendations(recommend_num)
rec_song

Starlight (Taylor's Version) 2
*****


Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
28,Singin' In The Rain,53,GeneKelly,0.354,0.241,5,-13.811,1,0.0334,0.889,0.000274,0.165,0.408,135.646
27,Claire de lune,62,ClaudeDebussyAlexisWeissenberg,0.365,0.01,1,-25.268,1,0.0519,0.995,0.924,0.0584,0.0364,135.048
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898
30,'Round Midnight,58,TheloniousMonk,0.542,0.32,8,-9.975,1,0.0387,0.907,0.649,0.0609,0.206,116.648
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997


* 推荐代码测试

In [297]:
filtered_data = data_df[data_df['prediction'] == 1]
filtered_data

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex,prediction
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0,1
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0,1
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0,1
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0,1
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0,1
10,Black Coffee,56,SarahVaughan,0.38,0.155,1,-11.977,1,0.0311,0.973,1.7e-05,0.0977,0.127,94.23,18.0,1
12,Let's Get Lost,55,ChetBaker,0.521,0.37,0,-9.345,1,0.03,0.84,0.000807,0.108,0.55,78.12,2.0,1
13,Time After Time,52,MargaretWhiting,0.236,0.173,6,-13.844,1,0.0319,0.928,0.000623,0.239,0.279,82.342,17.0,1
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0,1
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0,1


In [310]:
filtered_data.iloc[:-1, :-2]

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908
10,Black Coffee,56,SarahVaughan,0.38,0.155,1,-11.977,1,0.0311,0.973,1.7e-05,0.0977,0.127,94.23
12,Let's Get Lost,55,ChetBaker,0.521,0.37,0,-9.345,1,0.03,0.84,0.000807,0.108,0.55,78.12
13,Time After Time,52,MargaretWhiting,0.236,0.173,6,-13.844,1,0.0319,0.928,0.000623,0.239,0.279,82.342
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094


In [298]:
num_features = filtered_data.drop(columns = ['name', 'artists', 'prediction'])
num_features

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex
3,66,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0
4,64,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0
7,65,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0
8,63,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0
9,61,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0
10,56,0.38,0.155,1,-11.977,1,0.0311,0.973,1.7e-05,0.0977,0.127,94.23,18.0
12,55,0.521,0.37,0,-9.345,1,0.03,0.84,0.000807,0.108,0.55,78.12,2.0
13,52,0.236,0.173,6,-13.844,1,0.0319,0.928,0.000623,0.239,0.279,82.342,17.0
15,60,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0
17,68,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0


In [299]:
may_recommend_songs = num_features.iloc[:-1, :]
may_recommend_songs

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex
3,66,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0
4,64,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0
7,65,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0
8,63,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0
9,61,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0
10,56,0.38,0.155,1,-11.977,1,0.0311,0.973,1.7e-05,0.0977,0.127,94.23,18.0
12,55,0.521,0.37,0,-9.345,1,0.03,0.84,0.000807,0.108,0.55,78.12,2.0
13,52,0.236,0.173,6,-13.844,1,0.0319,0.928,0.000623,0.239,0.279,82.342,17.0
15,60,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0
17,68,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0


In [300]:
song =  num_features.iloc[-1, :]
song

popularity           53.00000
danceability          0.50800
energy                0.00766
key                   9.00000
loudness            -37.78300
mode                  0.00000
speechiness           0.09750
acousticness          0.99100
instrumentalness      0.87300
liveness              0.07320
valence               0.58800
tempo               130.91800
artistsIndex         12.00000
Name: 36, dtype: float64

In [301]:
may_recommend_songs - song

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex
3,13.0,0.024,0.05804,-9.0,18.647,0.0,-0.0569,-0.051,-0.86812,0.0214,-0.296,-2.311,-10.0
4,11.0,-0.07,-0.0001,-9.0,11.343,0.0,-0.0593,-0.05,-0.86657,0.0358,-0.311,2.402,-10.0
7,12.0,0.072,0.04684,0.0,23.189,0.0,-0.0502,-0.028,-0.872735,0.0235,-0.411,-7.577,-9.0
8,10.0,-0.012,0.06114,-5.0,22.856,0.0,-0.043,-0.027,-0.87071,0.0388,-0.427,-11.02,-9.0
9,8.0,-0.016,0.05944,1.0,22.112,0.0,-0.0576,-0.033,-0.872032,0.0358,-0.423,-6.01,-9.0
10,3.0,-0.128,0.14734,-8.0,25.806,1.0,-0.0664,-0.018,-0.872983,0.0245,-0.461,-36.688,6.0
12,2.0,0.013,0.36234,-9.0,28.438,1.0,-0.0675,-0.151,-0.872193,0.0348,-0.038,-52.798,-10.0
13,-1.0,-0.272,0.16534,-3.0,23.939,1.0,-0.0656,-0.063,-0.872377,0.1658,-0.309,-48.576,5.0
15,7.0,-0.218,0.07974,-9.0,21.664,1.0,-0.0629,-0.135,-0.872043,0.0358,-0.5146,-15.921,-11.0
17,15.0,-0.159,0.17434,-4.0,24.276,1.0,-0.0665,-0.071,-0.8578,0.0698,-0.41,5.176,-12.0


In [302]:
distance_vector = np.sum(np.abs(may_recommend_songs - song), axis=1).rename("distance")
distance_vector

3      54.333460
4      45.137770
7      53.270275
8      59.355650
9      47.618872
10     82.212223
12    104.774833
13     84.428117
15     67.503083
17     63.260440
18     86.233950
19     95.035540
20     85.287440
21     92.116240
26     94.140420
27     38.457340
28     36.397966
30     61.185440
31     73.392840
32     72.888236
33    107.271440
35     76.414938
Name: distance, dtype: float64

In [303]:
temp_result = pd.concat([filtered_data, distance_vector], axis=1)
temp_result

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex,prediction,distance
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0,1,54.33346
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0,1,45.13777
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0,1,53.270275
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0,1,59.35565
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0,1,47.618872
10,Black Coffee,56,SarahVaughan,0.38,0.155,1,-11.977,1,0.0311,0.973,1.7e-05,0.0977,0.127,94.23,18.0,1,82.212223
12,Let's Get Lost,55,ChetBaker,0.521,0.37,0,-9.345,1,0.03,0.84,0.000807,0.108,0.55,78.12,2.0,1,104.774833
13,Time After Time,52,MargaretWhiting,0.236,0.173,6,-13.844,1,0.0319,0.928,0.000623,0.239,0.279,82.342,17.0,1,84.428117
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0,1,67.503083
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0,1,63.26044


In [306]:
result_data = temp_result.sort_values('distance')
result_data

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex,prediction,distance
28,Singin' In The Rain,53,GeneKelly,0.354,0.241,5,-13.811,1,0.0334,0.889,0.000274,0.165,0.408,135.646,13.0,1,36.397966
27,Claire de lune,62,ClaudeDebussyAlexisWeissenberg,0.365,0.01,1,-25.268,1,0.0519,0.995,0.924,0.0584,0.0364,135.048,9.0,1,38.45734
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0,1,45.13777
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0,1,47.618872
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0,1,53.270275
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0,1,54.33346
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0,1,59.35565
30,'Round Midnight,58,TheloniousMonk,0.542,0.32,8,-9.975,1,0.0387,0.907,0.649,0.0609,0.206,116.648,23.0,1,61.18544
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0,1,63.26044
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0,1,67.503083


In [307]:
result_data[:10]

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artistsIndex,prediction,distance
28,Singin' In The Rain,53,GeneKelly,0.354,0.241,5,-13.811,1,0.0334,0.889,0.000274,0.165,0.408,135.646,13.0,1,36.397966
27,Claire de lune,62,ClaudeDebussyAlexisWeissenberg,0.365,0.01,1,-25.268,1,0.0519,0.995,0.924,0.0584,0.0364,135.048,9.0,1,38.45734
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32,2.0,1,45.13777
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908,3.0,1,47.618872
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341,3.0,1,53.270275
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607,2.0,1,54.33346
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898,3.0,1,59.35565
30,'Round Midnight,58,TheloniousMonk,0.542,0.32,8,-9.975,1,0.0387,0.907,0.649,0.0609,0.206,116.648,23.0,1,61.18544
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094,0.0,1,63.26044
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997,1.0,1,67.503083


In [309]:
result_data.iloc[:10, :-3]

Unnamed: 0,name,popularity,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
28,Singin' In The Rain,53,GeneKelly,0.354,0.241,5,-13.811,1,0.0334,0.889,0.000274,0.165,0.408,135.646
27,Claire de lune,62,ClaudeDebussyAlexisWeissenberg,0.365,0.01,1,-25.268,1,0.0519,0.995,0.924,0.0584,0.0364,135.048
4,My Funny Valentine,64,ChetBaker,0.438,0.00756,0,-26.44,0,0.0382,0.941,0.00643,0.109,0.277,133.32
9,I'm Glad There Is You,61,JulieLondon,0.492,0.0671,10,-15.671,0,0.0399,0.958,0.000968,0.109,0.165,124.908
7,I'm In The Mood For Love,65,JulieLondon,0.58,0.0545,9,-14.594,0,0.0473,0.963,0.000265,0.0967,0.177,123.341
3,I Fall In Love Too Easily - Vocal Version,66,ChetBaker,0.532,0.0657,0,-19.136,0,0.0406,0.94,0.00488,0.0946,0.292,128.607
8,Cry Me A River,63,JulieLondon,0.496,0.0688,4,-14.927,0,0.0545,0.964,0.00229,0.112,0.161,119.898
30,'Round Midnight,58,TheloniousMonk,0.542,0.32,8,-9.975,1,0.0387,0.907,0.649,0.0609,0.206,116.648
17,Unforgettable,68,NatKingCole,0.349,0.182,5,-13.507,1,0.031,0.92,0.0152,0.143,0.178,136.094
15,In The Wee Small Hours Of The Morning - Remast...,60,FrankSinatra,0.29,0.0874,0,-16.119,1,0.0346,0.856,0.000957,0.109,0.0734,114.997
