In [1]:
import findspark
findspark.init() 

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.stat import Correlation
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType, StringType
import matplotlib.pyplot as plt 
import seaborn as sns
import pyspark.sql.types as T
import pandas as pd

In [4]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

def fudf(val):
    return functools.reduce(lambda x, y:x+y, val)

In [53]:
df = spark.read.json('C:\\Users\\pasqu\\Desktop\\Università\\Distributed Data Analysis and Mining\\cleaned_dataset')

In [12]:
df.count()

583363

In [40]:
df.show()

+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|acousticness|               age|avg_artist_followers|avg_artist_popularity|danceability|duration_ms|energy|              genres|            id_track|instrumentalness|key|liveness|loudness|mode|popularity_track|release_date|speechiness|sum_artist_followers|sum_artist_popularity|  tempo|time_signature|valence|
+------------+------------------+--------------------+---------------------+------------+-----------+------+--------------------+--------------------+----------------+---+--------+--------+----+----------------+------------+-----------+--------------------+---------------------+-------+--------------+-------+
|       0.658|41.821917808219176|              5403.5|             

# Data Transformation

Vector Assembler. A vector assembler is a transformer that converts a set of features into a single vector column often referred to as an array of features 

In [54]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["avg_artist_popularity","duration_ms", "danceability", "loudness", "speechiness","acousticness","instrumentalness","liveness","valence","tempo","time_signature","age", "mode","sum_artist_followers","sum_artist_popularity"],
    outputCol="features")

df = assembler.transform(df)

In [55]:
df.select('features').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                |
+------------------------------------------------------------------------------------------------------------------------+
|[40.0,156067.0,0.602,-6.667,0.404,0.658,0.0,0.0972,0.65,182.229,3.0,41.821917808219176,1.0,10807.0,80.0]                |
|[43.0,220133.0,0.77,-7.306,0.172,0.543,7.96E-4,0.0684,0.898,135.573,4.0,45.824657534246576,1.0,19833.0,43.0]            |
|[68.0,250960.0,0.212,-6.69,0.14,4.8E-5,0.918,0.324,0.231,140.917,4.0,25.673972602739727,0.0,874600.0,68.0]              |
|[42.0,457040.0,0.362,-17.744,0.0398,0.144,0.827,0.117,0.257,118.853,4.0,31.813698630136987,0.0,69129.0,42.0]            |
|[68.0,282891.0,0.343,-14.937,0.0384,0.957,2.49E-4,0.661,0.101,144.533,4.0,4.071232876712329,0.0,1709414.0,68.0]         |
|[69.0,217000.0,

Standardizzo i valori del vettore

In [56]:
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df)

# Normalize each feature to have unit standard deviation.
df = scalerModel.transform(df)

In [57]:
df.select('scaledFeatures').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaledFeatures                                                                                                                                                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1.9235483655073684,1.2361331222982879,3.63675244594466,-1.3167426258294748,2.2425341538160897,1.8900121366507092,0.0,0.527476201

variance treshold per eliminare gli attributi con basssa varianza (minore di 1)

In [58]:
from pyspark.ml.feature import VarianceThresholdSelector

selector = VarianceThresholdSelector(varianceThreshold=1,featuresCol='scaledFeatures', outputCol="selectedFeatures")

df = selector.fit(df).transform(df)

print("Output: Features with variance lower than %f are removed." %
      selector.getVarianceThreshold())

Output: Features with variance lower than 1.000000 are removed.


In [59]:
df.select('selectedFeatures').show(truncate=False)

+------------------------------------------------------------+
|selectedFeatures                                            |
+------------------------------------------------------------+
|[2.2425341538160897,0.5274762018580152,1.827589589002979]   |
|[0.9547422635058598,0.3711869568630478,2.0025066141935035]  |
|[0.7771157958768627,1.7582540061933842,1.1219352793021238]  |
|[0.22092291911356524,0.6349250577920554,1.3902371639372808] |
|[0.2131517611547966,3.5870552410303302,0.1779101296599035]  |
|[0.20093994150530306,0.5166227820666981,0.6031704126693095] |
|[0.2292491597836745,0.45041692133966327,3.139407415842523]  |
|[0.44351108636115233,0.7000455765399586,0.29128892695999004]|
|[0.38522740167038766,0.48080649675535136,1.338276870348857] |
|[0.24312622756718988,0.7705928051835201,0.9091854136186455] |
|[0.1992746933712812,1.6768533577585054,2.7021747149556]     |
|[0.5828368469076469,0.640351767687714,1.68104719418217]     |
|[0.14543167037124144,0.5220494919623566,0.700027273298

SelectedFeatures = speechiness, liveness, age

# K-Means

In [60]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeansModel

confronto silhouette score per scegliere il miglior numero di cluster

In [61]:
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='selectedFeatures', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')
for i in range(2,10):
    
    KMeans_algo=KMeans(featuresCol='selectedFeatures', k=i)
    
    KMeans_fit=KMeans_algo.fit(df)
    
    output=KMeans_fit.transform(df)
    
    
    
    score=evaluator.evaluate(output)
    
    silhouette_score.append(score)
    
    print("k={}, Silhouette Score:{}".format(i,score))

k=2, Silhouette Score:0.5166126813988047
k=3, Silhouette Score:0.7027754912180209
k=4, Silhouette Score:0.6188030511936338
k=5, Silhouette Score:0.5553441559527094
k=6, Silhouette Score:0.5482477622624944
k=7, Silhouette Score:0.5173128113989679
k=8, Silhouette Score:0.5108339629932724
k=9, Silhouette Score:0.5155385931488722


In [74]:
# Trains a k-means model.
kmeans = KMeans(featuresCol='selectedFeatures', k=4)
model = kmeans.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = -0.6352428256314132
Cluster Centers: 
[0.40061972 0.86353734 0.79206732]
[4.84705963 2.05296189 2.03091133]
[0.33774966 0.91381392 2.54167034]
[0.51305821 3.71835118 1.3537786 ]


In [75]:
generi=predictions.groupBy("prediction","genres").count().orderBy(desc("prediction"), desc("count"))

In [76]:
generi.where(col('prediction')==1).show(100, truncate=False)

+----------+-----------------------------------------------------------------------------------------------------------------------------+-----+
|prediction|genres                                                                                                                       |count|
+----------+-----------------------------------------------------------------------------------------------------------------------------+-----+
|1         |[hoerspiel]                                                                                                                  |7679 |
|1         |[]                                                                                                                           |6181 |
|1         |[kleine hoerspiel]                                                                                                           |1912 |
|1         |[barnsagor]                                                                                                           

In [77]:
generi.where(col('prediction')==3).show(100, truncate=False)

+----------+-----------------------------------------------------------------------------------------------------------------------------+-----+
|prediction|genres                                                                                                                       |count|
+----------+-----------------------------------------------------------------------------------------------------------------------------+-----+
|3         |[]                                                                                                                           |3551 |
|3         |[hoerspiel]                                                                                                                  |371  |
|3         |[sertanejo, sertanejo pop, sertanejo universitario]                                                                          |293  |
|3         |[classic israeli pop, israeli rock]                                                                                   

0 pop\
1 audiolibri\
2 classica, vintage\
3 musica live\

# Bisecting K-means

In [78]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [79]:
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='selectedFeatures', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')
for i in range(2,10):
    
    bkm = BisectingKMeans(featuresCol='selectedFeatures').setK(i).setSeed(1)
    
    model = bkm.fit(df)
    
    output=model.transform(df)
    
    
    
    score=evaluator.evaluate(output)
    
    silhouette_score.append(score)
    
    print("k={}, Silhouette Score:{}".format(i,score))

k=2, Silhouette Score:0.7415624288776014
k=3, Silhouette Score:0.543198987271154
k=4, Silhouette Score:0.6063931564082934
k=5, Silhouette Score:0.5156639699250682
k=6, Silhouette Score:0.4629157750322314
k=7, Silhouette Score:0.4650012503021758
k=8, Silhouette Score:0.4562108156548085
k=9, Silhouette Score:0.40766863299654843


In [80]:
# Trains a bisecting k-means model.
bkm = BisectingKMeans(featuresCol='selectedFeatures').setK(4).setSeed(1)
model = bkm.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette with squared euclidean distance = -0.5429574238364241
Cluster Centers: 
[0.38649703 0.89038973 0.79395606]
[0.32490673 0.92299514 2.52324765]
[0.6498445  3.8400005  1.41456087]
[4.74552642 2.02605544 2.02572495]


In [81]:
generi=predictions.groupBy("prediction","genres").count().orderBy(desc("prediction"), desc("count"))

In [82]:
generi.where(col('prediction')==0).show(100, truncate=False)

+----------+------------------------------------------------------------------------------------------------------------+-----+
|prediction|genres                                                                                                      |count|
+----------+------------------------------------------------------------------------------------------------------------+-----+
|0         |[]                                                                                                          |26828|
|0         |[classic israeli pop, israeli rock]                                                                         |1224 |
|0         |[c-pop, cantopop, classic cantopop]                                                                         |1158 |
|0         |[greek pop, laiko]                                                                                          |1105 |
|0         |[classic italian pop, italian adult pop]                                                    

0: [pop]\
1 [classica]\
2 [audiolibri]\
3 [live]
