In [10]:
import findspark 
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

In [11]:
spark = SparkSession.builder.appName('Clustering mall customers into Gold - Silver - Bronze customers depending on their spending score ').getOrCreate()

## Reading dataset 

In [12]:
df = spark.read.csv('/Users/ihebd/PycharmProjects/Spark-with-machine-learning-/datasets/Mall_Customers.csv',
                    header=True, inferSchema=True)

### Let's explore our dataset

In [13]:
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)



In [14]:
df.describe()

DataFrame[summary: string, CustomerID: string, Gender: string, Age: string, Annual Income (k$): string, Spending Score (1-100): string]

In [15]:
df.take(5)

[Row(CustomerID=1, Gender='Male', Age=19, Annual Income (k$)=15, Spending Score (1-100)=39),
 Row(CustomerID=2, Gender='Male', Age=21, Annual Income (k$)=15, Spending Score (1-100)=81),
 Row(CustomerID=3, Gender='Female', Age=20, Annual Income (k$)=16, Spending Score (1-100)=6),
 Row(CustomerID=4, Gender='Female', Age=23, Annual Income (k$)=16, Spending Score (1-100)=77),
 Row(CustomerID=5, Gender='Female', Age=31, Annual Income (k$)=17, Spending Score (1-100)=40)]

In [16]:
df.count()

200

### Checking for null/missing values 

In [17]:
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

+----------+------+---+------------------+----------------------+
|CustomerID|Gender|Age|Annual Income (k$)|Spending Score (1-100)|
+----------+------+---+------------------+----------------------+
|         0|     0|  0|                 0|                     0|
+----------+------+---+------------------+----------------------+



In [18]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----------+------+---+------------------+----------------------+
|CustomerID|Gender|Age|Annual Income (k$)|Spending Score (1-100)|
+----------+------+---+------------------+----------------------+
|         0|     0|  0|                 0|                     0|
+----------+------+---+------------------+----------------------+



### create features vector 

In [19]:
# This dataset contains only one feature which is spending score as we aim to cluster customers depending on their speding score 
# Import VectorAssembler to create our features vector 
from pyspark.ml.feature import VectorAssembler
features = ['Spending Score (1-100)']

feature_vector = VectorAssembler(inputCols=features, outputCol='features')

ds = feature_vector.transform(df)

In [20]:
ds.columns

['CustomerID',
 'Gender',
 'Age',
 'Annual Income (k$)',
 'Spending Score (1-100)',
 'features']

## Create our clustering kmeans model 

In [21]:
# Create kmeans model
kmeans = KMeans(featuresCol='features', k=3)

# fit kmeans model 
model = kmeans.fit(ds)



In [24]:
print(model.summary)

<pyspark.ml.clustering.KMeansSummary object at 0x000001D64BB8BD90>


In [25]:
# make predictions  

predictions = model.transform(ds)
from pyspark.ml.evaluation import ClusteringEvaluator
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

In [26]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.8172327476215493


In [27]:
model.transform(ds).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   94|
|         2|   47|
|         0|   59|
+----------+-----+



We managed to cluster our customers into three groups depending on their spending score 

                * Cluster 1 : 47 customers
                * Cluster 2 : 59 customers
                * Cluster 3 : 94 customers 


## Analyse and visualize our clusters 

In [28]:
model.transform(ds).groupBy('prediction').max('Annual Income (k$)').show()

+----------+-----------------------+
|prediction|max(Annual Income (k$))|
+----------+-----------------------+
|         1|                     99|
|         2|                    137|
|         0|                    137|
+----------+-----------------------+



In [29]:
model.transform(ds).groupBy('prediction').min('Annual Income (k$)').show()

+----------+-----------------------+
|prediction|min(Annual Income (k$))|
+----------+-----------------------+
|         1|                     15|
|         2|                     16|
|         0|                     15|
+----------+-----------------------+



In [30]:
model.transform(ds).groupBy('prediction').sum('Annual Income (k$)').show()

+----------+-----------------------+
|prediction|sum(Annual Income (k$))|
+----------+-----------------------+
|         1|                   5128|
|         2|                   3158|
|         0|                   3826|
+----------+-----------------------+



In [31]:
model.transform(ds).groupBy('prediction').mean('Age').show()

+----------+-----------------+
|prediction|         avg(Age)|
+----------+-----------------+
|         1|42.41489361702128|
|         2|42.95744680851064|
|         0|29.89830508474576|
+----------+-----------------+



In [32]:
model.transform(ds).groupBy('prediction').sum('Spending Score (1-100)').show()

+----------+---------------------------+
|prediction|sum(Spending Score (1-100))|
+----------+---------------------------+
|         1|                       4529|
|         2|                        686|
|         0|                       4825|
+----------+---------------------------+



In [33]:
model.transform(ds).groupBy('prediction').avg('Spending Score (1-100)').show()

+----------+---------------------------+
|prediction|avg(Spending Score (1-100))|
+----------+---------------------------+
|         1|         48.180851063829785|
|         2|         14.595744680851064|
|         0|          81.77966101694915|
+----------+---------------------------+

