In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans #分群

In [2]:
# No visible cell creates `spark`; the notebook would otherwise depend on the
# kernel injecting one. getOrCreate() returns the already-active session when
# there is one, so this is safe in a pyspark-launched kernel too.
spark = SparkSession.builder.getOrCreate()

# Raw e-commerce web logs, read from parquet on HDFS.
logs = spark.read.parquet('hdfs://devenv/user/ml_datasets/ec_web_logs_analysis/data/')

In [8]:
# Total number of rows in the log dataset.
logs.count()

1276264

In [5]:
# How many different devices appear in the logs.
distinct_devices = countDistinct("device_id")
logs.agg(distinct_devices).show()

+----------------+
|count(device_id)|
+----------------+
|            1000|
+----------------+



In [3]:
# Tag the places each device visits most often and write them to MySQL.
# Step 1: bring the list of distinct device ids back to the driver so the
# next cell can iterate over it.
distinct_device_rows = logs.select("device_id").distinct().collect()
all_device_ids = [row[0] for row in distinct_device_rows]

len_all_device_ids = len(all_device_ids)

In [None]:
# For every device: cluster its (lat, lon) observations into 5 groups with
# k-means, summarise each cluster, and append the summary to MySQL.

# Neither the assembler nor the estimator depends on the device, so they are
# built once here instead of once per iteration.
assembler = VectorAssembler(inputCols=["lat", "lon"], outputCol="features")
kmeans = KMeans(k=5)

for i, device_id in enumerate(all_device_ids):
    print("{}/{} processed.".format(i, len_all_device_ids))

    # Compare against the id as a value rather than splicing it into a SQL
    # string, so an id containing a quote character cannot break the filter.
    device_locations = assembler.transform(
        logs.select("device_id", "lat", "lon")
            .where(col("device_id") == device_id))

    # Model training
    model = kmeans.fit(device_locations)

    # Label every row with its cluster. The model is applied to the same rows
    # it was fitted on; there is no separate test set here.
    predicted_device_locations = model.transform(device_locations)

    # Per cluster: mean latitude/longitude of its points and how many points
    # it holds. The cluster index itself is not written out.
    device_inferred_location = predicted_device_locations.groupBy("device_id", "prediction") \
        .agg(avg("lat").alias("avg_lat"), avg("lon").alias("avg_lon"), count("*").alias("lat_lon_count")) \
        .drop("prediction")

    # Cached because the result is consumed twice below (show + JDBC write).
    device_inferred_location.persist()
    try:
        device_inferred_location.show()

        # NOTE(review): user/password are blank here; presumably they are
        # filled in at run time. Prefer reading them from configuration.
        device_inferred_location.write.option("driver", "com.mysql.jdbc.Driver") \
            .jdbc("jdbc:mysql://localhost:3306", "ec_web_logs_analysis.device_inferred_location",
                  properties={"user": "", "password": ""}, mode="append")
    finally:
        # Release the cache even if the write fails; otherwise one cached
        # DataFrame per device is left behind over the whole loop.
        device_inferred_location.unpersist()


0/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|3f19eed6-8950-75a...|24.945786237716675|121.38734912872314|            8|
|3f19eed6-8950-75a...|25.134732818603517| 121.7504150390625|            5|
|3f19eed6-8950-75a...|25.074490723786532|121.57464966950593|           27|
|3f19eed6-8950-75a...| 24.99284198768157| 121.5029514355767|         1064|
|3f19eed6-8950-75a...|24.990383625030518|121.57012367248535|            8|
+--------------------+------------------+------------------+-------------+

1/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|8161218a-8582-02f...| 25.07598240360333|121.39894181755697|   

12/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|228994ca-8bef-2da...|  25.0600694732666|121.53162463378906|          250|
|228994ca-8bef-2da...|24.553667545318604|120.84956169128418|            4|
|228994ca-8bef-2da...| 25.00875358581543|121.46117935180663|           20|
|228994ca-8bef-2da...| 25.13489197040426|121.78038064364729|           58|
|228994ca-8bef-2da...|25.045664377695008|121.61420822554179|          929|
+--------------------+------------------+------------------+-------------+

13/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|b8ff04c8-bb8a-b0b...|25.059810575200707|121.54100972381086| 

24/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|63a17bd8-af2a-a49...|22.980470339457195|120.21853764851888|            6|
|63a17bd8-af2a-a49...| 22.94975233078003|120.27115821838379|            4|
|63a17bd8-af2a-a49...|22.727310668036527|120.31322775335981|         1041|
|63a17bd8-af2a-a49...|22.804141521453857|120.32916259765625|            4|
|63a17bd8-af2a-a49...|22.629734906283293|120.32012870094992|           11|
+--------------------+------------------+------------------+-------------+

25/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|3f3d34b7-ad2a-ce2...|25.042255895546234|121.45935803295606| 

36/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|d8923806-ae16-b6b...| 25.02702276846942|121.45779480653651|          136|
|d8923806-ae16-b6b...|24.999162579175106|121.49052149612712|          161|
|d8923806-ae16-b6b...| 24.99713723013334|121.44148036921136|          214|
|d8923806-ae16-b6b...|24.990991033249646| 121.4620688124666|          307|
|d8923806-ae16-b6b...|25.009005455874572|121.45999908447266|          347|
+--------------------+------------------+------------------+-------------+

37/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|3f901223-b702-686...|25.060232503073557|121.55999596264897| 

+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|705c692f-834c-5d4...| 22.81437177756398|120.30038357764175|           97|
|705c692f-834c-5d4...| 22.56294298171997|120.35348832429345|          268|
|705c692f-834c-5d4...|23.164402139597925|120.24196519522832|           29|
|705c692f-834c-5d4...| 22.60178284172539|  120.405948997311|          777|
|705c692f-834c-5d4...|22.880385535103933|120.59442901611328|            7|
+--------------------+------------------+------------------+-------------+

49/1000 processed.
+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|cc9f1077-8ad0-0dd...|24.909041510687935|121.35834333631728|            9|
|cc9f

### 針對單一使用者分群

In [10]:
# Walk through the clustering for one hard-coded device: keep only its rows,
# then pack lat/lon into the "features" vector column that KMeans reads.
single_device_rows = logs \
    .where("device_id = '3f19eed6-8950-75ab-49fd-0196d35b20b0'") \
    .select("device_id", "lat", "lon")

lat_lon_assembler = VectorAssembler(inputCols=["lat", "lon"], outputCol="features")
device_locations = lat_lon_assembler.transform(single_device_rows)

In [11]:
# Peek at the first 20 assembled rows with full-width columns.
device_locations.show(n=20, truncate=False)

+------------------------------------+---------+----------+---------------------------------------+
|device_id                           |lat      |lon       |features                               |
+------------------------------------+---------+----------+---------------------------------------+
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991545|121.50291 |[24.991544723510742,121.5029067993164] |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991629|121.5029  |[24.991628646850586,121.50289916992188]|
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991627|121.50285 |[24.991626739501953,121.50285339355469]|
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991608|121.5029  |[24.991607666015625,121.50289916992188]|
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991594|121.5029  |[24.991594314575195,121.50289916992188]|
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991604|121.502914|[24.99160385131836,121.50291442871094] |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|25.007242|121.47501 |[25.00724220275879,121.47501373291016] |


In [12]:
# Model training: split this device's (lat, lon) points into k = 5 clusters.
kmeans = KMeans().setK(5)
model = kmeans.fit(device_locations)

In [13]:
# Add a "prediction" column holding each row's cluster index. The model is
# applied to the same rows it was fitted on; there is no separate test set.
predicted_device_locations = model.transform(device_locations)

In [17]:
# Print up to 1000 labelled rows without truncating column contents.
predicted_device_locations.show(n=1000, truncate=False)

+------------------------------------+---------+----------+---------------------------------------+----------+
|device_id                           |lat      |lon       |features                               |prediction|
+------------------------------------+---------+----------+---------------------------------------+----------+
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991545|121.50291 |[24.991544723510742,121.5029067993164] |0         |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991629|121.5029  |[24.991628646850586,121.50289916992188]|0         |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991627|121.50285 |[24.991626739501953,121.50285339355469]|0         |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991608|121.5029  |[24.991607666015625,121.50289916992188]|0         |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991594|121.5029  |[24.991594314575195,121.50289916992188]|0         |
|3f19eed6-8950-75ab-49fd-0196d35b20b0|24.991604|121.502914|[24.99160385131836,121.50291442871094] |0         |
|

In [18]:
# Number of location points assigned to each cluster.
rows_by_cluster = predicted_device_locations.groupBy("prediction")
rows_by_cluster.count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         3|   27|
|         4|    8|
|         0| 1064|
|         1|    8|
|         2|    5|
+----------+-----+



In [19]:
# Summarise each cluster by the mean latitude/longitude of its points and the
# number of points it contains; the cluster index is dropped from the result.
rows_by_device_cluster = predicted_device_locations.groupBy("device_id", "prediction")
cluster_summary = rows_by_device_cluster.agg(
    avg("lat").alias("avg_lat"),
    avg("lon").alias("avg_lon"),
    count("*").alias("lat_lon_count")
)
device_inferred_location = cluster_summary.drop("prediction")

device_inferred_location.show()

+--------------------+------------------+------------------+-------------+
|           device_id|           avg_lat|           avg_lon|lat_lon_count|
+--------------------+------------------+------------------+-------------+
|3f19eed6-8950-75a...|25.074490723786532|121.57464966950593|           27|
|3f19eed6-8950-75a...| 24.99284198768157| 121.5029514355767|         1064|
|3f19eed6-8950-75a...|24.990383625030518|121.57012367248535|            8|
|3f19eed6-8950-75a...|24.945786237716675|121.38734912872314|            8|
|3f19eed6-8950-75a...|25.134732818603517| 121.7504150390625|            5|
+--------------------+------------------+------------------+-------------+

