In [1]:
sc

In [2]:
spark

In [3]:
#將資料轉成parquet檔
raw = spark.read.csv("hdfs://devenv/user/ml_datasets/ec_web_logs_analysis/raw/",
                    header = True,
                    schema="device_id string, timestamp timestamp, product_category_id int,ip string, lat float, lon float, device_type int, connect_type int, age_group string, grnder int, member_id string")

In [4]:
raw.write.parquet("hdfs://devenv/user/ml_datasets/ec_web_logs_analysis/data/")

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnull
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
logs = spark.read.parquet("hdfs://devenv/user/ml_datasets/ec_web_logs_analysis/data/")
logs.show()

+--------------------+-------------------+-------------------+--------------------+---------+----------+-----------+------------+---------+------+--------------------+
|           device_id|          timestamp|product_category_id|                  ip|      lat|       lon|device_type|connect_type|age_group|grnder|           member_id|
+--------------------+-------------------+-------------------+--------------------+---------+----------+-----------+------------+---------+------+--------------------+
|2eecf6cc-b2d8-f13...|2019-05-01 08:00:04|                132|     114.137.185.183|25.027023| 121.41819|          1|           2|     null|  null|c55941d974fa7f9cd...|
|7f9b4249-b0df-39d...|2019-05-01 08:00:07|                 38|        115.82.34.79| 25.02705|121.557396|          2|           2|     null|  null|d578b2f0aa6e00f63...|
|db396d28-b748-56a...|2019-05-01 08:00:12|                 17|      49.216.141.189|24.090103| 120.73134|          2|           2|     null|  null|5f27b787212e16

In [5]:
feature_prep = logs.select("product_category_id", "device_type","connect_type","grnder").where(~isnull("grnder"))
feature_prep.show()

+-------------------+-----------+------------+------+
|product_category_id|device_type|connect_type|grnder|
+-------------------+-----------+------------+------+
|                 60|          2|           1|     0|
|                156|          1|           2|     1|
|                 53|          2|           2|     0|
|                113|          1|           2|     1|
|                104|          2|           2|     1|
|                  6|          2|           2|     0|
|                123|          2|           1|     1|
|                 27|          1|           1|     0|
|                 64|          1|           2|     1|
|                150|          1|           2|     1|
|                107|          1|           1|     1|
|                108|          2|           2|     1|
|                 18|          1|           2|     0|
|                 17|          2|           2|     0|
|                 53|          1|           1|     1|
|                  8|       

In [7]:
final_data = VectorAssembler(inputCols=["product_category_id","device_type","connect_type"],outputCol="features").transform(feature_prep)
final_data.show()

+-------------------+-----------+------------+------+---------------+
|product_category_id|device_type|connect_type|grnder|       features|
+-------------------+-----------+------------+------+---------------+
|                 60|          2|           1|     0| [60.0,2.0,1.0]|
|                156|          1|           2|     1|[156.0,1.0,2.0]|
|                 53|          2|           2|     0| [53.0,2.0,2.0]|
|                113|          1|           2|     1|[113.0,1.0,2.0]|
|                104|          2|           2|     1|[104.0,2.0,2.0]|
|                  6|          2|           2|     0|  [6.0,2.0,2.0]|
|                123|          2|           1|     1|[123.0,2.0,1.0]|
|                 27|          1|           1|     0| [27.0,1.0,1.0]|
|                 64|          1|           2|     1| [64.0,1.0,2.0]|
|                150|          1|           2|     1|[150.0,1.0,2.0]|
|                107|          1|           1|     1|[107.0,1.0,1.0]|
|                108

In [8]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [10]:
#Model training <DecisionTree>
classifier = DecisionTreeClassifier(featuresCol="features", labelCol="grnder", maxDepth=10)
model = classifier.fit(train_data)

In [11]:
predicted_test_data = model.transform(train_data)
predicted_test_data.show()

+-------------------+-----------+------------+------+-------------+----------------+--------------------+----------+
|product_category_id|device_type|connect_type|grnder|     features|   rawPrediction|         probability|prediction|
+-------------------+-----------+------------+------+-------------+----------------+--------------------+----------+
|                  1|          1|           1|     0|[1.0,1.0,1.0]|[22328.0,4150.0]|[0.84326610771206...|       0.0|
|                  1|          1|           1|     0|[1.0,1.0,1.0]|[22328.0,4150.0]|[0.84326610771206...|       0.0|
|                  1|          1|           1|     0|[1.0,1.0,1.0]|[22328.0,4150.0]|[0.84326610771206...|       0.0|
|                  1|          1|           1|     0|[1.0,1.0,1.0]|[22328.0,4150.0]|[0.84326610771206...|       0.0|
|                  1|          1|           1|     0|[1.0,1.0,1.0]|[22328.0,4150.0]|[0.84326610771206...|       0.0|
|                  1|          1|           1|     0|[1.0,1.0,1.

In [12]:
evalutor_accuracy = MulticlassClassificationEvaluator(labelCol='grnder',predictionCol='prediction',metricName='accuracy')
print("Accuracy: {}".format(evalutor_accuracy.evaluate(predicted_test_data)))

Accuracy: 0.9298376526286077


In [None]:
model.save("file:///home/spark/Desktop/models/ec_web_logs_analysis/model_gender_prediction/")