In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name

from pyspark.sql.functions import isnull
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an
# existing session when one is already running.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .getOrCreate()
)

In [3]:
# Prepare data: read the web-log dataset (Parquet) from HDFS.
logs = spark.read.format("parquet").load(
    "hdfs://namenode:9000//user/data/spark_ml_101/ec_web_logs_analysis/data/"
)

In [4]:
# Preprocessing and feature engineering:
# discard rows whose "gender" label is null, then keep only the three
# predictor columns together with the label.
feature_prep = (
    logs
    .filter(~isnull("gender"))
    .select("product_category_id", "device_type", "connect_type", "gender")
)

In [5]:
# Pack the three predictor columns into a single vector column, "features".
assembler = VectorAssembler(
    inputCols=["product_category_id", "device_type", "connect_type"],
    outputCol="features",
)
final_data = assembler.transform(feature_prep)

In [7]:
# Preview the assembled rows; n=20 and truncate=True are show()'s defaults.
final_data.show(n=20, truncate=True)

+-------------------+-----------+------------+------+---------------+
|product_category_id|device_type|connect_type|gender|       features|
+-------------------+-----------+------------+------+---------------+
|                114|          1|           2|     1|[114.0,1.0,2.0]|
|                  2|          2|           2|     0|  [2.0,2.0,2.0]|
|                 59|          2|           1|     0| [59.0,2.0,1.0]|
|                 49|          2|           1|     0| [49.0,2.0,1.0]|
|                144|          1|           2|     1|[144.0,1.0,2.0]|
|                109|          1|           2|     1|[109.0,1.0,2.0]|
|                 30|          1|           2|     0| [30.0,1.0,2.0]|
|                129|          1|           1|     1|[129.0,1.0,1.0]|
|                 55|          1|           2|     0| [55.0,1.0,2.0]|
|                 60|          1|           1|     0| [60.0,1.0,1.0]|
|                 14|          2|           2|     0| [14.0,2.0,2.0]|
|                155

In [6]:
# Split data into train and test sets (roughly 70% / 30%; randomSplit
# assigns rows probabilistically, so the sizes are approximate).
# A fixed seed keeps the split, and therefore the reported metrics, stable
# across kernel restarts.
# NOTE(review): this assumes the input data and its partitioning are unchanged
# between runs — confirm.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Model training: fit a decision tree (depth capped at 10) that predicts the
# "gender" label from the assembled "features" vector.
classifier = DecisionTreeClassifier(
    labelCol="gender",
    featuresCol="features",
    maxDepth=10,
)
model = classifier.fit(train_data)

In [9]:
# Persist the fitted model to HDFS.
# model.save(path) raises when the target path already exists, which breaks
# re-running the notebook; write().overwrite() replaces the previous copy.
model.write().overwrite().save("hdfs://namenode:9000//user/data/spark_ml_101/ec_web_logs_analysis/models/model_gender_prediction/")

In [10]:
# Score the held-out test rows with the fitted model to obtain predictions.
predicted_test_data = model.transform(dataset=test_data)

In [11]:
# Evaluate the model performance: accuracy of the "prediction" column against
# the "gender" label on the scored test set.
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='gender',
                                                       predictionCol='prediction',
                                                       metricName='accuracy')
# Use str.format so the value fills the "{}" placeholder; passing it as a
# second print() argument left the braces in the output unformatted.
print("Accuracy: {}".format(evaluator_accuracy.evaluate(predicted_test_data)))

Accuracy: {} 0.9301915174879365


In [12]:
# Count test rows for each (actual gender, predicted label) pair; these
# counts are the cells of the confusion matrix.
confusion_matrix_info = predicted_test_data.groupBy("gender", "prediction").count()

In [13]:
# Display the counts grouped by actual label first, then by prediction.
confusion_matrix_info.sort("gender", "prediction").show()

+------+----------+-----+
|gender|prediction|count|
+------+----------+-----+
|     0|       0.0|76696|
|     0|       1.0| 2089|
|     1|       0.0| 8631|
|     1|       1.0|66147|
+------+----------+-----+



In [14]:
# Display the same counts grouped by predicted label first, then by actual label.
confusion_matrix_info.sort("prediction", "gender").show()

+------+----------+-----+
|gender|prediction|count|
+------+----------+-----+
|     0|       0.0|76696|
|     1|       0.0| 8631|
|     0|       1.0| 2089|
|     1|       1.0|66147|
+------+----------+-----+



In [15]:
# Stop the Spark session created earlier in this notebook.
spark.stop()