# Machine learning experiments

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, ArrayType
import json
import os
import math
import sys
import re

In [2]:
# Add here your team number
team = 14
# .master("local[*]") \
# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName(f"Team {team} - spark ML Job Descriptions")\
        .master("yarn") \
        .config("spark.submit.deployMode", "client")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/27 09:31:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/27 09:31:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/27 09:31:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/04/27 09:31:41 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/04/27 09:31:41 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
spark

In [4]:
from pyspark.ml.linalg import VectorUDT

input_schema = StructType([
        StructField("features", VectorUDT(), True), # Features are stored as vectors
        StructField("label", DoubleType(), True)    # Label is a double (salary_avg)
    ])


In [5]:
train_data = spark.read.format("json").schema(input_schema).load("project/data/train")
test_data = spark.read.format("json").schema(input_schema).load("project/data/test")

train_data.show()

                                                                                

+--------------------+-------+
|            features|  label|
+--------------------+-------+
|(2920,[0,2,3,4,5,...|95500.0|
|(2920,[0,2,3,4,5,...|74500.0|
|(2920,[0,2,3,4,5,...|88000.0|
|(2920,[0,2,3,4,5,...|77000.0|
|(2920,[0,2,3,4,5,...|88000.0|
|(2920,[0,2,3,4,5,...|89000.0|
|(2920,[0,2,3,4,5,...|93500.0|
|(2920,[0,2,3,4,5,...|74000.0|
|(2920,[0,2,3,4,5,...|80500.0|
|(2920,[0,2,3,4,5,...|88500.0|
|(2920,[0,2,3,4,5,...|86500.0|
|(2920,[0,2,3,4,5,...|85500.0|
|(2920,[0,2,3,4,5,...|83000.0|
|(2920,[0,2,3,4,5,...|72000.0|
|(2920,[0,2,3,4,5,...|88000.0|
|(2920,[0,2,3,4,5,...|78500.0|
|(2920,[0,2,3,4,5,...|87500.0|
|(2920,[0,2,3,4,5,...|76500.0|
|(2920,[0,2,3,4,5,...|72500.0|
|(2920,[0,2,3,4,5,...|75500.0|
+--------------------+-------+
only showing top 20 rows



In [6]:
train_data.cache()
test_data.cache()

DataFrame[features: vector, label: double]

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

lr = LinearRegression(featuresCol="features", labelCol="label")

lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=evaluator_rmse,
    trainRatio=0.8,  # 80% train, 20% validation
    parallelism=4    # safe to set >1 here if needed
)

In [8]:
print("Running TrainValidationSplit for Linear Regression...")
lr_tvs_model = tvs.fit(train_data)
print("Training finished.")

Running TrainValidationSplit for Linear Regression...


25/04/27 09:33:02 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/27 09:33:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

Training finished.


                                                                                

In [9]:
lr_best_model = lr_tvs_model.bestModel
print(f"Best LR Model Params: regParam={lr_best_model._java_obj.getRegParam()}, elasticNetParam={lr_best_model._java_obj.getElasticNetParam()}")

Best LR Model Params: regParam=0.5, elasticNetParam=1.0


In [10]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(featuresCol="features", labelCol="label", seed=42)
gbt_grid = ParamGridBuilder() \
        .addGrid(gbt.maxDepth, [3, 5, 7]) \
        .addGrid(gbt.maxIter, [10, 20]) \
        .addGrid(gbt.stepSize, [0.1, 0.05]) \
        .build()
gbt_tsv = TrainValidationSplit(estimator=gbt,
                          estimatorParamMaps=gbt_grid,
                          evaluator=evaluator_rmse, # Use RMSE for tuning
                          trainRatio=80,
                          parallelism=4)

In [11]:
print("Running TrainValidationSplit for GBTRegressor...")
gbt_tvs_model = gbt_tsv.fit(train_data)
print("Training finished.")

Running TrainValidationSplit for GBTRegressor...


[Stage 345:===>             (2 + 2) / 9][Stage 347:>                (0 + 0) / 9]

IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.

[Stage 1219:> (4 + 2) / 9][Stage 1221:> (0 + 0) / 9][Stage 1223:> (0 + 0) / 9]  

In [None]:
best_gbt_model = gbt_tvs_model.bestModel

# Print best hyperparameters
print(f"Best maxDepth: {best_gbt_model._java_obj.getMaxDepth()}")
print(f"Best maxIter: {best_gbt_model._java_obj.getMaxIter()}")
print(f"Best stepSize: {best_gbt_model._java_obj.getStepSize()}")