In [24]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [25]:
# Создание SparkSession
spark = SparkSession.builder.appName("CrabAgePrediction").getOrCreate()
# Загружаем датасет
data = spark.read.csv("CrabAgePrediction.csv", header = True, inferSchema = True)

data.printSchema()
data.show()

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Shucked Weight: double (nullable = true)
 |-- Viscera Weight: double (nullable = true)
 |-- Shell Weight: double (nullable = true)
 |-- Age: integer (nullable = true)

+---+------+--------+------+-----------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|     Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+-----------+--------------+--------------+------------+---+
|  F|1.4375|   1.175|0.4125| 24.6357155|    12.3320325|     5.5848515|    6.747181|  9|
|  M|0.8875|    0.65|0.2125| 5.40057975|     2.2963095|    1.37495075|   1.5592225|  6|
|  I|1.0375|   0.775|  0.25| 7.95203475|      3.231843|    1.60174675|  2.76407625|  6|
|  F| 1.175|  0.8875|  0.25|13.48018725|    4.74854125|    2.28213475|   5.2446575| 10|
|  I|0.8875|  0.6

In [26]:
# Преобразуем Пол краба в числа
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
# Преобразовываем данные. Начинаем с выбора признаков
features = ["SexIndex", "Length", "Diameter" , "Height", "Weight", "Shucked Weight", "Viscera Weight", "Shell Weight"]
# Создаем вектор этих признаков
assembler = VectorAssembler(inputCols=features, outputCol="features")
# Разделяем данные на обучение и тест
(train, test) = data.randomSplit([0.7, 0.3], seed=12345)

train.show(3)
test.show(3)

print(train.count())
print(test.count())

+---+------+--------+------+---------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|   Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+---------+--------------+--------------+------------+---+
|  F|0.6875|  0.4875| 0.175|  2.26796|     0.8788345|    0.60951425|   0.7087375|  5|
|  F| 0.725|   0.525|0.1875|7.7961125|     3.2034935|    1.91359125|   0.9922325|  6|
|  F| 0.725|  0.5625|0.1875|  3.96893|    1.45999925|    0.66621325|     1.13398|  5|
+---+------+--------+------+---------+--------------+--------------+------------+---+
only showing top 3 rows

+---+------+--------+------+---------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|   Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+---------+--------------+--------------+------------+---+
|  F| 0.825|    0.65|   0.2|   5.6699|    1.77184375|      1.417475|    1.984465|  9|
|  F| 0.875|  0.6875|0.1625|5

In [27]:
# Создаем модель
rf = RandomForestRegressor(featuresCol = "features", labelCol= "Age")
# Создаем конвейер
pipelineRF = Pipeline(stages = [indexer, assembler, rf])
# Обучение модели
modelRF = pipelineRF.fit(train)
# Предсказание возраста на тестовой выборке
predictionsRF = modelRF.transform(test)

In [28]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmse = evaluator.evaluate(predictionsRF)
print("Root Mean Squared Error (RMSE): %s" % rmse)
# Вывод результата и выключение SparkSession
predictionsRF.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.2615401627561877
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 7.988392648316643|
| 10| 7.908705148316645|
|  8| 8.045248288980488|
|  7| 9.548391803438388|
|  7| 8.000412326977736|
|  8| 8.592632589196345|
| 10| 8.720904423833051|
|  9| 9.344447221980165|
| 10| 8.911436173951397|
|  7| 9.440142734707567|
|  7| 9.337883154618444|
|  8|  9.39839418522813|
| 11| 9.443347582927702|
|  6| 9.414458930156876|
|  9|10.059943090170167|
|  9| 9.443347582927702|
| 13| 9.771095354233097|
| 13|11.179394759835631|
|  8| 9.810599704373693|
| 13| 9.386385299029001|
+---+------------------+
only showing top 20 rows



In [None]:
# Создание модели градиентного бустинга
gbt = GBTRegressor(featuresCol="features", labelCol= "Age")
# Создание конвейера
pipelineGBT = Pipeline(stages = [indexer, assembler, gbt])
# Обучение модели
modelGBT = pipelineGBT.fit(train)
# Предсказание возраста на тестовой выборке
predictionsGBT = modelGBT.transform(test)

In [30]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmse = evaluator.evaluate(predictionsGBT)
print("Root Mean Squared Error (RMSE): %s" % rmse)
# Вывод результата и выключение SparkSession
predictionsGBT.select("Age", "prediction").show()
spark.stop()

Root Mean Squared Error (RMSE): 2.2694767182967137
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9|7.7375495134360985|
| 10|7.7375495134360985|
|  8|7.7375495134360985|
|  7| 9.629664301633323|
|  7| 8.173401460744234|
|  8| 8.649666277831063|
| 10| 8.788465602981512|
|  9| 9.891146804749734|
| 10| 8.649666277831063|
|  7| 9.891146804749734|
|  7| 9.624494300702326|
|  8| 9.891146804749734|
| 11| 9.891146804749734|
|  6| 9.624494300702326|
|  9|11.572535377407089|
|  9| 9.891146804749734|
| 13|10.428633260338977|
| 13|12.579037250175992|
|  8|  8.87359801336318|
| 13| 9.891146804749734|
+---+------------------+
only showing top 20 rows

