In [176]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import IsotonicRegression
from pyspark.ml.regression import FMRegressor
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [177]:
# Создание SparkSession
spark = SparkSession.builder.appName("CrabAgePrediction").getOrCreate()
# Загружаем датасет
data = spark.read.csv("CrabAgePrediction.csv", header = True, inferSchema = True)

data.printSchema()
data.show()

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Shucked Weight: double (nullable = true)
 |-- Viscera Weight: double (nullable = true)
 |-- Shell Weight: double (nullable = true)
 |-- Age: integer (nullable = true)

+---+------+--------+------+-----------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|     Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+-----------+--------------+--------------+------------+---+
|  F|1.4375|   1.175|0.4125| 24.6357155|    12.3320325|     5.5848515|    6.747181|  9|
|  M|0.8875|    0.65|0.2125| 5.40057975|     2.2963095|    1.37495075|   1.5592225|  6|
|  I|1.0375|   0.775|  0.25| 7.95203475|      3.231843|    1.60174675|  2.76407625|  6|
|  F| 1.175|  0.8875|  0.25|13.48018725|    4.74854125|    2.28213475|   5.2446575| 10|
|  I|0.8875|  0.6

In [178]:
# Преобразуем Пол краба в числа
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
# Преобразовываем данные. Начинаем с выбора признаков
features = ["SexIndex", "Length", "Diameter" , "Height", "Weight", "Shucked Weight", "Viscera Weight", "Shell Weight"]
# Создаем вектор этих признаков
assembler = VectorAssembler(inputCols=features, outputCol="features")
# Разделяем данные на обучение и тест
(train, test) = data.randomSplit([0.7, 0.3], seed=12345)

train.show(3)
test.show(3)

print(train.count())
print(test.count())

+---+------+--------+------+---------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|   Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+---------+--------------+--------------+------------+---+
|  F|0.6875|  0.4875| 0.175|  2.26796|     0.8788345|    0.60951425|   0.7087375|  5|
|  F| 0.725|   0.525|0.1875|7.7961125|     3.2034935|    1.91359125|   0.9922325|  6|
|  F| 0.725|  0.5625|0.1875|  3.96893|    1.45999925|    0.66621325|     1.13398|  5|
+---+------+--------+------+---------+--------------+--------------+------------+---+
only showing top 3 rows

+---+------+--------+------+---------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|   Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+---------+--------------+--------------+------------+---+
|  F| 0.825|    0.65|   0.2|   5.6699|    1.77184375|      1.417475|    1.984465|  9|
|  F| 0.875|  0.6875|0.1625|5

In [162]:
# Создаем модель
rf = RandomForestRegressor(featuresCol = "features", labelCol= "Age")
# Создаем конвейер
pipelineRF = Pipeline(stages = [indexer, assembler, rf])
# Обучение модели
modelRF = pipelineRF.fit(train)
# Предсказание возраста на тестовой выборке
predictionsRF = modelRF.transform(test)

In [163]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseRF = evaluator.evaluate(predictionsRF)
print("Root Mean Squared Error (RMSE): %s" % rmseRF)
# Вывод результата и выключение SparkSession
predictionsRF.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.2615401627561877
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 7.988392648316643|
| 10| 7.908705148316645|
|  8| 8.045248288980488|
|  7| 9.548391803438388|
|  7| 8.000412326977736|
|  8| 8.592632589196345|
| 10| 8.720904423833051|
|  9| 9.344447221980165|
| 10| 8.911436173951397|
|  7| 9.440142734707567|
|  7| 9.337883154618444|
|  8|  9.39839418522813|
| 11| 9.443347582927702|
|  6| 9.414458930156876|
|  9|10.059943090170167|
|  9| 9.443347582927702|
| 13| 9.771095354233097|
| 13|11.179394759835631|
|  8| 9.810599704373693|
| 13| 9.386385299029001|
+---+------------------+
only showing top 20 rows



In [164]:
# Создание модели градиентного бустинга
gbt = GBTRegressor(featuresCol="features", labelCol= "Age")
# Создание конвейера
pipelineGBT = Pipeline(stages = [indexer, assembler, gbt])
# Обучение модели
modelGBT = pipelineGBT.fit(train)
# Предсказание возраста на тестовой выборке
predictionsGBT = modelGBT.transform(test)

In [165]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseGBT = evaluator.evaluate(predictionsGBT)
print("Root Mean Squared Error (RMSE): %s" % rmseGBT)
# Вывод результата и выключение SparkSession
predictionsGBT.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.2583145982417228
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 7.601731414344994|
| 10| 7.601731414344994|
|  8|   8.1265215595973|
|  7| 9.202069566653186|
|  7| 8.148940461390136|
|  8| 8.625205278476965|
| 10| 9.279321550741239|
|  9| 9.858380155538812|
| 10| 9.124097604284172|
|  7| 10.05637329458689|
|  7| 9.509740903856851|
|  8|10.108892601147291|
| 11| 9.858380155538812|
|  6| 9.509740903856851|
|  9|12.294300980126295|
|  9| 10.05637329458689|
| 13|10.506719461440273|
| 13|12.781374133515333|
|  8|  7.67318735718679|
| 13| 9.858380155538812|
+---+------------------+
only showing top 20 rows



In [166]:
# Создание модели линейной регрессии
lr = LinearRegression(featuresCol="features", labelCol="Age", maxIter=10, regParam=0.3, elasticNetParam=0.8) 
# Создание конвейера
pipelineLR = Pipeline(stages = [indexer, assembler, lr])
# Fit the model
modelLR = pipelineLR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsLR = modelLR.transform(test)

In [167]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseLR = evaluator.evaluate(predictionsLR)
print("Root Mean Squared Error (RMSE): %s" % rmseLR)
# Вывод результата и выключение SparkSession
predictionsLR.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.579961653876411
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 7.647080104576439|
| 10| 7.376432872738925|
|  8|  7.66477689077303|
|  7| 9.301119967955419|
|  7| 7.742347246965229|
|  8| 8.110474241504926|
| 10| 8.372539183279702|
|  9| 8.586246080405784|
| 10| 8.441845873542388|
|  7| 8.939013512479402|
|  7| 8.598110909783623|
|  8| 8.511591736817113|
| 11|  8.80385732399132|
|  6| 8.612277066622966|
|  9|  9.02461930256062|
|  9| 8.911731123024163|
| 13| 9.265305866499462|
| 13| 9.640374692545175|
|  8|28.359676441418646|
| 13| 8.740840898929894|
+---+------------------+
only showing top 20 rows



In [168]:
# Создание модели решающих деервьев
dt = DecisionTreeRegressor(featuresCol="features", labelCol="Age")
# Создание конвейера
pipelineDT = Pipeline(stages = [indexer, assembler, dt])
# Fit the model
modelDT = pipelineDT.fit(train)
# Предсказание возраста на тестовой выборке
predictionsDT = modelDT.transform(test)

In [170]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseDT = evaluator.evaluate(predictionsDT)
print("Root Mean Squared Error (RMSE): %s" % rmseDT)
# Вывод результата и выключение SparkSession
predictionsDT.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.3033106593033175
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 8.515463917525773|
| 10| 8.515463917525773|
|  8| 8.515463917525773|
|  7| 9.052631578947368|
|  7| 8.515463917525773|
|  8| 8.515463917525773|
| 10| 8.515463917525773|
|  9| 9.756944444444445|
| 10| 8.515463917525773|
|  7| 9.756944444444445|
|  7| 9.756944444444445|
|  8| 9.756944444444445|
| 11| 9.756944444444445|
|  6| 9.756944444444445|
|  9| 9.756944444444445|
|  9| 9.756944444444445|
| 13| 9.756944444444445|
| 13|11.507537688442211|
|  8| 7.804878048780488|
| 13| 9.756944444444445|
+---+------------------+
only showing top 20 rows



In [171]:
# Создание модели изотонической регрессии
ir = IsotonicRegression(featuresCol="features", labelCol="Age")
# Создание конвейера
pipelineIR = Pipeline(stages = [indexer, assembler, ir])
# Fit the model
modelIR = pipelineIR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsIR = modelIR.transform(test)

In [172]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseIR = evaluator.evaluate(predictionsIR)
print("Root Mean Squared Error (RMSE): %s" % rmseIR)
# Вывод результата и выключение SparkSession
predictionsIR.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 3.912391248823117
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9|10.181337181337181|
| 10|10.181337181337181|
|  8|10.181337181337181|
|  7|10.181337181337181|
|  7|10.181337181337181|
|  8|10.181337181337181|
| 10|10.181337181337181|
|  9|10.181337181337181|
| 10|10.181337181337181|
|  7|10.181337181337181|
|  7|10.181337181337181|
|  8|10.181337181337181|
| 11|10.181337181337181|
|  6|10.181337181337181|
|  9|10.181337181337181|
|  9|10.181337181337181|
| 13|10.181337181337181|
| 13|10.181337181337181|
|  8|10.181337181337181|
| 13|10.181337181337181|
+---+------------------+
only showing top 20 rows



In [173]:
# Создание модели FMRegressor
fm = FMRegressor(featuresCol="features", labelCol="Age", predictionCol="prediction",stepSize=0.001)
# Создание конвейера
pipelineFM = Pipeline(stages = [indexer, assembler, fm])
# Fit the model
modelFM = pipelineFM.fit(train)
# Предсказание возраста на тестовой выборке
predictionsFM = modelFM.transform(test)

In [174]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseFM = evaluator.evaluate(predictionsFM)
print("Root Mean Squared Error (RMSE): %s" % rmseFM)
# Вывод результата и выключение SparkSession
predictionsFM.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 5.35109053198643
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9|1.3120302935748251|
| 10|1.3541586261200584|
|  8|1.5336236978486755|
|  7| 4.026391330554553|
|  7|1.8983260448229755|
|  8|1.8517696853303236|
| 10|1.8931313322409584|
|  9|2.3402544906660534|
| 10|2.0773916644682973|
|  7|2.7406158867610104|
|  7|2.9277645764985603|
|  8|  2.46503516280356|
| 11| 2.793089647795517|
|  6|2.9979136278488148|
|  9|2.9432307343815083|
|  9| 2.853777826972006|
| 13|3.8249887069256774|
| 13|3.6753797938273696|
|  8|  5.37600673173027|
| 13|2.5580406619571927|
+---+------------------+
only showing top 20 rows



In [158]:
# Модель  `AFTSurvivalRegression`  не подходит для датасета  `CrabAgePrediction`, так как нет информации о том,  
# живы ли были крабы, когда их поймали.

# Эта модель предназначена для анализа данных выживания, где есть данные о времени до события (в данном случае, смерть) 
# и информация о том,  было ли это событие наблюдено (краб умер) или  **цензурировано**  (краб был жив, когда его поймали).  

In [179]:
glr = GeneralizedLinearRegression(featuresCol="features", labelCol="Age", family="gaussian", link="identity", maxIter=10, regParam=0.3)
# Создание конвейера
pipelineGLR = Pipeline(stages = [indexer, assembler, glr])
# Fit the model
modelGLR = pipelineGLR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsGLR = modelGLR.transform(test)

24/06/09 01:56:27 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/06/09 01:56:27 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [180]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "prediction", metricName = "rmse")
rmseGLR = evaluator.evaluate(predictionsGLR)
print("Root Mean Squared Error (RMSE): %s" % rmseGLR)
# Вывод результата и выключение SparkSession
predictionsGLR.select("Age", "prediction").show()
#spark.stop()

Root Mean Squared Error (RMSE): 2.3887452525121136
+---+------------------+
|Age|        prediction|
+---+------------------+
|  9| 7.494100623301231|
| 10| 7.178808381551332|
|  8| 7.433041136683782|
|  7| 8.685829205427842|
|  7| 7.334002582304484|
|  8| 7.985053710573677|
| 10|  8.33746522216055|
|  9|  8.61636127335104|
| 10| 8.490470332227247|
|  7|  9.16165662431091|
|  7| 8.526349983048226|
|  8|  8.63925358802613|
| 11| 9.038253044888194|
|  6| 8.419051995114586|
|  9| 8.997563331236265|
|  9| 9.110619617715807|
| 13| 9.210283301035286|
| 13|10.105747252959798|
|  8| 31.62050909871427|
| 13| 9.006838873792155|
+---+------------------+
only showing top 20 rows



In [181]:
# Вывод лучшего метода
#Можно было бы еще записывать результаты в словарь 
best_model = ""
best_rmse = float("inf")
if rmseLR < best_rmse:
    best_model = "Linear Regression"
    best_rmse = rmseLR
if rmseRF < best_rmse:
    best_model = "Random Forest"
    best_rmse = rmseRF
if rmseGBT < best_rmse:
    best_model = "Gradient Boosted Trees"
    best_rmse = rmseGBT
if rmseFM < best_rmse:
    best_model = "Factorization machines regressor"
    best_rmse = rmseLR
if rmseIR < best_rmse:
    best_model = "Isotonic Regression"
    best_rmse = rmseRF
if rmseDT < best_rmse:
    best_model = "Decision Tree Regressor"
    best_rmse = rmseGBT
if rmseGLR < best_rmse:
    best_model = "Generalized Linear Regression"
    best_rmse = rmseGLR
print(f"Лучшая модель: {best_model} (RMSE: {best_rmse})")
spark.stop()

Лучшая модель: Gradient Boosted Trees (RMSE: 2.2583145982417228)
