In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import monotonically_increasing_id, format_number
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import IsotonicRegression
from pyspark.ml.regression import LinearRegression

from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import FMRegressor
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
import numpy as np

In [2]:
# Создание SparkSession
spark = SparkSession.builder.appName("CrabAgePrediction").getOrCreate()
# Загружаем датасет
data = spark.read.csv("CrabAgePrediction.csv", header = True, inferSchema = True)

data.printSchema()
data.show()

                                                                                

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Shucked Weight: double (nullable = true)
 |-- Viscera Weight: double (nullable = true)
 |-- Shell Weight: double (nullable = true)
 |-- Age: integer (nullable = true)

+---+------+--------+------+-----------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|     Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+-----------+--------------+--------------+------------+---+
|  F|1.4375|   1.175|0.4125| 24.6357155|    12.3320325|     5.5848515|    6.747181|  9|
|  M|0.8875|    0.65|0.2125| 5.40057975|     2.2963095|    1.37495075|   1.5592225|  6|
|  I|1.0375|   0.775|  0.25| 7.95203475|      3.231843|    1.60174675|  2.76407625|  6|
|  F| 1.175|  0.8875|  0.25|13.48018725|    4.74854125|    2.28213475|   5.2446575| 10|
|  I|0.8875|  0.6

In [3]:
# Преобразуем Пол краба в числа
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
# Преобразовываем данные. Начинаем с выбора признаков
features = ["SexIndex", "Length", "Diameter" , "Height", "Weight", "Shucked Weight", "Viscera Weight", "Shell Weight"]
# Создаем вектор этих признаков
assembler = VectorAssembler(inputCols=features, outputCol="features")
# Разделяем данные на обучение и тест
(train, test) = data.randomSplit([0.7, 0.3])

# Объявляем необходимые переменные
feature_names = test.columns
feature_names.remove("Age")

tupleRMSE = ()
Pred_arr = []
Importance_arr = []

print("Train count: " + str(train.count()))
train.show(3)

print("Test count: " + str(test.count()))
test.show(3)

Train count: 2767
+---+------+--------+------+---------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|   Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+---------+--------------+--------------+------------+---+
|  F|0.6875|  0.4875| 0.175|  2.26796|     0.8788345|    0.60951425|   0.7087375|  5|
|  F| 0.725|   0.525|0.1875|7.7961125|     3.2034935|    1.91359125|   0.9922325|  6|
|  F| 0.725|  0.5625|0.1875|  3.96893|    1.45999925|    0.66621325|     1.13398|  5|
+---+------+--------+------+---------+--------------+--------------+------------+---+
only showing top 3 rows

Test count: 1126
+---+------+--------+------+----------+--------------+--------------+------------+---+
|Sex|Length|Diameter|Height|    Weight|Shucked Weight|Viscera Weight|Shell Weight|Age|
+---+------+--------+------+----------+--------------+--------------+------------+---+
|  F|0.7625|  0.5625| 0.175|4.20990075|    1.65844575|    0.94970825|   1.275

In [4]:
# Создаем модель
rf = RandomForestRegressor(featuresCol = "features", labelCol= "Age")
# Создаем конвейер
pipelineRF = Pipeline(stages = [indexer, assembler, rf])
# Обучение модели
modelRF = pipelineRF.fit(train)
# Предсказание возраста на тестовой выборке
predictionsRF = modelRF.transform(test).withColumnRenamed("prediction", "predictionsRF")
Pred_arr.append(predictionsRF)

# Сохранение степени влиятельности признаков
name = "RandomForestRegressor"
tup = (name, ) + tuple(list(map(lambda x: float(x), modelRF.stages[2].featureImportances)))
Importance_arr.append(tup)
print(Importance_arr)

                                                                                

[('RandomForestRegressor', 0.02665004704853663, 0.053320040428898416, 0.08449644970692752, 0.16723765971544471, 0.1564371386405771, 0.06842463185436534, 0.04896707521389934, 0.3944669573913509)]


In [5]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsRF", metricName = "rmse")
rmseRF = evaluator.evaluate(predictionsRF)
tupleRMSE += (rmseRF,)
print("Root Mean Squared Error (RMSE): %s" % rmseRF)
# Вывод результата и выключение SparkSession
predictionsRF.select("Age", "predictionsRF").show()

Root Mean Squared Error (RMSE): 2.260287373287712
+---+------------------+
|Age|     predictionsRF|
+---+------------------+
|  7| 6.332898894704909|
|  7|7.2443224174439775|
|  6| 7.127882315489858|
|  6|7.8158080713338505|
|  9| 8.581493957648528|
| 10| 7.930096303444886|
|  8| 7.828903309429089|
|  5|7.9724092968753455|
|  8| 8.690662106865874|
|  7| 9.100764007002237|
| 10| 9.199193503429484|
| 10| 9.429057054750961|
|  6| 9.172415519504423|
| 10| 9.199193503429484|
|  8| 9.005082979418017|
| 13| 9.005082979418017|
| 11| 9.282317663121209|
| 10| 9.485643372450628|
| 12| 9.485643372450628|
|  9| 9.312954073638107|
+---+------------------+
only showing top 20 rows



In [6]:
# Создание модели градиентного бустинга
gbt = GBTRegressor(featuresCol="features", labelCol= "Age")
# Создание конвейера
pipelineGBT = Pipeline(stages = [indexer, assembler, gbt])
# Обучение модели
modelGBT = pipelineGBT.fit(train)
# Предсказание возраста на тестовой выборке
predictionsGBT = modelGBT.transform(test).withColumnRenamed("prediction", "predictionsGBT")
Pred_arr.append(predictionsGBT)

# Сохранение степени влиятельности признаков
name = "GBTRegressor"
tup = (name, ) + tuple(list(map(lambda x: float(x), modelGBT.stages[2].featureImportances)))
Importance_arr.append(tup)
print(Importance_arr)

[('RandomForestRegressor', 0.02665004704853663, 0.053320040428898416, 0.08449644970692752, 0.16723765971544471, 0.1564371386405771, 0.06842463185436534, 0.04896707521389934, 0.3944669573913509), ('GBTRegressor', 0.05888803959575064, 0.11085946000815497, 0.05779170388478715, 0.11244157718956405, 0.10750894546914604, 0.15406989513018765, 0.08793827404986826, 0.3105021046725413)]


In [7]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsGBT", metricName = "rmse")
rmseGBT = evaluator.evaluate(predictionsGBT)
tupleRMSE += (rmseGBT,)
print("Root Mean Squared Error (RMSE): %s" % rmseGBT)
# Вывод результата и выключение SparkSession
predictionsGBT.select("Age", "predictionsGBT").show()

Root Mean Squared Error (RMSE): 2.2803200991143204
+---+------------------+
|Age|    predictionsGBT|
+---+------------------+
|  7|6.3856592052447505|
|  7| 8.328145290587976|
|  6| 8.328145290587976|
|  6| 8.328145290587976|
|  9| 8.328145290587976|
| 10| 8.385203677551146|
|  8| 8.328145290587976|
|  5| 8.385203677551146|
|  8| 8.385203677551146|
|  7| 8.510309925210988|
| 10| 9.688723558705624|
| 10| 9.886895821854138|
|  6| 9.688723558705624|
| 10| 9.772464097750788|
|  8| 9.489832774724613|
| 13| 9.489832774724613|
| 11|  9.17497297837164|
| 10| 9.898170067119258|
| 12| 9.898170067119258|
|  9| 9.500640132756345|
+---+------------------+
only showing top 20 rows



24/06/17 16:33:13 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/06/17 16:33:13 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [8]:
# Создание модели линейной регрессии
lr = LinearRegression(featuresCol="features", labelCol="Age", maxIter=10, regParam=0.3, elasticNetParam=0.8) 
# Создание конвейера
pipelineLR = Pipeline(stages = [indexer, assembler, lr])
# Fit the model
modelLR = pipelineLR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsLR = modelLR.transform(test).withColumnRenamed("prediction", "predictionsLR")
Pred_arr.append(predictionsLR)

In [9]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsLR", metricName = "rmse")
rmseLR = evaluator.evaluate(predictionsLR)
tupleRMSE += (rmseLR,)
print("Root Mean Squared Error (RMSE): %s" % rmseLR)
# Вывод результата и выключение SparkSession
predictionsLR.select("Age", "predictionsLR").show()

Root Mean Squared Error (RMSE): 2.47864214309764
+---+-----------------+
|Age|    predictionsLR|
+---+-----------------+
|  7|7.160321089352263|
|  7|7.284946146650438|
|  6|7.206448484146021|
|  6|7.501189937532092|
|  9|7.642435500897497|
| 10|7.469859364388783|
|  8|7.587252364471093|
|  5| 7.65621372454987|
|  8|7.870762089199976|
|  7|7.948102942769726|
| 10|8.111809612199352|
| 10|8.292333015001129|
|  6|7.939400734946954|
| 10|8.092308402737986|
|  8|8.307954288616763|
| 13|8.274536374588308|
| 11|8.214638627621504|
| 10|8.389811404014402|
| 12|8.392584715950019|
|  9|8.397112741823182|
+---+-----------------+
only showing top 20 rows



In [10]:
# Создание модели решающих деервьев
dt = DecisionTreeRegressor(featuresCol="features", labelCol="Age")
# Создание конвейера
pipelineDT = Pipeline(stages = [indexer, assembler, dt])
# Fit the model
modelDT = pipelineDT.fit(train)
# Предсказание возраста на тестовой выборке
predictionsDT = modelDT.transform(test).withColumnRenamed("prediction", "predictionsDT")
Pred_arr.append(predictionsDT)

# Сохранение степени влиятельности признаков
name = "DecisionTreeRegressor"
tup = (name, ) + tuple(list(map(lambda x: float(x), modelDT.stages[2].featureImportances)))
Importance_arr.append(tup)
print(Importance_arr)

[('RandomForestRegressor', 0.02665004704853663, 0.053320040428898416, 0.08449644970692752, 0.16723765971544471, 0.1564371386405771, 0.06842463185436534, 0.04896707521389934, 0.3944669573913509), ('GBTRegressor', 0.05888803959575064, 0.11085946000815497, 0.05779170388478715, 0.11244157718956405, 0.10750894546914604, 0.15406989513018765, 0.08793827404986826, 0.3105021046725413), ('DecisionTreeRegressor', 0.02339298348803164, 0.0038014403418231537, 0.0, 0.011727518511288346, 0.007568860690374647, 0.12868104206642564, 0.0010964969783375006, 0.823731657923719)]


In [11]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsDT", metricName = "rmse")
rmseDT = evaluator.evaluate(predictionsDT)
tupleRMSE += (rmseDT,)
print("Root Mean Squared Error (RMSE): %s" % rmseDT)
# Вывод результата и выключение SparkSession
predictionsDT.select("Age", "predictionsDT").show()

Root Mean Squared Error (RMSE): 2.3097527434255354
+---+-----------------+
|Age|    predictionsDT|
+---+-----------------+
|  7|6.363636363636363|
|  7|8.306122448979592|
|  6|8.306122448979592|
|  6|8.306122448979592|
|  9|8.306122448979592|
| 10|8.306122448979592|
|  8|8.306122448979592|
|  5|8.306122448979592|
|  8|8.306122448979592|
|  7|8.306122448979592|
| 10|9.484536082474227|
| 10|9.484536082474227|
|  6|9.484536082474227|
| 10|9.484536082474227|
|  8|9.484536082474227|
| 13|9.484536082474227|
| 11|9.484536082474227|
| 10|9.484536082474227|
| 12|9.484536082474227|
|  9|9.484536082474227|
+---+-----------------+
only showing top 20 rows



In [12]:
# Создание модели изотонической регрессии
ir = IsotonicRegression(featuresCol="features", labelCol="Age")
# Создание конвейера
pipelineIR = Pipeline(stages = [indexer, assembler, ir])
# Fit the model
modelIR = pipelineIR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsIR = modelIR.transform(test).withColumnRenamed("prediction", "predictionsIR")
Pred_arr.append(predictionsIR)

In [13]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsIR", metricName = "rmse")
rmseIR = evaluator.evaluate(predictionsIR)
tupleRMSE += (rmseIR,)
print("Root Mean Squared Error (RMSE): %s" % rmseIR)
# Вывод результата и выключение SparkSession
predictionsIR.select("Age", "predictionsIR").show()

Root Mean Squared Error (RMSE): 3.3033172371601327
+---+-------------+
|Age|predictionsIR|
+---+-------------+
|  7|         12.0|
|  7|         12.0|
|  6|         12.0|
|  6|         12.0|
|  9|         12.0|
| 10|         12.0|
|  8|         12.0|
|  5|         12.0|
|  8|         12.0|
|  7|         12.0|
| 10|         12.0|
| 10|         12.0|
|  6|         12.0|
| 10|         12.0|
|  8|         12.0|
| 13|         12.0|
| 11|         12.0|
| 10|         12.0|
| 12|         12.0|
|  9|         12.0|
+---+-------------+
only showing top 20 rows



In [14]:
# Создание модели FMRegressor
fm = FMRegressor(featuresCol="features", labelCol="Age", predictionCol="prediction",stepSize=0.001)
# Создание конвейера
pipelineFM = Pipeline(stages = [indexer, assembler, fm])
# Fit the model
modelFM = pipelineFM.fit(train)
# Предсказание возраста на тестовой выборке
predictionsFM = modelFM.transform(test).withColumnRenamed("prediction", "predictionsFM")
Pred_arr.append(predictionsFM)

In [15]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsFM", metricName = "rmse")
rmseFM = evaluator.evaluate(predictionsFM)
tupleRMSE += (rmseFM,)
print("Root Mean Squared Error (RMSE): %s" % rmseFM)
# Вывод результата и выключение SparkSession
predictionsFM.select("Age", "predictionsFM").show()

Root Mean Squared Error (RMSE): 5.368003698897728
+---+------------------+
|Age|     predictionsFM|
+---+------------------+
|  7|1.1590307649514169|
|  7|1.2226059413432528|
|  6|1.2993159746448828|
|  6| 1.524598838368934|
|  9|1.4928202500865342|
| 10| 1.536697196986917|
|  8|1.5660857747087227|
|  5|1.7676983590120845|
|  8|1.6415910840055952|
|  7|1.9725884281155246|
| 10|1.9835849321438097|
| 10|2.3066986786641377|
|  6|1.9757341665481158|
| 10| 2.104297185299983|
|  8|2.2879852205575335|
| 13|2.2688363104715092|
| 11|2.6163000405653363|
| 10| 2.689913664989554|
| 12|2.7566753492200746|
|  9| 2.543729843762283|
+---+------------------+
only showing top 20 rows



In [16]:
# Модель  `AFTSurvivalRegression`  не подходит для датасета  `CrabAgePrediction`, так как нет информации о том,  
# живы ли были крабы, когда их поймали.

# Эта модель предназначена для анализа данных выживания, где есть данные о времени до события (в данном случае, смерть) 
# и информация о том,  было ли это событие наблюдено (краб умер) или  **цензурировано**  (краб был жив, когда его поймали).  

In [17]:
glr = GeneralizedLinearRegression(featuresCol="features", labelCol="Age", family="gaussian", link="identity", maxIter=10, regParam=0.3)
# Создание конвейера
pipelineGLR = Pipeline(stages = [indexer, assembler, glr])
# Fit the model
modelGLR = pipelineGLR.fit(train)
# Предсказание возраста на тестовой выборке
predictionsGLR = modelGLR.transform(test).withColumnRenamed("prediction", "predictionsGLR")
Pred_arr.append(predictionsGLR)

24/06/17 16:33:23 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/06/17 16:33:23 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [18]:
# Оценка модели
evaluator = RegressionEvaluator(labelCol = "Age", predictionCol = "predictionsGLR", metricName = "rmse")
rmseGLR = evaluator.evaluate(predictionsGLR)
tupleRMSE += (rmseGLR,)
print("Root Mean Squared Error (RMSE): %s" % rmseGLR)
# Вывод результата и выключение SparkSession
predictionsGLR.select("Age", "predictionsGLR").show()

Root Mean Squared Error (RMSE): 2.283799410499867
+---+------------------+
|Age|    predictionsGLR|
+---+------------------+
|  7| 6.737955190427437|
|  7| 6.862932318748824|
|  6| 6.740405495766632|
|  6| 7.236090133153725|
|  9| 7.490798254321438|
| 10|7.3569211868027224|
|  8| 7.443108416444253|
|  5| 7.457203932313081|
|  8| 7.832877077082419|
|  7| 7.851557296318457|
| 10| 8.198009848979432|
| 10| 8.428149720994423|
|  6| 7.921528034658778|
| 10| 8.105271827856566|
|  8| 8.415571564479208|
| 13| 8.291360700210372|
| 11| 8.017739898488461|
| 10| 8.406049907191566|
| 12| 8.426642586227553|
|  9|  8.52506271204657|
+---+------------------+
only showing top 20 rows



In [19]:
# Создаем списки моделей, выполняющихся том же в порядке выше
modelsList = ["RandomForestRegressor", "GBTRegressor", "LinearRegression", "DecisionTreeRegressor", 
              "IsotonicRegression", "FMRegressor", "GeneralizedLinearRegression"]
shortModelsList = ["predictionsRF", "predictionsGBT", "predictionsLR", "predictionsDT", "predictionsIR", 
                   "predictionsFM", "predictionsGLR"]

# Схема нового DataFrame значений RMSE каждой модели
schemaRMSE = StructType([
    StructField(modelsList[0], DoubleType(), True),
    StructField(modelsList[1], DoubleType(), True),
    StructField(modelsList[2], DoubleType(), True),
    StructField(modelsList[3], DoubleType(), True),
    StructField(modelsList[4], DoubleType(), True),
    StructField(modelsList[5], DoubleType(), True),
    StructField(modelsList[6], DoubleType(), True)
])
# Создание DataFrame
rmseDF = spark.createDataFrame([tupleRMSE], schemaRMSE)

In [20]:
types = [StructField("Model_name", StringType(), True)]

for i in feature_names:
    types.append(StructField(i, DoubleType(), True))

schema_imp = StructType(types)
impDF = spark.createDataFrame(Importance_arr, schema_imp)
impDF.printSchema()

root
 |-- Model_name: string (nullable = true)
 |-- Sex: double (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Shucked Weight: double (nullable = true)
 |-- Viscera Weight: double (nullable = true)
 |-- Shell Weight: double (nullable = true)



In [21]:
for i, df in enumerate(Pred_arr):
    df = df.withColumn("id", monotonically_increasing_id())
    Pred_arr[i] = df

ListSelectedDF = [df.select("Age", "id", shortModelsList[i]) for i, df in enumerate(Pred_arr)]
predDF = ListSelectedDF[0]

for i in range(1, len(ListSelectedDF)):
    predDF = predDF.join(ListSelectedDF[i], on = ["Age", "id"], how = "inner")

predDF = predDF.drop("id")

+---+------------------+------------------+-----------------+-----------------+-------------+------------------+-----------------+
|Age|     predictionsRF|    predictionsGBT|    predictionsLR|    predictionsDT|predictionsIR|     predictionsFM|   predictionsGLR|
+---+------------------+------------------+-----------------+-----------------+-------------+------------------+-----------------+
|  7| 6.332898894704909|6.3856592052447505|7.160321089352263|6.363636363636363|         12.0|1.1590307649514169|6.737955190427437|
|  7|7.2443224174439775| 8.328145290587976|7.284946146650438|8.306122448979592|         12.0|1.2226059413432528|6.862932318748824|
|  6| 7.127882315489858| 8.328145290587976|7.206448484146021|8.306122448979592|         12.0|1.2993159746448828|6.740405495766632|
|  6|7.8158080713338505| 8.328145290587976|7.501189937532092|8.306122448979592|         12.0| 1.524598838368934|7.236090133153725|
|  9| 8.581493957648528| 8.328145290587976|7.642435500897497|8.306122448979592|    

                                                                                

+---------------------+------------------+----------------+---------------------+------------------+-----------------+---------------------------+
|RandomForestRegressor|      GBTRegressor|LinearRegression|DecisionTreeRegressor|IsotonicRegression|      FMRegressor|GeneralizedLinearRegression|
+---------------------+------------------+----------------+---------------------+------------------+-----------------+---------------------------+
|    2.260287373287712|2.2803200991143204|2.47864214309764|   2.3097527434255354|3.3033172371601327|5.368003698897728|          2.283799410499867|
+---------------------+------------------+----------------+---------------------+------------------+-----------------+---------------------------+

+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+--------------------+------------------+
|          Model_name|                Sex|              Length|           Diam

In [22]:
# Вывод лучшего результата
print(f"Лучшая модель: {modelsList[rmseDF.head().index(min(rmseDF.head()))]} (RMSE: {min(rmseDF.head())})")
# Запись результатов в json-файл
rmseDF.coalesce(1).write.json("RMSE.json", mode = 'overwrite')
predDF.write.json("Pred.json", mode = 'overwrite')
impDF.coalesce(1).write.json("Importance.json", mode = 'overwrite')

Лучшая модель: RandomForestRegressor (RMSE: 2.260287373287712)


                                                                                

In [203]:
spark.stop()