Здравствуйте.

Spark умеет валидоровать модели. Попробуем это сделать. Evaluation ипортируется следующим образом:


```
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
```

В частности [RegressionEvaluator](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.RegressionEvaluator.html#pyspark.ml.evaluation.RegressionEvaluator.metricName)



# Задание
Ниже обучается и оцениваться модель. 

Нужно перевести этот в Pipeline (вам понадобится VectorAssembler), а затем оценить MAE с помощью spark.


# https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-dataset


In [41]:
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
from sklearn.datasets import load_diabetes, load_iris, load_boston
from sklearn.metrics import mean_absolute_error

from pyspark.ml.feature import VectorAssembler 
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.regression import RandomForestRegressor

In [4]:
spark = SparkSession.builder\
    .master("local[2]")\
    .appName("Lesson_2")\
    .config("spark.executor.instances",2)\
    .config("spark.executor.memory",'2g')\
    .config("spark.executor.cores",1)\
    .getOrCreate()
sc = spark.sparkContext

In [67]:
# load data
data = load_boston()
dataset = pd.DataFrame(data['data'], columns=data['feature_names'])
columns = dataset.columns.to_list()
dataset['target'] = data['target']
dataset.columns, dataset.shape

(Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
        'PTRATIO', 'B', 'LSTAT', 'target'],
       dtype='object'),
 (506, 14))

In [76]:
spark_dataset = spark.createDataFrame(dataset)
assembler = VectorAssembler(inputCols=columns, outputCol='features')

output = assembler.transform(spark_dataset)
spark_dataset = output.select('features', 'target')
train, test = spark_dataset.randomSplit([.7, .3])

rf = RandomForestRegressor(labelCol='target')
# print("Model was fit using parameters: ")
# print(rf.extractParamMap())

pipeline = Pipeline(stages=[rf])
preds_df = pipeline.fit(train).transform(test)
preds_df.show(n=5, truncate=False)

evaluator = RegressionEvaluator(labelCol='target', predictionCol='prediction', metricName='mae')
print('MAE:', evaluator.evaluate(preds_df))



+---------------------------------------------------------------------------+------+------------------+
|features                                                                   |target|prediction        |
+---------------------------------------------------------------------------+------+------------------+
|[0.01439,60.0,2.93,0.0,0.401,6.604,18.8,6.2196,1.0,265.0,15.6,376.7,4.38]  |29.1  |31.63602669022915 |
|[0.01951,17.5,1.38,0.0,0.4161,7.104,59.5,9.2229,3.0,216.0,18.6,393.24,8.05]|33.0  |29.835487987502688|
|[0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77]   |24.7  |25.065775866570778|
|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]  |34.7  |36.12114906331665 |
|[0.0315,95.0,1.47,0.0,0.403,6.975,15.3,7.6534,3.0,402.0,17.0,396.9,4.56]   |34.9  |32.28354268594046 |
+---------------------------------------------------------------------------+------+------------------+
only showing top 5 rows

MAE: 2.4472669854589757


In [100]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [1, 3, 5, 10])\
    .addGrid(rf.minInfoGain, [0.001, 0.01, 0.1, 0.15])\
    .addGrid(rf.numTrees, [1, 5, 10])\
    .build()

crossval = CrossValidator(estimator=pipeline, 
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

cvModel = crossval.fit(train)
prediction = cvModel.transform(test)
prediction.show()

+--------------------+------+------------------+
|            features|target|        prediction|
+--------------------+------+------------------+
|[0.01439,60.0,2.9...|  29.1|27.014500000000005|
|[0.01951,17.5,1.3...|  33.0|30.799666666666667|
|[0.02055,85.0,0.7...|  24.7|23.592718614718613|
|[0.02729,0.0,7.07...|  34.7| 36.53045238095238|
|[0.0315,95.0,1.47...|  34.9|30.619982456140356|
|[0.03237,0.0,2.18...|  33.4|32.294999999999995|
|[0.03359,75.0,2.9...|  34.9|           32.3495|
|[0.03932,0.0,3.41...|  22.0|24.985611111111115|
|[0.0456,0.0,13.89...|  23.3|24.041438095238096|
|[0.04684,0.0,3.41...|  22.6| 23.91561111111111|
|[0.05302,0.0,3.41...|  28.7|27.488253968253968|
|[0.05735,0.0,4.49...|  26.6| 27.52709523809524|
|[0.0578,0.0,2.46,...|  37.2| 32.32928571428572|
|[0.06047,0.0,2.46...|  29.6|           26.1615|
|[0.06642,0.0,4.05...|  29.9|27.270522875816994|
|[0.06888,0.0,2.46...|  36.2|           23.8765|
|[0.06899,0.0,25.6...|  22.0| 20.01249458874459|
|[0.07022,0.0,4.05..

In [108]:
import numpy as np
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])


{Param(parent='RandomForestRegressor_324dc6cae1db', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 1, Param(parent='RandomForestRegressor_324dc6cae1db', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.001, Param(parent='RandomForestRegressor_324dc6cae1db', name='numTrees', doc='Number of trees to train (>= 1).'): 1}
