In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("best_one").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()

In [2]:
bike_df = (spark.read
           .format('com.databricks.spark.csv')
           .option("header", "true") # первая строка - заголовок
           .option("inferSchema", "true") # автоопределение типов данных
           .load("day.csv"))

In [3]:
bike_df.columns

['instant',
 'dteday',
 'season',
 'yr',
 'mnth',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'windspeed',
 'casual',
 'registered',
 'cnt']

In [4]:
bike_df1 = bike_df.select('season','mnth','holiday','weekday','workingday','weathersit','temp','atemp','hum',
                         'windspeed','casual','registered','cnt')

# добавим колонку с кол-вом
bike_df2 = bike_df1.withColumn("cnt", bike_df1["cnt"].cast("double"))
bike_df2.show()

+------+----+-------+-------+----------+----------+--------+--------+--------+---------+------+----------+------+
|season|mnth|holiday|weekday|workingday|weathersit|    temp|   atemp|     hum|windspeed|casual|registered|   cnt|
+------+----+-------+-------+----------+----------+--------+--------+--------+---------+------+----------+------+
|     1|   1|      0|      6|         0|         2|0.344167|0.363625|0.805833| 0.160446|   331|       654| 985.0|
|     1|   1|      0|      0|         0|         2|0.363478|0.353739|0.696087| 0.248539|   131|       670| 801.0|
|     1|   1|      0|      1|         1|         1|0.196364|0.189405|0.437273| 0.248309|   120|      1229|1349.0|
|     1|   1|      0|      2|         1|         1|     0.2|0.212122|0.590435| 0.160296|   108|      1454|1562.0|
|     1|   1|      0|      3|         1|         1|0.226957| 0.22927|0.436957|   0.1869|    82|      1518|1600.0|
|     1|   1|      0|      4|         1|         1|0.204348|0.233209|0.518261|0.0895652|

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

In [6]:
feature_columns = ['season'
                   ,'mnth'
                   ,'holiday'
                   ,'weekday'
                   ,'workingday'
                   ,'weathersit'
                   ,'temp'
                   ,'atemp'
                   ,'hum'
                   ,'windspeed'
                   ,'casual'
                   ,'registered']

In [8]:
(training_data, test_data) = bike_df2.randomSplit([0.7,0.3], seed = 10)
print("Training data size is :"+str(training_data.count()))
print("Test data size is :"+str(test_data.count()))

Training data size is :495
Test data size is :236


### Сделаем pipeline процесса

In [9]:
# сбор фичей
vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features_vector')

In [10]:
# тестовый алгоритм
rdf = RandomForestRegressor(labelCol='cnt',featuresCol="features_vector",predictionCol='predicted_cnt',seed=15)

In [11]:
# собров в последовательность
pipeline = Pipeline(stages=[vecAssembler,rdf])


In [12]:
# перебор параметров
paramGrid = (ParamGridBuilder()
             .addGrid(rdf.maxDepth,[5,10,15,20])
             .addGrid(rdf.numTrees,[1,10,50,100])
             .build())

In [13]:
# оценка результатов обучения
rdfEvaluator = RegressionEvaluator(predictionCol="predicted_cnt", labelCol='cnt', metricName='rmse')

In [14]:
# кросс-валидация (её мы применяем не только для обучения, но и для тестирования)
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=rdfEvaluator)

In [15]:
# выполнение
cvModel = cv.fit(training_data)

In [16]:
test_data_with_predictions = cvModel.transform(test_data)

In [17]:
test_data_with_predictions.show()

+------+----+-------+-------+----------+----------+---------+--------+--------+---------+------+----------+------+--------------------+------------------+
|season|mnth|holiday|weekday|workingday|weathersit|     temp|   atemp|     hum|windspeed|casual|registered|   cnt|     features_vector|     predicted_cnt|
+------+----+-------+-------+----------+----------+---------+--------+--------+---------+------+----------+------+--------------------+------------------+
|     1|   1|      0|      0|         0|         1| 0.138333|0.116175|0.434167|  0.36195|    54|       768| 822.0|[1.0,1.0,0.0,0.0,...| 962.9716666666666|
|     1|   1|      0|      0|         0|         1| 0.216522|0.250322|0.722174|0.0739826|   140|       956|1096.0|[1.0,1.0,0.0,0.0,...|           1231.72|
|     1|   1|      0|      0|         0|         1|   0.3375|0.340258|   0.465| 0.191542|   599|      2826|3425.0|[1.0,1.0,0.0,0.0,...|           3615.38|
|     1|   1|      0|      0|         0|         1|     0.37|0.375621|

In [18]:
test_data_RMSE = rdfEvaluator.evaluate(test_data_with_predictions)
print("RMSE on test data is : " + str(test_data_RMSE))

RMSE on test data is : 298.35828374779925


In [19]:
# лучший результат
bestRDFModel = cvModel.bestModel.stages[1]
bestRDFModel

RandomForestRegressionModel: uid=RandomForestRegressor_7f456bae9e9f, numTrees=50, numFeatures=12