In [1]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = "team14"

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

In [2]:
spark.sql("SHOW DATABASES").show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             root_db|
|     team0_projectdb|
|team12_hive_proje...|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team17_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
|    team21_projectdb|
|    team22_projectdb|
|    team23_projectdb|
|    team24_projectdb|
|    team25_projectdb|
|    team26_projectdb|
|    team27_projectdb|
+--------------------+
only showing top 20 rows



In [3]:
spark.sql("USE team14_projectdb")
spark.sql("SHOW TABLES").show()

+----------------+--------------------+-----------+
|       namespace|           tableName|isTemporary|
+----------------+--------------------+-----------+
|team14_projectdb|         houses_part|      false|
|team14_projectdb|          q1_results|      false|
|team14_projectdb|          q2_results|      false|
|team14_projectdb|          q3_results|      false|
|team14_projectdb|          q4_results|      false|
|team14_projectdb|          q5_results|      false|
|team14_projectdb|real_estate_annou...|      false|
+----------------+--------------------+-----------+



In [4]:
spark.sql("SELECT * FROM team14_projectdb.houses_part").show()

+--------+---------+---------+-----------+-------------+----------+---------+-----------+
|house_id|id_region|street_id|postal_code|building_type|   geo_lon|  geo_lat|object_type|
+--------+---------+---------+-----------+-------------+----------+---------+-----------+
|  588606|       50|   208270|     141070|            4| 37.810055|55.915363|          0|
|  588607|       22|   222650|     656015|            4|  83.76511| 53.35501|          0|
|  588615|       22|   561374|     658213|            2|  81.22149|51.523575|          0|
|  588624|       26|   309018|     355031|            0|  42.01147|45.024067|          0|
|  588628|       77|   282486|     125195|            3| 37.472458| 55.85739|          0|
|  588635|       75|   116208|     673200|            6|110.467735|51.365814|          0|
|  588637|       76|   538749|     152020|            4|  38.84929|56.739395|          0|
|  588639|       23|   337967|     353430|            5| 37.389553|44.902275|          0|
|  588641|

In [5]:
spark.sql("SELECT * FROM team14_projectdb.real_estate_announcements_buck").show()

+---------------+----------------+--------+-----+------+-----+-------+------------+--------+
|announcement_id|publication_date|   price|level|levels|rooms|   area|kitchen_area|house_id|
+---------------+----------------+--------+-----+------+-----+-------+------------+--------+
|        3406615|   1619470800000| 2400000|    5|     5|    2| 44.500|       5.800| 3035705|
|        3406613|   1619470800000| 4480000|   16|    16|    2| 43.000|       0.000|  803314|
|        3406609|   1619470800000|  850000|    1|     2|    2| 47.000|       7.200| 1208830|
|        3406588|   1619470800000| 7400000|    9|    10|    4| 87.600|       0.000| 1443906|
|        3406584|   1619470800000| 3060000|    1|     4|    2| 57.900|      16.100| 2039859|
|        3406546|   1619470800000|10250000|    3|     9|    3| 50.000|       6.200| 1001693|
|        3406538|   1619470800000| 7180000|   14|    15|    5|129.000|       8.400|  919980|
|        3406534|   1619470800000| 2800000|    2|     9|    2| 40.000|

In [6]:
houses = spark.read.format("avro").table('team14_projectdb.houses_part')

listings = spark.read.format("avro").table('real_estate_announcements_buck')

In [7]:
houses.printSchema()

root
 |-- house_id: integer (nullable = true)
 |-- id_region: integer (nullable = true)
 |-- street_id: integer (nullable = true)
 |-- postal_code: integer (nullable = true)
 |-- building_type: integer (nullable = true)
 |-- geo_lon: float (nullable = true)
 |-- geo_lat: float (nullable = true)
 |-- object_type: short (nullable = true)



In [8]:
listings.printSchema()

root
 |-- announcement_id: integer (nullable = true)
 |-- publication_date: long (nullable = true)
 |-- price: long (nullable = true)
 |-- level: integer (nullable = true)
 |-- levels: integer (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- area: decimal(6,3) (nullable = true)
 |-- kitchen_area: decimal(6,3) (nullable = true)
 |-- house_id: integer (nullable = true)



In [9]:
# Take only the Tatarstan flats, as there are too many instances
houses = houses.filter(houses.id_region == 16)

In [10]:
df = houses.join(listings, houses.house_id == listings.house_id)

# The house_id is NOT a FEATURE, but a joining point. So, it is not needed after joining
# because all the info about house is reflected in the other features
# Region id contains only a single value after filtering the Tatarstan data
df = df.drop(listings['house_id'])
df = df.drop('house_id', 'id_region')

df.show()

+---------+-----------+-------------+---------+---------+-----------+---------------+----------------+--------+-----+------+-----+-------+------------+
|street_id|postal_code|building_type|  geo_lon|  geo_lat|object_type|announcement_id|publication_date|   price|level|levels|rooms|   area|kitchen_area|
+---------+-----------+-------------+---------+---------+-----------+---------------+----------------+--------+-----+------+-----+-------+------------+
|   529655|     423822|            2| 52.37455|55.728302|          0|        3379233|   1619384400000| 4560000|    8|     9|    4| 95.000|       0.000|
|   392690|     420011|            4|  49.1667|55.721046|          0|        3334501|   1619211600000| 4700000|    3|     9|    1| 36.000|       9.300|
|   536292|     420137|            2| 49.15724| 55.83636|          0|        3275651|   1619038800000| 4200000|    4|     9|    1| 33.700|       0.000|
|   434535|     420025|            2|49.223713|55.790405|          0|        3202132|   

In [11]:
import gc

houses = None
listings = None
gc.collect()

358

In [12]:
from pyspark.sql.functions import col, when

# Announcement id is a unique identifier, so we do not include it
# publication_date and geo_* features will be transformed
# Street_id and postal_code are categorical columns, but we cannot use them as OneHot encoded as there are too many unique values (several thousands)
# level, levels, rooms, area are categorical, but they have strict numerical order, so we won't encode them, but treat as numerical instead
numerical = ['street_id', 'postal_code', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
categorical = ['building_type', 'object_type']
transformable_features = ['geo_lon', 'geo_lat', 'publication_date']

# We aim to predict the price
label = 'price'

# Drop entries with null values
df = df.select(numerical + categorical + transformable_features + [label]).na.drop()

# Drop duplicates
df = df.dropDuplicates()

# Translate the label column
df = df.withColumnRenamed(label,"label")

# Replace -100 and -1 (studio appartment) with zeros
df = df.withColumn("kitchen_area", when(col("kitchen_area") == -100, 0).otherwise(col("kitchen_area")))
df = df.withColumn("rooms", when(col("rooms") == -1, 0).otherwise(col("rooms")))

# Filter outliers. Even the smallest flats cannot cost less than 500,000, so it is probably
# malformed data, and flats that cost more than 100,000,000 are elite flats for which the cost
# is determined be some extra elite features not included in the data, so they will confuse a model a lot
df = df.filter((col("price") >= 500_000) & (col("price") <= 100_000_000))

print("The dataframe that will be used")
df.show()

The dataframe that will be used
+---------+-----------+-----+------+-----+------+------------+-------------+-----------+---------+---------+----------------+-------+
|street_id|postal_code|level|levels|rooms|  area|kitchen_area|building_type|object_type|  geo_lon|  geo_lat|publication_date|  label|
+---------+-----------+-----+------+-----+------+------------+-------------+-----------+---------+---------+----------------+-------+
|   492355|     420059|    3|     5|    1|32.000|       6.000|            1|          0| 49.14056|55.767838|   1636318800000|4550000|
|   403622|     423602|    3|     5|    2|55.000|      11.000|            4|          0|  51.9899| 55.75219|   1613336400000|2800000|
|   184388|     420034|    1|     5|    2|43.700|       5.500|            2|          0| 49.08531|55.820637|   1620075600000|3850000|
|   206227|     420097|    7|     9|    1|43.000|      11.000|            4|          0|49.152428|55.784595|   1622926800000|5990000|
|   269256|     420081|   16| 

In [13]:
# Make sure we still satisfy the requirements
df.count()

161425

In [14]:
from pyspark.sql.functions import col, from_unixtime, month, dayofmonth, sin, cos
from pyspark.ml import Transformer
from pyspark import keyword_only
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class TimeEncoder(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """
    A custom transformer to encode date using sin and cos
    """
    pi = 3.141592653589793
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(TimeEncoder, self).__init__()
        self._setDefault(inputCol="publication_date")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
    
    @keyword_only
    def setParams(self, input_col: str = "input", output_col: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def setInputCol(self, value):
        return self._set(inputCol=value)

    def getInputCol(self):
        return self.getOrDefault(self.inputCol)

    def _transform(self, dataset):
        # Convert timestamp to date format
        dataset = dataset.withColumn("date", from_unixtime(col(self.getInputCol())))
        
        # Extract month, and day of month
        dataset = dataset.withColumn("month", month(col("date"))) \
                         .withColumn("day", dayofmonth(col("date")))

        # Encode month, day using sine and cosine functions
        dataset = dataset.withColumn("month_sin", sin(col("month") * (2 * self.pi / 12))) \
                         .withColumn("month_cos", cos(col("month") * (2 * self.pi / 12))) \
                         .withColumn("day_sin", sin(col("day") * (2 * self.pi / 31))) \
                         .withColumn("day_cos", cos(col("day") * (2 * self.pi / 31)))

        # Drop intermediate columns
        dataset = dataset.drop(self.getInputCol(), "date", "month", "day")
        return dataset

In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
import math
from math import radians

class GeoToECEF(Transformer, HasInputCol, HasOutputCol):
    """
    Custom transformer to encode longtitude and latitude columns to the
    ECEF format
    """
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(GeoToECEF, self).__init__()
        self._setDefault(outputCol="ecef_coordinates")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
    
    @keyword_only
    def setParams(self, input_col: str = "input", output_col: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def setOutputCol(self, value):
        return self._set(outputCol=value)

    def getOutputCol(self):
        return self.getOrDefault(self.outputCol)

    def _transform(self, dataset):
        def geo_to_ecef(lat, lon):
            a = 6378137.0              # WGS-84 semi-major axis
            e2 = 6.6943799901377997e-3  # WGS-84 first eccentricity squared
            
            lat_rad = radians(lat)
            lon_rad = radians(lon)

            n = a / ((1 - e2 * (math.sin(lat_rad) ** 2)) ** 0.5)
            x = n * math.cos(lat_rad) * math.cos(lon_rad)
            y = n * math.cos(lat_rad) * math.sin(lon_rad)
            z = n * (1 - e2) * math.sin(lat_rad)
            return [x, y, z]

        geo_to_ecef_udf = udf(geo_to_ecef, ArrayType(DoubleType()))
        dataset = dataset.withColumn(self.getOutputCol(), geo_to_ecef_udf(dataset["geo_lat"], dataset["geo_lon"]))
        return dataset

In [16]:
class Disassembler(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """
    A custom transformer to disassemble a list column to a list of columns
    """
    def __init__(self, inputCol, outputCol):
        self.inputCol = inputCol
        self.outputCol = outputCol
        
    def _transform(self, dataset):
        for i, column in enumerate(self.outputCol):
            dataset = dataset.withColumn(column, col(self.inputCol)[i])
        return dataset.drop(self.inputCol)

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, RobustScaler, OneHotEncoder

# No point to have hours, minutes, seconds, as they are not included into the data and will be all zeros
# Year is also the same everywhere
time_features = ["month_sin", "month_cos", "day_sin", "day_cos"]

ecef_features = ["ecef_x", "ecef_y", "ecef_z"]

# Encode the categorical data
encoders = [ OneHotEncoder(inputCol=c, outputCol=f"{c}_encoded") for c in categorical]

# Assemble the features to one column
assembler = VectorAssembler(inputCols= numerical + [f"{c}_encoded" for c in categorical] + time_features + ecef_features, outputCol="unscaled_features")

# Scalr the features using the robust scaler
scaler = RobustScaler(inputCol="unscaled_features", outputCol="features")

# Pipeline to execute transformations
pipeline = Pipeline(stages= [TimeEncoder(), GeoToECEF(), Disassembler("ecef_coordinates", ecef_features)] + encoders + [assembler, scaler])

# Fit the pipeline to the data
pipeline_model = pipeline.fit(df)

# Apply transformations
transformed_df = pipeline_model.transform(df)

print("The transformed dataframe with all stages")
transformed_df.show(truncate=False)

The transformed dataframe with all stages
+---------+-----------+-----+------+-----+-------+------------+-------------+-----------+---------+---------+-------+----------------------+-----------------------+-----------------------+--------------------+------------------+------------------+-----------------+---------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|street_id|postal_code|level|levels|rooms|area   |kitchen_area|building_type|object_type|geo_lon  |geo_lat  |label  |month_sin             |month_cos              |day

In [18]:
data = transformed_df.select(['features', 'label'])

In [19]:
from pyspark.ml.feature import VectorIndexer

featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=7).fit(data)
transformed = featureIndexer.transform(data)

print("The final training dataframe")
# Display the output Spark DataFrame
transformed.show()

The final training dataframe
+--------------------+-------+--------------------+
|            features|  label|     indexedFeatures|
+--------------------+-------+--------------------+
|[1.82175120253412...|4490000|[1.82175120253412...|
|[2.02803957608228...|3700000|[2.02803957608228...|
|[2.35081407323400...| 950000|[2.35081407323400...|
|[1.82205102133927...|3850000|[1.82205102133927...|
|[1.61399415136069...|1050000|[1.61399415136069...|
|[1.93729441772146...|3250000|[1.93729441772146...|
|[2.37665497807846...|5100000|[2.37665497807846...|
|[2.26102051368955...|8900000|[2.26102051368955...|
|[2.29608627829268...|2800000|[2.29608627829268...|
|[0.53561543241258...|5020000|[0.53561543241258...|
|[2.06262736867719...|4300000|[2.06262736867719...|
|[0.70040280004692...|3100000|[0.70040280004692...|
|[2.33782192501053...|3000000|[2.33782192501053...|
|[0.73160133658354...|6350000|[0.73160133658354...|
|[1.20090901585563...|7500000|[1.20090901585563...|
|[1.20090901585563...|4300000|[1.20

In [20]:
data = None
transformed_df = None
df = None
gc.collect()

163

In [21]:
# A function to run commands
import os
def run(command):
    return os.popen(command).read()

In [22]:
(train_data, test_data) = transformed.randomSplit([0.7, 0.3], seed = 10)

train_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > data/train.json")

''

In [23]:
test_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > data/test.json")

''

In [24]:
transformed = None
gc.collect()

158

In [25]:
%%time
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="indexedFeatures", labelCol="label", elasticNetParam=1, fitIntercept=False)
model_lr= lr.fit(train_data)

CPU times: user 10.7 ms, sys: 7.55 ms, total: 18.3 ms
Wall time: 26.3 s


In [26]:
pred_lr = model_lr.transform(test_data)

print("Base Linear Regression model predictions")
pred_lr.show()

Base Linear Regression model predictions
+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|(22,[0,1,2,3,5,7,...|3927490|(22,[0,1,2,3,5,7,...| 4219840.729044616|
|(22,[0,1,2,3,5,7,...|3950000|(22,[0,1,2,3,5,7,...| 4947703.883693755|
|(22,[0,1,2,3,5,11...|1870000|(22,[0,1,2,3,5,11...|1822436.4809295535|
|(22,[0,1,2,3,5,11...|4700000|(22,[0,1,2,3,5,11...| 4751108.369207114|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4762703.562189519|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4746821.686853051|
|(22,[0,1,2,3,5,11...|2877630|(22,[0,1,2,3,5,11...| 4291392.890671641|
|(22,[0,1,2,3,5,11...|3100000|(22,[0,1,2,3,5,11...| 4384701.902546823|
|[0.50574218189876...|2990000|[0.50574218189876...|2410458.8565741777|
|[0.50574218189876...|4100000|[0.50574218189876...|4559868.0876734555|
|[0.50574218189876...|2450000|[0.505

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")

rmse = evaluator_rmse.evaluate(pred_lr)
r2 = evaluator_r2.evaluate(pred_lr)
mae = evaluator_mae.evaluate(pred_lr)

print("The metrics of base linear regression model on the test data")
print("Root Mean Squared Error (RMSE) =", rmse)
print("Mean absolute error (MAE) =", mae)
print("R-squared score =", r2)

The metrics of base linear regression model on the test data
Root Mean Squared Error (RMSE) = 2090230.6035200728
Mean absolute error (MAE) = 1020680.5112522887
R-squared score = 0.6270623447631778


In [28]:
%%time
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = (ParamGridBuilder()
              .addGrid(lr.fitIntercept, [False, True])  # whether to use intercept
              .addGrid(lr.aggregationDepth, [2, 3, 4])     # maximal number of terms to use
              .build())
# param_grid = (ParamGridBuilder()
#               .addGrid(lr.fitIntercept, [True])  # whether to use intercept
#               .addGrid(lr.aggregationDepth, [2])     # maximal number of terms to use
#               .build())

# Create CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator_rmse,
                    numFolds=3,
                    parallelism=5,
                    seed=10)


# Fit CrossValidator
cv_model = cv.fit(train_data)

# Get the best Random Forest model
best_lr_model = cv_model.bestModel

CPU times: user 1.12 s, sys: 548 ms, total: 1.67 s
Wall time: 2min 26s


In [29]:
cv = None
gc.collect()

1471

In [30]:
from pprint import pprint

print("Parameters of the best Linear Regression model")
model1 = best_lr_model
pprint(model1.extractParamMap())

Parameters of the best Linear Regression model
{Param(parent='LinearRegression_ec866017eaa1', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_ec866017eaa1', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LinearRegression_ec866017eaa1', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 1.0,
 Param(parent='LinearRegression_ec866017eaa1', name='featuresCol', doc='features column name.'): 'indexedFeatures',
 Param(parent='LinearRegression_ec866017eaa1', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_ec866017eaa1', name='maxIter', doc='max number of iterations (>= 0).'): 100,
 Param(parent='LinearRegression_ec866017eaa1', name='maxBlockSizeInMB', doc='maximum memory in MB for stacking input d

In [31]:
model1.write().overwrite().save("project/models/model1")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model1 models/model1")

''

In [32]:
predictions1 = model1.transform(test_data)
print("Predicitons of the best random forest model")
predictions1.show()

predictions1.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model1_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

Predicitons of the best random forest model
+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|(22,[0,1,2,3,5,7,...|4084430|(22,[0,1,2,3,5,7,...| 4411958.501964569|
|(22,[0,1,2,3,5,7,...|2558646|(22,[0,1,2,3,5,7,...| 2476026.476934433|
|(22,[0,1,2,3,5,11...|4700000|(22,[0,1,2,3,5,11...| 4787452.422697067|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4701008.048578262|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4794978.135252476|
|(22,[0,1,2,3,5,11...|2877630|(22,[0,1,2,3,5,11...| 4321328.556001663|
|(22,[0,1,2,3,5,11...|3100000|(22,[0,1,2,3,5,11...|  4426955.43289566|
|[0.50574218189876...|2150000|[0.50574218189876...|1467826.8151521683|
|[0.50574218189876...|2300000|[0.50574218189876...|1673430.1669831276|
|[0.50740639352739...|2700000|[0.50740639352739...| 5961153.314994812|
|[0.50740639352739...|4300000|[0.

''

In [33]:
rmse1 = evaluator_rmse.evaluate(predictions1)
mae1 = evaluator_mae.evaluate(predictions1)
r2_1 = evaluator_r2.evaluate(predictions1)

print("Evaluating the best linear regression model on the test set")
print("Root Mean Squared Error (RMSE) =", rmse1)
print("Mean absolute error (MAE) =", mae1)
print("R-squared score =", r2_1)

Evaluating the best linear regression model on the test set
Root Mean Squared Error (RMSE) = 2009503.0773694213
Mean absolute error (MAE) = 994592.6319753438
R-squared score = 0.6392016025939729


In [34]:
%%time
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(featuresCol="indexedFeatures", labelCol="label")
model_rf= rf.fit(train_data)

CPU times: user 23.4 ms, sys: 5.97 ms, total: 29.4 ms
Wall time: 14 s


In [35]:
pred_rf = model_rf.transform(test_data)

print("Base random forest model predictions:")
pred_rf.show()

Base random forest model predictions:
+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|(22,[0,1,2,3,5,7,...|4084430|(22,[0,1,2,3,5,7,...| 4429198.903743034|
|(22,[0,1,2,3,5,7,...|2558646|(22,[0,1,2,3,5,7,...| 2902000.659327418|
|(22,[0,1,2,3,5,11...|4700000|(22,[0,1,2,3,5,11...| 4579081.464546791|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4579081.464546791|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4628377.163535927|
|(22,[0,1,2,3,5,11...|2877630|(22,[0,1,2,3,5,11...|3370620.0008115387|
|(22,[0,1,2,3,5,11...|3100000|(22,[0,1,2,3,5,11...| 3645096.683594787|
|[0.50574218189876...|2150000|[0.50574218189876...|2599787.1752512683|
|[0.50574218189876...|2300000|[0.50574218189876...| 2449876.689317503|
|[0.50740639352739...|2700000|[0.50740639352739...| 5831104.024793202|
|[0.50740639352739...|4300000|[0.507406

In [36]:
rmse_rf = evaluator_rmse.evaluate(pred_rf)
mae_rf = evaluator_mae.evaluate(pred_rf)
r2_rf = evaluator_r2.evaluate(pred_rf)

print("Base Random Forest model metrics on test set")
print("Root Mean Squared Error (RMSE) =", rmse_rf)
print("Mean Absolute Error (MAE) =", mae_rf)
print("R-squared score =", r2_rf)

Base Random Forest model metrics on test set
Root Mean Squared Error (RMSE) = 1889344.379392503
Mean Absolute Error (MAE) = 934979.5913038143
R-squared score = 0.6810596249872345


In [40]:
pred_lr = None
predicitons1 = None
pred_rf = None
gc.collect()

1600

In [53]:
%%time

param_grid = (ParamGridBuilder()
              .addGrid(rf.maxDepth, [10, 5])  # number of trees
              .addGrid(rf.numTrees, [50, 25])     # maximum depth of trees
              .build())
# param_grid = (ParamGridBuilder()
#               .addGrid(rf.maxDepth, [5])  # number of trees
#               .addGrid(rf.numTrees, [20])     # maximum depth of trees
#               .build())

# Create CrossValidator
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator_rmse,
                    numFolds=3,
                    parallelism=5,
                    seed=10)


# Fit CrossValidator
cv_model = cv.fit(train_data)

# Get the best Random Forest model
best_rf_model = cv_model.bestModel

CPU times: user 2.11 s, sys: 888 ms, total: 3 s
Wall time: 5min 23s


In [54]:
from pprint import pprint

print("Parameters of the best Random Forest model")
model2 = best_rf_model
pprint(model2.extractParamMap())

Parameters of the best Random Forest model
{Param(parent='RandomForestRegressor_23e8d13459e7', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,
 Param(parent='RandomForestRegressor_23e8d13459e7', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,
 Param(parent='RandomForestRegressor_23e8d13459e7', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0,
 Param(parent='RandomForestRegressor_23e8d13459e7', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='RandomForestRegressor_23e8d13459e7', name='numTrees', doc='Number of trees to train (>= 1).'): 50,
 Param(parent='RandomForestRegressor_23e8d13459e7', name='minWeightFractionPerNode', doc='Minimum fract

In [55]:
model2.write().overwrite().save("project/models/model2")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model2 models/model2")

''

In [56]:
predictions2 = model2.transform(test_data)
print("Predicitons of the best random forest model")
predictions2.show()

predictions2.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model2_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

Predicitons of the best random forest model
+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|(22,[0,1,2,3,5,7,...|4084430|(22,[0,1,2,3,5,7,...| 4274231.375858683|
|(22,[0,1,2,3,5,7,...|2558646|(22,[0,1,2,3,5,7,...|2458785.7264024885|
|(22,[0,1,2,3,5,11...|4700000|(22,[0,1,2,3,5,11...| 4467907.340648945|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4477801.909140591|
|(22,[0,1,2,3,5,11...|3550080|(22,[0,1,2,3,5,11...| 4533374.857674823|
|(22,[0,1,2,3,5,11...|2877630|(22,[0,1,2,3,5,11...|3198247.2458298607|
|(22,[0,1,2,3,5,11...|3100000|(22,[0,1,2,3,5,11...|  3770929.96491868|
|[0.50574218189876...|2150000|[0.50574218189876...|2274003.5747580933|
|[0.50574218189876...|2300000|[0.50574218189876...|2236578.9848842025|
|[0.50740639352739...|2700000|[0.50740639352739...| 5781417.169736432|
|[0.50740639352739...|4300000|[0.

''

In [57]:
rmse2 = evaluator_rmse.evaluate(predictions2)
mae2 = evaluator_mae.evaluate(predictions2)
r2_2 = evaluator_r2.evaluate(predictions2)

print("Evaluating the best Random Forest model on the test set")
print("Root Mean Squared Error (RMSE) =", rmse2)
print("Mean absolute error (MAE) =", mae2)
print("R-squared score =", r2_2)

Evaluating the best Random Forest model on the test set
Root Mean Squared Error (RMSE) = 1404780.0490305342
Mean absolute error (MAE) = 677721.7049154028
R-squared score = 0.8236790529126035


In [58]:
print("Compare the best models")

# Create data frame to report performance of the models
models = [[str(model1),rmse1, r2_1], [str(model2),rmse2, r2_2]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)


# Save it to HDFS
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/evaluation.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")

Compare the best models
+------------------------------------------------------------------------------------------------+------------------+------------------+
|model                                                                                           |RMSE              |R2                |
+------------------------------------------------------------------------------------------------+------------------+------------------+
|LinearRegressionModel: uid=LinearRegression_ec866017eaa1, numFeatures=22                        |2009503.0773694213|0.6392016025939729|
|RandomForestRegressionModel: uid=RandomForestRegressor_23e8d13459e7, numTrees=50, numFeatures=22|1404780.0490305342|0.8236790529126035|
+------------------------------------------------------------------------------------------------+------------------+------------------+



''