# Connect to Hive

In [1]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = "team0"

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

sc = spark.sparkContext

In [2]:
spark

# list Hive databases

In [3]:
print(spark.catalog.listDatabases())
spark.sql("SHOW DATABASES;").show()

[Database(name='default', description='Default Hive database', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/apps/hive/warehouse'), Database(name='root_db', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/root/root_db'), Database(name='team0_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team0/project/hive/warehouse'), Database(name='team12_hive_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team12/project/hive/warehouse'), Database(name='team23_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/hive/warehouse'), Database(name='team2_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team2/project/hive/warehouse'), Database(name='team7_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team7/project/hive/warehouse'), Database(name='testdb', description='', locationU

# Specify the input and output features

In [4]:
# We will use the following features
# Excluded 'comm' because it has a lot of nulls
# Excuded hiredate because it is given as practice to implement the cos_sin_transformation for the student
features = ['empno', 'ename', 'job', 'mgr', 'deptno']

# The output/target of our model
label = 'sal'

# Read hive tables

In [5]:
emps = spark.read.format("avro").table('team0_projectdb.employees_part')

depts = spark.read.format("avro").table('team0_projectdb.departments')

In [6]:
emps.show()

+-----+--------+-----------+----+----------+-------+-------+------+
|empno|   ename|        job| mgr|  hiredate|    sal|   comm|deptno|
+-----+--------+-----------+----+----------+-------+-------+------+
| 7782| 'CLARK'|  'MANAGER'|7839|1993-05-14|2450.00|   null|    10|
| 7839|  'KING'|'PRESIDENT'|null|1990-06-09|5000.00|   0.00|    10|
| 7934|'MILLER'|    'CLERK'|7782|2000-01-21|1300.00|   null|    10|
| 7369| 'SMITH'|    'CLERK'|7902|1993-06-13| 800.00|   0.00|    20|
| 7566| 'JONES'|  'MANAGER'|7839|1995-10-31|2975.00|   null|    20|
| 7788| 'SCOTT'|  'ANALYST'|7566|1996-03-05|3000.00|   null|    20|
| 7876| 'ADAMS'|    'CLERK'|7788|1999-06-04|1100.00|   null|    20|
| 7902|  'FORD'|  'ANALYST'|7566|1997-12-05|3000.00|   null|    20|
| 7499| 'ALLEN'| 'SALESMAN'|7698|1998-08-15|1600.00| 300.00|    30|
| 7521|  'WARD'| 'SALESMAN'|7698|1996-03-26|1250.00| 500.00|    30|
| 7698| 'BLAKE'|  'MANAGER'|7839|1992-06-11|2850.00|   null|    30|
| 7844|'TURNER'| 'SALESMAN'|7698|1995-06-04|1500

In [7]:
depts.show()

+------+------------+----------+
|deptno|       dname|  location|
+------+------------+----------+
|    10|'Accounting'|'New York'|
|    20|  'Research'|  'Dallas'|
|    30|     'Sales'| 'Chicago'|
|    40|'Operations'|  'Boston'|
+------+------------+----------+



# Feature selection

In [8]:
import pyspark.sql.functions as F

# Remove the quotes before and after each string in job and ename columns.
emps = emps.withColumn("job", F.translate("job","'",""))
emps.show()
emps = emps.withColumn("ename", F.translate("ename","'",""))
emps.show()

+-----+--------+---------+----+----------+-------+-------+------+
|empno|   ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+--------+---------+----+----------+-------+-------+------+
| 7782| 'CLARK'|  MANAGER|7839|1993-05-14|2450.00|   null|    10|
| 7839|  'KING'|PRESIDENT|null|1990-06-09|5000.00|   0.00|    10|
| 7934|'MILLER'|    CLERK|7782|2000-01-21|1300.00|   null|    10|
| 7369| 'SMITH'|    CLERK|7902|1993-06-13| 800.00|   0.00|    20|
| 7566| 'JONES'|  MANAGER|7839|1995-10-31|2975.00|   null|    20|
| 7788| 'SCOTT'|  ANALYST|7566|1996-03-05|3000.00|   null|    20|
| 7876| 'ADAMS'|    CLERK|7788|1999-06-04|1100.00|   null|    20|
| 7902|  'FORD'|  ANALYST|7566|1997-12-05|3000.00|   null|    20|
| 7499| 'ALLEN'| SALESMAN|7698|1998-08-15|1600.00| 300.00|    30|
| 7521|  'WARD'| SALESMAN|7698|1996-03-26|1250.00| 500.00|    30|
| 7698| 'BLAKE'|  MANAGER|7839|1992-06-11|2850.00|   null|    30|
| 7844|'TURNER'| SALESMAN|7698|1995-06-04|1500.00|   0.00|    30|
| 7900| 'J

In [9]:
emps = emps.select(features + [label]).na.drop()
emps = emps.withColumn("ename_job", F.concat(F.col('ename'), F.lit("_"), F.col('job')))
emps = emps.withColumnRenamed("sal","label")

emps.show()

+-----+------+--------+----+------+-------+---------------+
|empno| ename|     job| mgr|deptno|  label|      ename_job|
+-----+------+--------+----+------+-------+---------------+
| 7782| CLARK| MANAGER|7839|    10|2450.00|  CLARK_MANAGER|
| 7934|MILLER|   CLERK|7782|    10|1300.00|   MILLER_CLERK|
| 7369| SMITH|   CLERK|7902|    20| 800.00|    SMITH_CLERK|
| 7566| JONES| MANAGER|7839|    20|2975.00|  JONES_MANAGER|
| 7788| SCOTT| ANALYST|7566|    20|3000.00|  SCOTT_ANALYST|
| 7876| ADAMS|   CLERK|7788|    20|1100.00|    ADAMS_CLERK|
| 7902|  FORD| ANALYST|7566|    20|3000.00|   FORD_ANALYST|
| 7499| ALLEN|SALESMAN|7698|    30|1600.00| ALLEN_SALESMAN|
| 7521|  WARD|SALESMAN|7698|    30|1250.00|  WARD_SALESMAN|
| 7698| BLAKE| MANAGER|7839|    30|2850.00|  BLAKE_MANAGER|
| 7844|TURNER|SALESMAN|7698|    30|1500.00|TURNER_SALESMAN|
| 7900| JAMES|   CLERK|7698|    30| 950.00|    JAMES_CLERK|
| 7654|MARTIN|SALESMAN|7698|    30|1250.00|MARTIN_SALESMAN|
+-----+------+--------+----+------+-----

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Tokenizer, RegexTokenizer
from pyspark.sql.functions import col

categoricalCols = ['deptno']
textCols = ['ename_job']
others = ['empno', 'mgr']

# Feature extraction

In [11]:

# Since the tokenizer only return tokens separated by white spaces, I used RegexTokenizer to tokenize by '_'
# Then created word2Vec model

# tokenizer = Tokenizer(inputCol="ename", outputCol="ename_tokens")
# emps_tok = tokenizer.transform(emps)
tokenizer = RegexTokenizer(inputCol=textCols[0], outputCol="ename_job_tokens", pattern="_")
# emps_tok = tokenizer.transform(emps)
# emps_tok.show()

word2Vec = Word2Vec(vectorSize=5, seed=42, minCount=1, inputCol="ename_job_tokens", outputCol="ename_enc")
# word2VecModel = word2Vec.fit(emps_tok)
# print(word2VecModel)

# emps_tok = word2VecModel.transform(emps_tok)
# emps_tok.show()

# Adding the encoded ename_job to the list of other columns
# others += [ename_enc]


# Create String indexer to assign index for the string fields where each unique string will get a unique index
# String Indexer is required as an input for One-Hot Encoder 
# We set the case as `skip` for any string out of the input strings
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categoricalCols ]

# Encode the strings using One Hot encoding
# default setting: dropLast=True ==> For example with 5 categories, an input value of 2.0 would map to an output vector of [0.0, 0.0, 1.0, 0.0]. The last category is not included by default (configurable via dropLast), because it makes the vector entries sum up to one, and hence linearly dependent. So an input value of 4.0 maps to [0.0, 0.0, 0.0, 0.0].
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

# This will concatenate the input cols into a single column.
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + others, outputCol= "features")

# You can create a pipeline to use only a single fit and transform on the data.
pipeline = Pipeline(stages=[tokenizer, word2Vec] + indexers + encoders + [assembler])


# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model=pipeline.fit(emps)
# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(emps)

data.show()

# We delete all features and keep only the features and label columns
data = data.select(["features", "label"])


from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4
# distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
transformed = featureIndexer.transform(data)

# Display the output Spark DataFrame
transformed.show()

+-----+------+--------+----+------+-------+---------------+------------------+--------------------+--------------+----------------------+--------------------+
|empno| ename|     job| mgr|deptno|  label|      ename_job|  ename_job_tokens|           ename_enc|deptno_indexed|deptno_indexed_encoded|            features|
+-----+------+--------+----+------+-------+---------------+------------------+--------------------+--------------+----------------------+--------------------+
| 7782| CLARK| MANAGER|7839|    10|2450.00|  CLARK_MANAGER|  [clark, manager]|[-0.0366689357906...|           2.0|             (2,[],[])|[0.0,0.0,7782.0,7...|
| 7934|MILLER|   CLERK|7782|    10|1300.00|   MILLER_CLERK|   [miller, clerk]|[-0.0071940987836...|           2.0|             (2,[],[])|[0.0,0.0,7934.0,7...|
| 7369| SMITH|   CLERK|7902|    20| 800.00|    SMITH_CLERK|    [smith, clerk]|[-0.0284428878221...|           1.0|         (2,[1],[1.0])|[0.0,1.0,7369.0,7...|
| 7566| JONES| MANAGER|7839|    20|2975.00|  J

# Split the dataset

In [12]:
#  split the data into 60% training and 40% test (it is not stratified)
(train_data, test_data) = transformed.randomSplit([0.6, 0.4], seed = 10)

def run(command):
    import os
    return os.popen(command).read()

train_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > data/train.json")

test_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > data/test.json")

''

# First model

## Build a model

In [13]:
from pyspark.ml.regression import LinearRegression
# Create Linear Regression Model
lr = LinearRegression()

# Fit the data to the pipeline stages
model_lr = lr.fit(train_data)

## Predict for test data

In [14]:
predictions = model_lr.transform(test_data)
predictions.show()

+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|[0.0,1.0,7566.0,7...|2975.00|[0.0,1.0,7566.0,7...|1924.5270478162856|
|[0.0,1.0,7876.0,7...|1100.00|[0.0,1.0,7876.0,7...| 3170.043729058416|
|[1.0,0.0,7521.0,7...|1250.00|[1.0,0.0,7521.0,7...|1451.4393991361212|
|[1.0,0.0,7844.0,7...|1500.00|[1.0,0.0,7844.0,7...|2802.3757762021633|
|[1.0,0.0,7900.0,7...| 950.00|[1.0,0.0,7900.0,7...| 3036.593847767792|
+--------------------+-------+--------------------+------------------+



## Evaluate the model

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator1_rmse.evaluate(predictions)
r2 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))
print("R^2 on test data = {}".format(r2))

Root Mean Squared Error (RMSE) on test data = 1515.2035403198176
R^2 on test data = -3.274514557061458


## Hyperparameter optimization

In [16]:
model_lr.params

[Param(parent='LinearRegression_2abd6a80678b', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'),
 Param(parent='LinearRegression_2abd6a80678b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'),
 Param(parent='LinearRegression_2abd6a80678b', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'),
 Param(parent='LinearRegression_2abd6a80678b', name='featuresCol', doc='features column name.'),
 Param(parent='LinearRegression_2abd6a80678b', name='fitIntercept', doc='whether to fit an intercept term.'),
 Param(parent='LinearRegression_2abd6a80678b', name='labelCol', doc='label column name.'),
 Param(parent='LinearRegression_2abd6a80678b', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'),
 Param(parent='LinearRegression_2abd6a80678b', name='m

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np


grid = ParamGridBuilder()
grid = grid.addGrid(
                    model_lr.aggregationDepth, [2, 3, 4])\
                    .addGrid(model_lr.regParam, np.logspace(1e-3,1e-1)
                    )\
                    .build()

cv = CrossValidator(estimator = lr, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator1_rmse,
                    parallelism = 5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

LinearRegressionModel: uid=LinearRegression_2abd6a80678b, numFeatures=4

## Best model 1


In [18]:
from pprint import pprint
model1 = bestModel
pprint(model1.extractParamMap())

{Param(parent='LinearRegression_2abd6a80678b', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-06,
 Param(parent='LinearRegression_2abd6a80678b', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_2abd6a80678b', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_2abd6a80678b', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LinearRegression_2abd6a80678b', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 'squaredError',
 Param(parent='LinearRegression_2abd6a80678b', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='LinearRegression_2abd6a80678b', name='solver', doc='The solver algorithm for optimization. Supported options: auto, normal, l-bfgs.'): 'auto',
 Param(parent='LinearRegression_2abd6a80678b', name='standardization', doc='whether to standardize the trainin

## Save the model to HDFS

In [19]:
model1.write().overwrite().save("project/models/model1")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model1 models/model1")

''

## Predict for test data using best model1

In [20]:
predictions = model1.transform(test_data)
predictions.show()

+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|[0.0,1.0,7566.0,7...|2975.00|[0.0,1.0,7566.0,7...|1919.1792222987642|
|[0.0,1.0,7876.0,7...|1100.00|[0.0,1.0,7876.0,7...| 3153.062261013358|
|[1.0,0.0,7521.0,7...|1250.00|[1.0,0.0,7521.0,7...|  1457.89900065869|
|[1.0,0.0,7844.0,7...|1500.00|[1.0,0.0,7844.0,7...|2793.4353807326916|
|[1.0,0.0,7900.0,7...| 950.00|[1.0,0.0,7900.0,7...| 3024.983483779586|
+--------------------+-------+--------------------+------------------+



In [21]:
predictions.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model1_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

''

## Evaluate the best model1

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r21 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R^2 on test data = {}".format(r21))

Root Mean Squared Error (RMSE) on test data = 1506.7564965041295
R^2 on test data = -3.2269877858078546


# Second model

## Build a model

In [23]:
from pyspark.ml.regression import GBTRegressor

# Create Linear Regression Model
gbt = GBTRegressor()

# Fit the data to the pipeline stages
model_gbt = gbt.fit(train_data)

## Predict for test data

In [24]:
predictions = model_gbt.transform(test_data)
predictions.show()

+--------------------+-------+--------------------+----------+
|            features|  label|     indexedFeatures|prediction|
+--------------------+-------+--------------------+----------+
|[0.0,1.0,7566.0,7...|2975.00|[0.0,1.0,7566.0,7...|     800.0|
|[0.0,1.0,7876.0,7...|1100.00|[0.0,1.0,7876.0,7...|    3000.0|
|[1.0,0.0,7521.0,7...|1250.00|[1.0,0.0,7521.0,7...|    1600.0|
|[1.0,0.0,7844.0,7...|1500.00|[1.0,0.0,7844.0,7...|    2850.0|
|[1.0,0.0,7900.0,7...| 950.00|[1.0,0.0,7900.0,7...|    2850.0|
+--------------------+-------+--------------------+----------+



## Evaluate the model

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

Root Mean Squared Error (RMSE) on test data = 1667.0707843400053
R^2 on test data = -4.174315769875256


## Hyperparameter optimization

In [26]:
model_gbt.params

[Param(parent='GBTRegressor_24a0b179d25a', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'),
 Param(parent='GBTRegressor_24a0b179d25a', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'),
 Param(parent='GBTRegressor_24a0b179d25a', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'a

In [27]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np


grid = ParamGridBuilder()
grid = grid.addGrid(model_gbt.maxDepth, [2, 5, 10]).addGrid(model_gbt.lossType, ['squared', 'absolute']).build()

cv = CrossValidator(estimator = gbt, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator2_rmse,
                    parallelism = 5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

GBTRegressionModel: uid=GBTRegressor_24a0b179d25a, numTrees=20, numFeatures=4

## Best model 2


In [28]:
from pprint import pprint
model2 = bestModel
pprint(model2.extractParamMap())

{Param(parent='GBTRegressor_24a0b179d25a', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 32,
 Param(parent='GBTRegressor_24a0b179d25a', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5,
 Param(parent='GBTRegressor_24a0b179d25a', name='validationTol', doc='Threshold for stopping early when fit with validation is used. If the error rate on the validation input changes by less than the validationTol, then learning will stop early (before `maxIter`). This parameter is ignored when fit without validation is used.'): 0.01,
 Param(parent='GBTRegressor_24a0b179d25a', name='maxIter', doc='max number of iterations (>= 0).'): 20,
 Param(parent='GBTRegressor_24a0b179d25a', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorde

## Save the model to HDFS

In [29]:
model2.write().overwrite().save("project/models/model2")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model2 models/model2")

''

## Predict for test data using best model2

In [30]:
predictions = model2.transform(test_data)
predictions.show()

+--------------------+-------+--------------------+------------------+
|            features|  label|     indexedFeatures|        prediction|
+--------------------+-------+--------------------+------------------+
|[0.0,1.0,7566.0,7...|2975.00|[0.0,1.0,7566.0,7...| 800.0999999999999|
|[0.0,1.0,7876.0,7...|1100.00|[0.0,1.0,7876.0,7...|3000.1000000000004|
|[1.0,0.0,7521.0,7...|1250.00|[1.0,0.0,7521.0,7...|1600.1000000000004|
|[1.0,0.0,7844.0,7...|1500.00|[1.0,0.0,7844.0,7...|2850.1000000000004|
|[1.0,0.0,7900.0,7...| 950.00|[1.0,0.0,7900.0,7...|2850.1000000000004|
+--------------------+-------+--------------------+------------------+



In [31]:
predictions.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model2_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

''

## Evaluate the best model2

In [32]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

Root Mean Squared Error (RMSE) on test data = 1667.1106771897303
R^2 on test data = -4.174563414634148


# Compare best models

In [33]:
models = [[str(model1),rmse1, r21], [str(model2),rmse2, r22]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)

+-----------------------------------------------------------------------------+------------------+-------------------+
|model                                                                        |RMSE              |R2                 |
+-----------------------------------------------------------------------------+------------------+-------------------+
|LinearRegressionModel: uid=LinearRegression_2abd6a80678b, numFeatures=4      |1506.7564965041295|-3.2269877858078546|
|GBTRegressionModel: uid=GBTRegressor_24a0b179d25a, numTrees=20, numFeatures=4|1667.1106771897303|-4.174563414634148 |
+-----------------------------------------------------------------------------+------------------+-------------------+



In [34]:
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/evaluation.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")

''