In [0]:
# Display the `spark` object. It is not created anywhere in this notebook;
# presumably it is the SparkSession that Databricks pre-defines -- confirm.
spark

In [0]:
# Read the preprocessed train and test CSV files stored in the Databricks DBFS.
# Both files are read with a header row and with schema inference switched on.
csv_options = {"header": "true", "inferSchema": "true"}
train_df = spark.read.options(**csv_options).csv('/FileStore/tables/merge_pd.csv')
test_df = spark.read.options(**csv_options).csv('/FileStore/tables/merge_pd_test.csv')

In [0]:
# Print the row counts: test set first, then train set
for frame in (test_df, train_df):
    print(frame.count())

In [0]:
# Preview the test data (first 20 rows, long cell values truncated -- show()'s defaults)
test_df.show(n=20, truncate=True)

In [0]:
# Preview the training data (first 20 rows, long cell values truncated -- show()'s defaults)
train_df.show(n=20, truncate=True)

In [0]:
# Remove the '_c0' column from both datasets
# (presumably a leftover unnamed index column from the CSV export -- confirm).
train_df, test_df = [frame.drop('_c0') for frame in (train_df, test_df)]

In [0]:
# Fill (not filter) null values in the train and test data, then mark both for caching
def _fill_missing_values(frame):
    """Return `frame` with nulls replaced by zero in the selected columns.

    The four weather columns are filled with 0.0 and 'square_feet' with 0,
    in that order; all other columns are left untouched.
    """
    weather_filled = frame.fillna(
        0.0,
        subset=['air_temperature', 'cloud_coverage', 'dew_temperature', 'sea_level_pressure'],
    )
    return weather_filled.fillna(0, subset=['square_feet'])


train_df = _fill_missing_values(train_df)
train_df.cache()
test_df = _fill_missing_values(test_df)
test_df.cache()

In [0]:
# Vectorize all features into one column
from pyspark.ml.feature import VectorAssembler

# Start from every column of the training data ...
feature_cols = list(train_df.columns)
print(feature_cols)
# ... and take out the label so it is not fed to the model as an input.
feature_cols.remove("meter_reading")
print(feature_cols)

# Fail fast: the fitted pipeline is later applied to test_df, so every feature
# column has to exist there as well. Checking here surfaces a train/test column
# mismatch before model training instead of after it. Names are compared
# case-insensitively so the check is not stricter than Spark's default
# (case-insensitive) column resolution.
test_columns = {name.lower() for name in test_df.columns}
missing_in_test = [name for name in feature_cols if name.lower() not in test_columns]
if missing_in_test:
    raise ValueError("test_df is missing feature columns: {}".format(missing_in_test))

# Combine all feature columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [0]:
# Build and train the machine learning model with Spark ML
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline

# Model: a gradient-boosted tree regressor predicting 'meter_reading' from the
# assembled 'features' vector. NOTE: despite its name, `lr` is a GBTRegressor.
gbt_settings = {'featuresCol': 'features', 'labelCol': 'meter_reading', 'maxDepth': 12}
lr = GBTRegressor(**gbt_settings)

# Chain vectorization and model training in one Pipeline.
# NOTE: `pipeline_model` is the unfitted Pipeline; the fitted result is `model`.
stages = [assembler, lr]
pipeline_model = Pipeline(stages=stages)
model = pipeline_model.fit(train_df)

In [0]:
# Apply the fitted pipeline to the test data and preview the result
predictions = model.transform(test_df)
predictions.show(n=20, truncate=True)

In [0]:
# Number of rows in the prediction output
prediction_count = predictions.count()
print(prediction_count)

In [0]:
# Keep only the model output column and preview it
output_columns = ['prediction']
predictions_df = predictions.select(*output_columns)
predictions_df.show(n=20, truncate=True)

In [0]:
# Convert the Spark predictions to a pandas DataFrame.
# NOTE(review): toPandas() collects every row to the driver, so this is only
# safe when the prediction set fits in driver memory.
prediction_pd = predictions_df.toPandas()

In [0]:
# Show the (rows, columns) shape of the collected pandas predictions
prediction_pd.shape

In [0]:
# Export the predictions as CSV to the Databricks DBFS.
# Spark creates a *directory* at the given path containing part-*.csv file(s);
# coalesce(1) limits the output to a single part file. coalesce is kept
# deliberately: the export has no row identifier column, so the rows should stay
# in their existing order rather than go through a shuffle.
predictions_df.cache()
(
    predictions_df.coalesce(1)
    .write
    # mode='overwrite': the default save mode raises if the path already exists,
    # which made every re-run of this notebook fail at this cell.
    # Built-in CSV writer, consistent with spark.read.csv used when loading.
    .csv('dbfs:/FileStore/tables/result_gbt_1.csv', mode='overwrite', header='true')
)

In [0]:
# How to download prediction results to the local file system?
# see https://towardsdatascience.com/databricks-how-to-save-files-in-csv-on-your-local-computer-3d0c70e6a9ab

# https://adb-6274230260614101.1.azuredatabricks.net/FileStore/tables/result.csv/part-00000-tid-6027726902046323268-ab465105-0da3-4fab-a127-ed268f0cbd2b-473-1-c000.csv?o=6274230260614101