In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import lbl2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from pyspark.sql.functions import date_format
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


In [2]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

22/10/04 21:51:46 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 192.168.0.66 instead (on interface en0)
22/10/04 21:51:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 21:51:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read in data from ETL.py file
%run '../scripts/ETL.py' '../scripts/paths.json'
final_join3.limit(5)



22/10/04 21:51:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

22/10/04 21:52:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694


In [4]:
final_join3.count()

                                                                                

10540181

In [5]:
tagged_merchants = pd.read_csv("../data/curated/tagged_merchants.csv")
tagged_merchants = tagged_merchants.iloc[:,1:]
tagged_merchants.drop(['tags', 'name', 'cleaned_tags', 'store_type'], axis=1, inplace=True)
tagged_merchants.to_parquet("../data/curated/tagged_merchants.parquet")
tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [6]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [7]:
tagged_merchants_sdf.show(5)

+-------------------+--------------------+
|tagged_merchant_abn|            category|
+-------------------+--------------------+
|        10023283211|           Furniture|
|        10142254217|         Electronics|
|        10165489824|        Toys and DIY|
|        10187291046|        Toys and DIY|
|        10192359162|Books, Stationary...|
+-------------------+--------------------+
only showing top 5 rows



In [8]:
final_join3.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
INNER JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")

joint = joint.drop('tagged_merchant_abn')

In [9]:
joint.count()

                                                                                

10109254

In [10]:
joint.createOrReplaceTempView("group")

a = spark.sql(""" 

SELECT *, (take_rate - dollar_value) AS BNPL_earning
FROM group
""")

In [11]:
# Extracting the year, month, day from the timestamp

a = a.withColumn("Year", 
date_format('order_datetime', 'yyyy'))

a  = a.withColumn("Month", 
date_format('order_datetime', 'MMMM'))


a = a.withColumn("Day",
date_format(("order_datetime"), "E"))


In [12]:
a = a.drop('merchant_abn', 'categories','name', 'address', 'trans_merchant_abn', 'order_id','order_datetime','user_id','consumer_id','int_sa2',
'SA2_name','state_code','state_name','population_2020', 'population_2021','BNPL_earning')

In [13]:
 
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
a.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in a.columns]
   ).show()



+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+
|merchant_name|take_rate|revenue_levels|state|gender|dollar_value|postcodes|SA2_code|income_2018-2019|total_males|total_females|total_persons|category|Year|Month|Day|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+
|            0|        0|             0|    0|     0|           0|        0|       0|               0|          0|            0|            0|       0|   0|    0|  0|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+



                                                                                

In [14]:
a.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- postcodes: string (nullable = true)
 |-- SA2_code: long (nullable = true)
 |-- income_2018-2019: long (nullable = true)
 |-- total_males: long (nullable = true)
 |-- total_females: long (nullable = true)
 |-- total_persons: long (nullable = true)
 |-- category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)



In [15]:
a.createOrReplaceTempView("agg")

male = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS m_name, COUNT(gender) as males
FROM agg
WHERE gender = 'Male'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")

male.show(5)

female = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS f_name, COUNT(gender) as females
FROM agg
WHERE gender = 'Female'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")
female.show(5)

                                                                                

+--------------------+-----+
|              m_name|males|
+--------------------+-----+
|Pede Nonummy Corp...|    2|
|Suspendisse Dui C...|    1|
|Maecenas Industri...|    1|
|Lorem Ipsum Sodal...|    1|
|Ultricies Digniss...|    1|
+--------------------+-----+
only showing top 5 rows





+--------------------+-------+
|              f_name|females|
+--------------------+-------+
|Eget Company10902...|      2|
|Taciti PC10902117...|      1|
|Est Nunc Consulti...|      1|
|Eget Metus In Cor...|      1|
|Cras Eget Foundat...|      1|
+--------------------+-------+
only showing top 5 rows



                                                                                

In [16]:
a.show(2)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018-2019|total_males|total_females|total_persons|            category|Year| Month|Day|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|August|Fri|
|Morbi Accumsan In...|     1.52|             c|  NSW|  Male|62.90176609196828|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|   May|Sun|
+--------------------+---------+---

                                                                                

In [17]:
a.createOrReplaceTempView("agg")

temp = spark.sql(""" 

SELECT merchant_name, COUNT(merchant_name) AS no_of_transactions, SA2_code, Year, Month, Day, SUM(take_rate - dollar_value) AS BNPL_earnings,
    CONCAT(merchant_name, SA2_code, Year, Month, Day) AS join_col
FROM agg
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")

temp.show()




+--------------------+------------------+---------+----+------+---+--------------------+--------------------+
|       merchant_name|no_of_transactions| SA2_code|Year| Month|Day|       BNPL_earnings|            join_col|
+--------------------+------------------+---------+----+------+---+--------------------+--------------------+
|    Euismod Enim LLC|                 1|210021234|2021|August|Fri| -19.056022223176804|Euismod Enim LLC2...|
|Tempus Scelerisqu...|                 1|510021267|2021|August|Sun|  -9.093472011562348|Tempus Scelerisqu...|
|  Aliquam Gravida PC|                 1|315021404|2021|August|Mon|  -452.9431699416258|Aliquam Gravida P...|
|  Pede Nonummy Corp.|                 7|315031410|2021|  July|Fri| -123.63602658465202|Pede Nonummy Corp...|
|       Vel Institute|                 1|305031120|2021|   May|Fri|  -77.77982384549502|Vel Institute3050...|
|Sed Neque Associates|                 1|315031408|2021|  July|Fri|  -545.8554130974492|Sed Neque Associa...|
|         

                                                                                

In [18]:
temp.createOrReplaceTempView("gender_join")
male.createOrReplaceTempView("m")
female.createOrReplaceTempView("f")

temp2 = spark.sql(""" 

SELECT *
FROM gender_join
INNER JOIN m
ON gender_join.join_col = m.m_name
""")

temp2.createOrReplaceTempView("temp2")

temp3 = spark.sql(""" 

SELECT *
FROM temp2
INNER JOIN f
ON temp2.join_col = f.f_name
""")

temp3.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,BNPL_earnings,join_col,m_name,males,f_name,females
A Arcu Industries,2,309011227,2021,October,Mon,-447.4473276205271,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Auctor Non Corp...,2,205031089,2022,August,Wed,-153.87443648170074,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,2,211051282,2022,October,Wed,-110.05960880156464,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,2,211051286,2021,September,Tue,-110.84792368624356,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,2,303041070,2022,May,Mon,-36.09463423218621,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1


In [19]:
'SA2_code', 'total_males', 'total_females','income_2018-2019','total_persons', 

('SA2_code',
 'total_males',
 'total_females',
 'income_2018-2019',
 'total_persons')

In [20]:
a = a.withColumnRenamed('income_2018-2019',

    'income_2018_2019'    
)

a = a.withColumn('income_per_persons',
    (F.col('income_2018_2019')/F.col('total_persons'))
)


In [21]:
a.show(1)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+------------------+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018_2019|total_males|total_females|total_persons|            category|Year| Month|Day|income_per_persons|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+------------------+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|August|Fri|19291.422615738902|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-----------

                                                                                

In [22]:
a.createOrReplaceTempView("features")

e = spark.sql(""" 

SELECT merchant_name AS drop_name, FIRST(take_rate) AS take_rate, FIRST(revenue_levels) AS revenue_levels, FIRST(category) AS category,
    FIRST(total_males) AS males_in_SA2, FIRST(total_females) AS females_in_SA2, FIRST(income_per_persons) AS income_per_person
FROM features
GROUP BY merchant_name
""")

e.show(2)



+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|      drop_name|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|   A Associates|     4.95|             b|Books, Stationary...|        9762|         10846|22526.523772559674|
|A Felis Company|     4.32|             b|Books, Stationary...|        1080|          1051| 33927.61168708765|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
only showing top 2 rows



                                                                                

In [23]:
temp3.createOrReplaceTempView("edit")
e.createOrReplaceTempView("rates")

temp4 = spark.sql(""" 

SELECT *
FROM edit
INNER JOIN rates
ON edit.merchant_name = rates.drop_name
""")

train = temp4.drop('m_name', 'f_name', 'drop_name','join_col')

train.limit(5)

[Stage 379:> (0 + 8) / 14][Stage 380:> (0 + 0) / 14][Stage 381:> (0 + 0) / 14]  

22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:56:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,BNPL_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person
A Arcu Industries,2,309011227,2021,October,Mon,-447.4473276205271,1,1,3.0,c,Furniture,4821,4683,25816.03452631579
A Auctor Non Corp...,2,205031089,2022,August,Wed,-153.87443648170074,1,1,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,211051282,2022,October,Wed,-110.05960880156464,1,1,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,211051286,2021,September,Tue,-110.84792368624356,1,1,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,303041070,2022,May,Mon,-36.09463423218621,1,1,5.58,a,Furniture,2067,2014,22634.72370679088


In [24]:
train.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_transactions: long (nullable = false)
 |-- SA2_code: long (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- BNPL_earnings: double (nullable = true)
 |-- males: long (nullable = false)
 |-- females: long (nullable = false)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: long (nullable = true)
 |-- females_in_SA2: long (nullable = true)
 |-- income_per_person: double (nullable = true)



In [25]:
train.count()

                                                                                

507978

In [26]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'Day', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'Day_num', 'revenue_levels_num','category_num'])

indexd_data = indexer.fit(train).transform(train)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'Day_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'Day_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['no_of_transactions','take_rate', 'merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'Day_vec', 'revenue_levels_vec','category_vec','males_in_SA2', 'females_in_SA2', 'income_per_person'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

                                                                                

In [27]:
# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "BNPL_earnings",
    "label"
)

In [28]:
# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

[Stage 607:> (0 + 8) / 14][Stage 608:> (0 + 0) / 14][Stage 609:> (0 + 0) / 14]4]

22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:58:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [29]:
# Split the data into training and validation sets (30% held out for testing)

trainingData, testData = outdata1.randomSplit([0.7, 0.3], seed = 20)

In [30]:
trainingData.count(), testData.count()

[Stage 651:> (0 + 8) / 14][Stage 652:> (0 + 0) / 14][Stage 653:> (0 + 0) / 14]  

22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 651:=>(8 + 6) / 14][Stage 652:> (0 + 2) / 14][Stage 653:> (0 + 0) / 14]

22/10/04 21:59:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 21:59:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/04 21:59:58 WARN DAGScheduler: Broadcasting large task binary with size 1309.8 KiB




22/10/04 22:00:41 WARN DAGScheduler: Broadcasting large task binary with size 1309.8 KiB


                                                                                

(355294, 152684)

In [31]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")


# Train model.  
model = rf.fit(trainingData)

# Make predictions.
predictions_validation = model.transform(testData)



22/10/04 22:01:21 WARN DAGScheduler: Broadcasting large task binary with size 1316.4 KiB


                                                                                

22/10/04 22:01:23 WARN DAGScheduler: Broadcasting large task binary with size 1316.5 KiB


                                                                                

22/10/04 22:01:31 WARN DAGScheduler: Broadcasting large task binary with size 1320.4 KiB


                                                                                

22/10/04 22:01:38 WARN DAGScheduler: Broadcasting large task binary with size 1472.8 KiB


[Stage 841:>                                                       (0 + 8) / 14]

22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_3 in memory! (computed 255.3 MiB so far)
22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_7 in memory! (computed 255.3 MiB so far)
22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_6 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_2 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_5 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:44 WARN MemoryStore: Not enough space to cache rdd_1915_4 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:44 WARN BlockManager: Persisting block rdd_1915_3 to disk instead.
22/10/04 22:01:44 WARN BlockManager: Persisting block rdd_1915_4 to disk instead.
22/10/04 22:01:44 WARN BlockManager: Persisting block rdd_1915_2 to disk instead.
22/10/04 22:01:44 WARN BlockManager: Persisting block rdd_1915_5 to disk instead.




22/10/04 22:01:53 WARN MemoryStore: Not enough space to cache rdd_1915_9 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:53 WARN BlockManager: Persisting block rdd_1915_9 to disk instead.
22/10/04 22:01:53 WARN MemoryStore: Not enough space to cache rdd_1915_11 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:53 WARN BlockManager: Persisting block rdd_1915_11 to disk instead.
22/10/04 22:01:53 WARN MemoryStore: Not enough space to cache rdd_1915_8 in memory! (computed 255.3 MiB so far)
22/10/04 22:01:53 WARN BlockManager: Persisting block rdd_1915_8 to disk instead.
22/10/04 22:01:53 WARN MemoryStore: Not enough space to cache rdd_1915_12 in memory! (computed 168.8 MiB so far)
22/10/04 22:01:53 WARN BlockManager: Persisting block rdd_1915_12 to disk instead.
22/10/04 22:01:53 WARN MemoryStore: Not enough space to cache rdd_1915_10 in memory! (computed 255.3 MiB so far)
22/10/04 22:01:53 WARN BlockManager: Persisting block rdd_1915_10 to disk instead.
22/10/04 22:01:54 WARN M

                                                                                

22/10/04 22:01:56 WARN DAGScheduler: Broadcasting large task binary with size 1550.6 KiB


[Stage 853:>                                                       (0 + 8) / 14]

22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_1 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_0 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_4 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_5 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_2 in memory! (computed 112.5 MiB so far)
22/10/04 22:01:57 WARN MemoryStore: Not enough space to cache rdd_1915_6 in memory! (computed 168.8 MiB so far)




22/10/04 22:02:02 WARN MemoryStore: Not enough space to cache rdd_1915_12 in memory! (computed 168.8 MiB so far)
22/10/04 22:02:02 WARN MemoryStore: Not enough space to cache rdd_1915_11 in memory! (computed 255.3 MiB so far)


                                                                                

22/10/04 22:02:05 WARN DAGScheduler: Broadcasting large task binary with size 1695.8 KiB
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_6 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_1 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_2 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_5 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_0 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:05 WARN MemoryStore: Not enough space to cache rdd_1915_4 in memory! (computed 74.8 MiB so far)




22/10/04 22:02:10 WARN MemoryStore: Not enough space to cache rdd_1915_12 in memory! (computed 168.8 MiB so far)
22/10/04 22:02:10 WARN MemoryStore: Not enough space to cache rdd_1915_11 in memory! (computed 255.3 MiB so far)


                                                                                

22/10/04 22:02:13 WARN DAGScheduler: Broadcasting large task binary with size 1921.5 KiB
22/10/04 22:02:13 WARN MemoryStore: Not enough space to cache rdd_1915_5 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:13 WARN MemoryStore: Not enough space to cache rdd_1915_2 in memory! (computed 49.0 MiB so far)
22/10/04 22:02:13 WARN MemoryStore: Not enough space to cache rdd_1915_1 in memory! (computed 49.0 MiB so far)
22/10/04 22:02:14 WARN MemoryStore: Not enough space to cache rdd_1915_4 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:14 WARN MemoryStore: Not enough space to cache rdd_1915_0 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:14 WARN MemoryStore: Not enough space to cache rdd_1915_6 in memory! (computed 112.5 MiB so far)




22/10/04 22:02:18 WARN MemoryStore: Not enough space to cache rdd_1915_11 in memory! (computed 168.8 MiB so far)
22/10/04 22:02:19 WARN MemoryStore: Not enough space to cache rdd_1915_12 in memory! (computed 255.3 MiB so far)


                                                                                

22/10/04 22:02:22 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_6 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_5 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_4 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_1 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_2 in memory! (computed 74.8 MiB so far)
22/10/04 22:02:22 WARN MemoryStore: Not enough space to cache rdd_1915_0 in memory! (computed 74.8 MiB so far)




22/10/04 22:02:27 WARN MemoryStore: Not enough space to cache rdd_1915_12 in memory! (computed 168.8 MiB so far)
22/10/04 22:02:27 WARN MemoryStore: Not enough space to cache rdd_1915_11 in memory! (computed 255.3 MiB so far)


                                                                                

In [32]:
# Evaluate the validation set 

predictions_validation.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error

evaluator_train_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_train = evaluator_train_rmse.evaluate(predictions_validation)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse_train)

evaluator_train_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae_train = evaluator_train_mae.evaluate(predictions_validation)
print("Root Mean Squared Error (MAE) on train data = %g" % mae_train)

[Stage 933:>                                                        (0 + 8) / 9]

22/10/04 22:03:05 WARN DAGScheduler: Broadcasting large task binary with size 1324.2 KiB


                                                                                

+-------------------+-------------------+--------------------+
|         prediction|              label|            features|
+-------------------+-------------------+--------------------+
|-238.70533997597204|-110.05960880156465|(2835,[0,1,222,17...|
|-238.70533997597204| -27.13087366041648|(2835,[0,1,222,17...|
|-238.70533997597204| -257.3289665843789|(2835,[0,1,222,17...|
|-238.70533997597204|-122.97252883408981|(2835,[0,1,222,17...|
|-201.78667914470756|-417.99304299678636|(2835,[0,1,490,21...|
+-------------------+-------------------+--------------------+
only showing top 5 rows





22/10/04 22:03:41 WARN DAGScheduler: Broadcasting large task binary with size 1321.7 KiB


                                                                                

22/10/04 22:03:48 WARN DAGScheduler: Broadcasting large task binary with size 1322.8 KiB
Root Mean Squared Error (RMSE) on train data = 221.691




22/10/04 22:04:23 WARN DAGScheduler: Broadcasting large task binary with size 1321.7 KiB




22/10/04 22:04:30 WARN DAGScheduler: Broadcasting large task binary with size 1322.8 KiB
Root Mean Squared Error (MAE) on train data = 119.067


                                                                                