In [21]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col,isnan, when, count
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import date_format
from pyspark.sql.functions import year, month
import pandas as pd
import lbl2vec

In [22]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

22/10/05 15:01:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [23]:
# Read in data from ETL.py file
%run '../scripts/ETL.py' '../scripts/paths.json'

22/10/05 15:01:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694


In [25]:
tagged_merchants = pd.read_csv("../data/curated/tagged_merchants.csv")
tagged_merchants = tagged_merchants.iloc[:,1:]
tagged_merchants.drop(['tags', 'name', 'cleaned_tags', 'store_type'], axis=1, inplace=True)
tagged_merchants.to_parquet("../data/curated/tagged_merchants.parquet")
tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [26]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [28]:
final_join3.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
INNER JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")

joint = joint.drop('tagged_merchant_abn')

In [30]:
joint.createOrReplaceTempView("group")

main_data = spark.sql(""" 

SELECT *, ((take_rate/100)*dollar_value) AS percent
FROM group
""")

In [31]:
# Extracting the year, month, day from the timestamp


main_data = main_data.withColumn('Year', year(main_data.order_datetime))
main_data = main_data.withColumn('Month',month(main_data.order_datetime))


In [33]:
main_data = main_data.drop('merchant_abn', 'categories','name', 'address', 'trans_merchant_abn', 'order_id','order_datetime','user_id',
'consumer_id','int_sa2','SA2_name','state_code','state_name','population_2020', 'population_2021')

In [34]:
 # Find Count of Null, None, NaN of All DataFrame Columns
main_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in main_data.columns])



+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+-------+----+-----+
|merchant_name|take_rate|revenue_levels|state|gender|dollar_value|postcodes|SA2_code|income_2018-2019|total_males|total_females|total_persons|category|percent|Year|Month|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+-------+----+-----+
|            0|        0|             0|    0|     0|           0|        0|       0|               0|          0|            0|            0|       0|      0|   0|    0|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+-------+----+-----+



                                                                                

In [36]:
main_data.createOrReplaceTempView("agg")

male = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS m_name, COUNT(gender) as males
FROM agg
WHERE gender = 'Male'
GROUP BY merchant_name, SA2_code, Year, Month
""")


female = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS f_name, COUNT(gender) as females
FROM agg
WHERE gender = 'Female'
GROUP BY merchant_name, SA2_code, Year, Month
""")


                                                                                

+--------------------+-----+
|              m_name|males|
+--------------------+-----+
|Semper Tellus PC1...|    2|
|Est Nunc Consulti...|   11|
|Ipsum Primis In I...|    2|
|Euismod In LLC601...|    1|
|Leo In Consulting...|    2|
+--------------------+-----+
only showing top 5 rows





+--------------------+-------+
|              f_name|females|
+--------------------+-------+
|Quis Tristique Lt...|      1|
|Nunc In Industrie...|      2|
|Nunc Sit LLC10902...|      2|
|Leo In Consulting...|      5|
|Risus Donec Assoc...|      1|
+--------------------+-------+
only showing top 5 rows



                                                                                

In [38]:
main_data.createOrReplaceTempView("agg")

main_agg_data = spark.sql(""" 

SELECT merchant_name, COUNT(merchant_name) AS no_of_transactions, SA2_code, Year, Month, SUM(dollar_value - percent) AS total_earnings,
    CONCAT(merchant_name, SA2_code, Year, Month) AS join_col
FROM agg
GROUP BY merchant_name, SA2_code, Year, Month
""")





+--------------------+------------------+---------+----+-----+------------------+--------------------+
|       merchant_name|no_of_transactions| SA2_code|Year|Month|    total_earnings|            join_col|
+--------------------+------------------+---------+----+-----+------------------+--------------------+
|  Pede Nonummy Corp.|                 6|205021084|2021|    8|163.02718506270642|Pede Nonummy Corp...|
|    Semper Tellus PC|                 6|407031164|2021|    8| 78.98297577595977|Semper Tellus PC4...|
| Dui Nec Corporation|                 2|801041036|2021|    8|14.801019173760814|Dui Nec Corporati...|
|Ac Turpis Egestas PC|                 1|405041127|2021|    7|154.36353234367544|Ac Turpis Egestas...|
|Lobortis Tellus C...|                 3|205021082|2021|    7|108.48901763585754|Lobortis Tellus C...|
|   Est Ac Mattis Ltd|                 1|209011204|2021|    7|          149.2599|Est Ac Mattis Ltd...|
| Egestas Blandit Ltd|                 2|601051031|2021|    8|319.9547692

                                                                                

In [39]:
main_agg_data.createOrReplaceTempView("gender_join")
male.createOrReplaceTempView("male_agg")
female.createOrReplaceTempView("female_agg")

temp = spark.sql(""" 

SELECT *
FROM gender_join
INNER JOIN male_agg
ON gender_join.join_col = male_agg.m_name
""")

temp.createOrReplaceTempView("temp")

gender_agg = spark.sql(""" 

SELECT *
FROM temp
INNER JOIN female_agg
ON temp2.join_col = female_agg.f_name
""")


                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,total_earnings,join_col,m_name,males,f_name,females
A Aliquet Ltd,2,312021351,2021,6,298.8537411028336,A Aliquet Ltd3120...,A Aliquet Ltd3120...,1,A Aliquet Ltd3120...,1
A Aliquet Ltd,2,401021010,2021,4,670.4857203238805,A Aliquet Ltd4010...,A Aliquet Ltd4010...,1,A Aliquet Ltd4010...,1
A Aliquet Ltd,2,603011065,2021,12,346.5799250465364,A Aliquet Ltd6030...,A Aliquet Ltd6030...,1,A Aliquet Ltd6030...,1
A Arcu Industries,2,124011453,2021,8,203.5414742977921,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Arcu Industries,2,211051282,2022,3,655.1195924003883,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1


In [40]:
main_data = main_data.withColumnRenamed('income_2018-2019',

    'income_2018_2019'    
)

main_data = main_data.withColumn('income_per_persons',
    (F.col('income_2018_2019')/F.col('total_persons'))
)


In [42]:
main_data.createOrReplaceTempView("features")

other_agg = spark.sql(""" 

SELECT merchant_name AS drop_name, FIRST(take_rate) AS take_rate, FIRST(revenue_levels) AS revenue_levels, FIRST(category) AS category,
    FIRST(total_males) AS males_in_SA2, FIRST(total_females) AS females_in_SA2, FIRST(income_per_persons) AS income_per_person
FROM features
GROUP BY merchant_name
""")




+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|      drop_name|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|   A Associates|     4.95|             b|Books, Stationary...|        9762|         10846|22526.523772559674|
|A Felis Company|     4.32|             b|Books, Stationary...|        1080|          1051| 33927.61168708765|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
only showing top 2 rows



                                                                                

In [43]:
gender_agg.createOrReplaceTempView("edit")
other_agg.createOrReplaceTempView("rates")

other_cols = spark.sql(""" 

SELECT *
FROM edit
INNER JOIN rates
ON edit.merchant_name = rates.drop_name
""")

train = other_cols.drop('m_name', 'f_name', 'drop_name','join_col')

train.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,total_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person
A Aliquet Ltd,2,312021351,2021,6,298.8537411028336,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Aliquet Ltd,2,401021010,2021,4,670.4857203238805,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Aliquet Ltd,2,603011065,2021,12,346.5799250465364,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Arcu Industries,2,124011453,2021,8,203.5414742977921,1,1,3.0,c,Furniture,4821,4683,25816.03452631579
A Arcu Industries,2,211051282,2022,3,655.1195924003883,1,1,3.0,c,Furniture,4821,4683,25816.03452631579


In [45]:
train_projection = train.select("merchant_name", "SA2_code", "Year", "Month", 'total_earnings')

                                                                                

merchant_name,SA2_code,Year,Month,total_earnings
A Aliquet Ltd,401021010,2021,4,670.4857203238805
A Aliquet Ltd,603011065,2021,12,346.5799250465364
A Arcu Industries,124011453,2021,8,203.5414742977921
A Arcu Industries,211051282,2022,3,655.1195924003883
A Arcu Industries,214021379,2021,7,451.0067711100705


In [47]:
train_projection = train_projection.withColumn("prev_year", \
              when(train_projection["Month"] == 1, train_projection['Year'] - 1).otherwise(train_projection['Year']))
train_projection = train_projection.withColumn("prev_month", \
              when(train_projection["Month"] == 1, 12).otherwise(train_projection['Month'] - 1))
train_projection = train_projection.drop("Year", "Month")
train_projection = train_projection.withColumnRenamed("total_earnings", "future_earnings") \
                            .withColumnRenamed("merchant_name", "p_merchant_name") \
                            .withColumnRenamed("SA2_code", "p_SA2_code")

                                                                                

p_merchant_name,p_SA2_code,future_earnings,prev_year,prev_month
A Aliquet Ltd,401021010,670.4857203238805,2021,3
A Aliquet Ltd,603011065,346.5799250465364,2021,11
A Arcu Industries,124011453,203.5414742977921,2021,7
A Arcu Industries,211051282,655.1195924003883,2022,2
A Arcu Industries,214021379,451.0067711100705,2021,6


In [49]:
final_data= train.join(train_projection, (train.merchant_name == train_projection.p_merchant_name) & 
                           (train.SA2_code == train_projection.p_SA2_code) & 
                           (train.Year == train_projection.prev_year) & 
                           (train.Month == train_projection.prev_month), how = 'inner')

final_data = final_data.drop("p_merchant_name", "p_SA2_code","prev_year", "prev_month")

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,total_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person,future_earnings
A Auctor Non Corp...,3,202031033,2021,11,209.3276485401221,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,184.5112457341748
A Auctor Non Corp...,3,205021082,2022,7,262.10605847773724,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,240.9525111578749
A Auctor Non Corp...,2,205031087,2021,9,218.6890344781834,1,1,5.58,a,Furniture,2067,2014,22634.72370679088,265.85931156357447
A Auctor Non Corp...,3,210021235,2022,9,152.99495946306212,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,182.4378269349945
A Auctor Non Corp...,4,211051282,2021,10,298.778609499402,2,2,5.58,a,Furniture,2067,2014,22634.72370679088,421.7745544721037


In [52]:
field_str = ['Year', 'Month', 'SA2_code']

for cols in field_str:
    final_data = final_data.withColumn(cols,

    F.col(cols).cast('STRING')

)


field_int = ['no_of_transactions', 'males', 'females', 'males_in_SA2', 'females_in_SA2']

for col in field_int:
    final_data = final_data.withColumn(col,

    F.col(col).cast('INT')

)

In [76]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(final_data).transform(final_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2', 'income_per_person', 'no_of_transactions','take_rate', 'total_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

                                                                                

In [56]:
# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_earnings",
    "label"
)

In [57]:
# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [58]:
# Split the data into training and validation sets (30% held out for testing)

trainingData, testData = outdata1.randomSplit([0.7, 0.3], seed = 20)

In [60]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")


# Train model.  
model = rf.fit(trainingData)

# Make predictions.
predictions_validation = model.transform(testData)



22/10/05 15:17:20 WARN DAGScheduler: Broadcasting large task binary with size 1337.1 KiB


                                                                                

22/10/05 15:17:21 WARN DAGScheduler: Broadcasting large task binary with size 1337.2 KiB


                                                                                

22/10/05 15:17:23 WARN DAGScheduler: Broadcasting large task binary with size 1341.1 KiB


                                                                                

22/10/05 15:17:25 WARN DAGScheduler: Broadcasting large task binary with size 1453.8 KiB


                                                                                

22/10/05 15:17:31 WARN DAGScheduler: Broadcasting large task binary with size 1515.3 KiB


                                                                                

22/10/05 15:17:35 WARN DAGScheduler: Broadcasting large task binary with size 1636.4 KiB


                                                                                

22/10/05 15:17:39 WARN DAGScheduler: Broadcasting large task binary with size 1875.8 KiB


                                                                                

22/10/05 15:17:42 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

In [61]:
# Evaluate the validation set 

predictions_validation.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error

evaluator_train_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_train = evaluator_train_rmse.evaluate(predictions_validation)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse_train)

evaluator_train_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae_train = evaluator_train_mae.evaluate(predictions_validation)
print("Mean Absolutee Error (MAE) on train data = %g" % mae_train)



22/10/05 15:18:47 WARN DAGScheduler: Broadcasting large task binary with size 1349.8 KiB


                                                                                

+------------------+------------------+--------------------+
|        prediction|             label|            features|
+------------------+------------------+--------------------+
| 324.3243007999714| 175.5058926208255|(2085,[209,1093,2...|
|344.30531301009773| 184.5112457341748|(2085,[209,988,20...|
|344.30531301009773| 273.6255392020826|(2085,[209,976,20...|
| 359.5332149508459| 421.7745544721037|(2085,[209,989,20...|
| 337.9773343595367|152.81812360693857|(2085,[209,1039,2...|
+------------------+------------------+--------------------+
only showing top 5 rows





22/10/05 15:19:48 WARN DAGScheduler: Broadcasting large task binary with size 1342.8 KiB


                                                                                

22/10/05 15:19:50 WARN DAGScheduler: Broadcasting large task binary with size 1343.9 KiB
Root Mean Squared Error (RMSE) on train data = 338.735




22/10/05 15:20:54 WARN DAGScheduler: Broadcasting large task binary with size 1342.8 KiB




22/10/05 15:20:56 WARN DAGScheduler: Broadcasting large task binary with size 1343.9 KiB
Mean Absolutee Error (MAE) on train data = 207.436


                                                                                

In [90]:
def ExtractFeatureImportance(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))
  
  
#ExtractFeatureImportance(model.stages[-1].featureImportances, dataset, "features")
dataset_fi = ExtractFeatureImportance(model.featureImportances, predictions_validation, "features")
dataset_fi = spark.createDataFrame(dataset_fi)
display(dataset_fi)

idx,name,score
2084,total_earnings,0.6695771071848149
2082,no_of_transactions,0.13398775635621468
2074,category_vec_Book...,0.04283619614809752
2078,category_vec_Beau...,0.04118762477351097
88,merchant_name_vec...,0.011191574656608779
2076,category_vec_Furn...,0.009312834402747409
2079,males_in_SA2,0.008046174475835548
69,merchant_name_vec...,0.006483644941404573
2083,take_rate,0.006344004036932749
975,SA2_code_vec_4060...,0.006041124042697079


## Future predictions

In [62]:
latest_year = train.select(max('Year')).collect()[0][0]
agg_month_1 = train.filter(train.Year == latest_year)
latest_month = agg_month_1.select(max('Month')).collect()[0][0]
predicting_data = agg_month_1.filter(train.Month == latest_month)
predicting_data = predicting_data.withColumn("future_earnings", lit(0))

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,total_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person,future_earnings
A Auctor Non Corp...,3,125031480,2022,10,205.55614462620872,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,3,126011496,2022,10,205.52264839287457,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,3,211051282,2022,10,200.77245536872837,2,1,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,2,509021239,2022,10,106.3998297018926,1,1,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,6,509021240,2022,10,362.197810772229,3,2,5.58,a,Furniture,2067,2014,22634.72370679088,0


In [63]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(predicting_data).transform(predicting_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2', 'income_per_person', 'no_of_transactions','take_rate', 'total_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_earnings",
    "label"
)


# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [64]:
predictions_test = model.transform(outdata1)

In [66]:
predictions_test.createOrReplaceTempView("preds")

pred = spark.sql(""" 

SELECT merchant_name, SUM(prediction) AS total_revenue
FROM preds
GROUP BY merchant_name

""")

pred.limit(5)

                                                                                

merchant_name,total_revenue
Dictum Mi Incorpo...,611.1663556783041
Dictum Mi Limited,16501.491603314207
Donec Luctus Indu...,4278.164489748128
Elit Sed Consequa...,14056.826180600992
Hendrerit Consect...,2750.2486005523683


In [68]:
pred_df = pred.toPandas()

                                                                                

In [69]:
pred_df.to_csv("../data/curated/revenue.csv")