In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, count, col, when, isnull
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.regression import RandomForestRegressor,GBTRegressor,DecisionTreeRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
df = spark.read.csv("/FileStore/tables/coffee_Full.csv",
                    inferSchema=True,
                    header=True)
# Convert numerical columns to appropriate data types
def convert_column(df, column, target_type):
    return df.withColumn(column, df[column].cast(target_type))

df = convert_column(df, 'age', IntegerType())
df = convert_column(df, 'num_items', IntegerType())
df = convert_column(df, 'wait_time', DoubleType())
df = convert_column(df, 'purchase_amount', DoubleType())
df = convert_column(df, 'transaction_time', IntegerType())

df.show(3)

+---+--------------+---+----------+------+--------------+-------------+---------+---------------+----------------+---------------+--------------+----------------+-----------+
|_c0|transaction_id|age|    income|   sex|rewards_member|   occupation|num_items|purchase_method|       wait_time|purchase_amount|store_location|transaction_time|day_of_week|
+---+--------------+---+----------+------+--------------+-------------+---------+---------------+----------------+---------------+--------------+----------------+-----------+
|  1|             1| 48| $50K-$75K|Female|         false|      Retired|        3|    Credit Card|4.37041779796359|          15.19|        Uptown|              22|   Saturday|
|  2|             2| 32| $25K-$50K|Female|         false|      Retired|        3|           Cash|2.30796121678142|          12.41|       Midtown|              20|     Friday|
|  3|             3| 68|$75K-$100K|Female|         false|Self Employed|        4| Mobile Payment|3.89939822873409|          1

In [0]:
#drop the first two columns
df = df.drop("_c0", "transaction_id") 

In [0]:
display(df)

age,income,sex,rewards_member,occupation,num_items,purchase_method,wait_time,purchase_amount,store_location,transaction_time,day_of_week
48,$50K-$75K,Female,False,Retired,3,Credit Card,4.37041779796359,15.19,Uptown,22,Saturday
32,$25K-$50K,Female,False,Retired,3,Cash,2.30796121678142,12.41,Midtown,20,Friday
68,$75K-$100K,Female,False,Self Employed,4,Mobile Payment,3.89939822873409,18.16,Midtown,12,Friday
31,$25K-$50K,Female,False,Employed,4,Mobile Payment,1.87190761759261,15.41,Downtown,14,Sunday
20,Under $25K,Female,False,Student,1,Credit Card,2.66648236799237,6.23,Uptown,18,Wednesday
59,$75K-$100K,Female,True,Employed,5,Credit Card,2.50637046034456,21.72,Midtown,13,Monday
67,$75K-$100K,Male,True,Self Employed,3,Credit Card,0.966888079743862,13.08,Uptown,12,Saturday
71,Over $100K,Female,False,Retired,3,Credit Card,3.42676111896725,14.96,Downtown,13,Saturday
60,$75K-$100K,Male,True,Retired,6,Credit Card,2.94511204259134,19.27,Downtown,11,Wednesday
54,$75K-$100K,Male,True,Employed,3,Credit Card,0.853794802025569,13.38,Midtown,16,Friday


In [0]:
# find the amount of nan values
# Count null values in each column
display(df.select([count(when(isnull(c),c)).alias(c) for c in df.columns]))

age,income,sex,rewards_member,occupation,num_items,purchase_method,wait_time,purchase_amount,store_location,transaction_time,day_of_week
0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Drop rows containing any null values
df = df.na.drop()

#Working with Categorical Columns

Convert "income", "sex", "occupation", "purchase_method", "store_location", "day_of_week" to dummy variables

In [0]:
# Columns to be indexed and one-hot encoded
categorical_columns = ["income", "sex", "occupation", "purchase_method", "store_location", "day_of_week"]

income_indexer = StringIndexer(inputCol='income', outputCol="income_index")
sex_indexer = StringIndexer(inputCol='sex', outputCol="sex_index")
occupation_indexer = StringIndexer(inputCol='occupation', outputCol="occupation_index")
purchase_method_indexer = StringIndexer(inputCol='purchase_method', outputCol="purchase_method_index")
store_location_indexer = StringIndexer(inputCol='store_location', outputCol="store_location_index")
day_of_week_indexer = StringIndexer(inputCol='day_of_week', outputCol="day_of_week_index")

sex_encoder = OneHotEncoder(inputCol='sex_index', outputCol='sex_vec')
occupation_encoder = OneHotEncoder(inputCol='occupation_index', outputCol='occupation_vec')
purchase_method_encoder = OneHotEncoder(inputCol='purchase_method_index', outputCol='purchase_method_vec')
store_location_encoder = OneHotEncoder(inputCol='store_location_index', outputCol='store_location_vec')

In [0]:
assembler = VectorAssembler(inputCols= ['age','rewards_member','wait_time','transaction_time','num_items','income_index','sex_vec','occupation_vec','purchase_method_vec','store_location_vec','day_of_week_index'], outputCol="features")


#Pipelines

In [0]:
lr = LinearRegression(featuresCol="features", labelCol="purchase_amount")

In [0]:
pipeline = Pipeline(stages = [income_indexer,sex_indexer,occupation_indexer,purchase_method_indexer,store_location_indexer,day_of_week_indexer,sex_encoder,occupation_encoder,purchase_method_encoder,store_location_encoder,assembler,lr])

In [0]:
#Split the data into 70% train and 30% test set with Yearly Amount Spent as the dependent variable.
train_data, test_data = df.randomSplit([0.7,0.3])

#Linear Regression

In [0]:
lrModel = pipeline.fit(train_data)

In [0]:
prediction = lrModel.transform(test_data)

In [0]:
display(prediction)

age,income,sex,rewards_member,occupation,num_items,purchase_method,wait_time,purchase_amount,store_location,transaction_time,day_of_week,income_index,sex_index,occupation_index,purchase_method_index,store_location_index,day_of_week_index,sex_vec,occupation_vec,purchase_method_vec,store_location_vec,features,prediction
18,Under $25K,Female,False,Employed,1,Cash,3.24740973257318,4.29,Uptown,7,Thursday,4.0,0.0,0.0,2.0,2.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 15, indices -> List(0, 2, 3, 4, 5, 6, 7, 14), values -> List(18.0, 3.24740973257318, 7.0, 1.0, 4.0, 1.0, 1.0, 4.0))",11.935680600320454
18,Under $25K,Female,False,Employed,1,Cash,4.29600362593091,9.01,Downtown,20,Saturday,4.0,0.0,0.0,2.0,0.0,5.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.29600362593091, 20.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0))",11.941649149424087
18,Under $25K,Female,False,Employed,1,Cash,4.40110086877122,8.53,Downtown,15,Tuesday,4.0,0.0,0.0,2.0,0.0,2.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.40110086877122, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0))",11.939970010287174
18,Under $25K,Female,False,Employed,1,Cash,4.49414304964018,3.65,Midtown,23,Wednesday,4.0,0.0,0.0,2.0,1.0,3.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.49414304964018, 23.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0))",11.95828285048382
18,Under $25K,Female,False,Employed,1,Cash,4.79498195976688,8.57,Midtown,21,Tuesday,4.0,0.0,0.0,2.0,1.0,2.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.79498195976688, 21.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0))",11.95736334328901
18,Under $25K,Female,False,Employed,1,Cash,4.80476233433403,6.34,Midtown,10,Thursday,4.0,0.0,0.0,2.0,1.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.80476233433403, 10.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 4.0))",11.942172045483352
18,Under $25K,Female,False,Employed,1,Cash,6.55763560259464,8.24,Midtown,15,Thursday,4.0,0.0,0.0,2.0,1.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 6.55763560259464, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 4.0))",11.947993557642398
18,Under $25K,Female,False,Employed,1,Cash,7.80851897757534,6.72,Uptown,18,Saturday,4.0,0.0,0.0,2.0,2.0,5.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 15, indices -> List(0, 2, 3, 4, 5, 6, 7, 14), values -> List(18.0, 7.80851897757534, 18.0, 1.0, 4.0, 1.0, 1.0, 5.0))",11.947205083891038
18,Under $25K,Female,False,Employed,1,Credit Card,0.549283054336255,5.14,Uptown,15,Friday,4.0,0.0,0.0,0.0,2.0,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 0.549283054336255, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0))",11.929453512449983
18,Under $25K,Female,False,Employed,1,Credit Card,0.764162130484375,7.28,Downtown,21,Thursday,4.0,0.0,0.0,0.0,0.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 0.764162130484375, 21.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 4.0))",11.924721846516942


In [0]:
lrModelComponent = lrModel.stages[-1]
coefficients = lrModelComponent.coefficients
print(coefficients)

[-0.10464867338261635,0.34756894457626175,7.397248927368423e-05,0.0011383695520022301,3.8521063115668324,0.4242865191836574,0.0022915392129945903,-1.1830556452482779,-1.1464351351879871,-1.1436151141362243,-0.01913939145815297,-0.007104735372996799,-0.007572844066780754,0.0029611352582632907,-0.0013349781061429645]


In [0]:
test_pipeline = Pipeline(stages=[
    income_indexer,
    sex_indexer,
    occupation_indexer,
    purchase_method_indexer,
    store_location_indexer,
    day_of_week_indexer,
    sex_encoder,
    occupation_encoder,
    purchase_method_encoder,
    store_location_encoder,
    assembler
])

# Transform the test data using the new pipeline
test_data1 = test_pipeline.fit(test_data).transform(test_data)

lrtestResults = lrModelComponent.evaluate(test_data1)
print(lrtestResults.rootMeanSquaredError, lrtestResults.meanSquaredError,lrtestResults.r2)

4.111674694607407 16.905868794274912 0.7670466837961479


In [0]:
lrModelComponent.summary.pValues

Out[78]: [0.0,
 0.0,
 0.9784256471360022,
 0.11961828696121213,
 0.0,
 0.0,
 0.7629023185704107,
 0.0,
 0.0,
 0.0,
 0.06994153032530837,
 0.5768505959462211,
 0.3959916013913709,
 0.7558731310877236,
 0.5150297705935261,
 0.0]

#Hyperparameter Tuning for linear regression

In [0]:
sampled_data = df.sample(fraction=0.1, seed=42) 
train_sampled_data, test_sampled_data = sampled_data.randomSplit([0.7,0.3])

In [0]:
test_pipeline = Pipeline(stages=[
    income_indexer,
    sex_indexer,
    occupation_indexer,
    purchase_method_indexer,
    store_location_indexer,
    day_of_week_indexer,
    sex_encoder,
    occupation_encoder,
    purchase_method_encoder,
    store_location_encoder,
    assembler
])

# Transform the test data using the new pipeline
test_sampled_data1 = test_pipeline.fit(test_sampled_data).transform(test_sampled_data)
train_sampled_data1 = test_pipeline.fit(train_sampled_data).transform(train_sampled_data)

In [0]:
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1,0.01]).\
                               addGrid(lr.elasticNetParam,[0,0.5,1]).build()

In [0]:
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(labelCol='purchase_amount',predictionCol='prediction',metricName='rmse'),numFolds=10)

In [0]:
cv_model = crossval.fit(train_sampled_data1)

In [0]:
cv_prediction = cv_model.transform(test_sampled_data1)
evaluator.evaluate(cv_prediction)

Out[119]: 4.0934105566653605

In [0]:
print(cv_model.bestModel._java_obj.getRegParam())
print(cv_model.bestModel._java_obj.getElasticNetParam())

0.01
0.5


#Tree-based Models

In [0]:
dtr = DecisionTreeRegressor(labelCol='purchase_amount',featuresCol='features')
rdr = RandomForestRegressor(labelCol='purchase_amount',featuresCol='features')
gbr = GBTRegressor(labelCol='purchase_amount',featuresCol='features')

In [0]:
dtr_pipeline = Pipeline(stages = [income_indexer,sex_indexer,occupation_indexer,purchase_method_indexer,store_location_indexer,day_of_week_indexer,sex_encoder,occupation_encoder,purchase_method_encoder,store_location_encoder,assembler,dtr])

rdr_pipeline = Pipeline(stages = [income_indexer,sex_indexer,occupation_indexer,purchase_method_indexer,store_location_indexer,day_of_week_indexer,sex_encoder,occupation_encoder,purchase_method_encoder,store_location_encoder,assembler,rdr])

gbr_pipeline = Pipeline(stages = [income_indexer,sex_indexer,occupation_indexer,purchase_method_indexer,store_location_indexer,day_of_week_indexer,sex_encoder,occupation_encoder,purchase_method_encoder,store_location_encoder,assembler,gbr])



In [0]:
dtrModel = dtr_pipeline.fit(train_data)
rdrModel = rdr_pipeline.fit(train_data)
gbrModel = gbr_pipeline.fit(train_data)

In [0]:
dtPrediction = dtrModel.transform(test_data)
rfPrediction = rdrModel.transform(test_data)
gbPrediction = gbrModel.transform(test_data)

In [0]:
display(dtPrediction)

age,income,sex,rewards_member,occupation,num_items,purchase_method,wait_time,purchase_amount,store_location,transaction_time,day_of_week,income_index,sex_index,occupation_index,purchase_method_index,store_location_index,day_of_week_index,sex_vec,occupation_vec,purchase_method_vec,store_location_vec,features,prediction
18,Under $25K,Female,False,Employed,1,Cash,3.24740973257318,4.29,Uptown,7,Thursday,4.0,0.0,0.0,2.0,2.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 15, indices -> List(0, 2, 3, 4, 5, 6, 7, 14), values -> List(18.0, 3.24740973257318, 7.0, 1.0, 4.0, 1.0, 1.0, 4.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,4.29600362593091,9.01,Downtown,20,Saturday,4.0,0.0,0.0,2.0,0.0,5.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.29600362593091, 20.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,4.40110086877122,8.53,Downtown,15,Tuesday,4.0,0.0,0.0,2.0,0.0,2.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.40110086877122, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,4.49414304964018,3.65,Midtown,23,Wednesday,4.0,0.0,0.0,2.0,1.0,3.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.49414304964018, 23.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,4.79498195976688,8.57,Midtown,21,Tuesday,4.0,0.0,0.0,2.0,1.0,2.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.79498195976688, 21.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,4.80476233433403,6.34,Midtown,10,Thursday,4.0,0.0,0.0,2.0,1.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 4.80476233433403, 10.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 4.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,6.55763560259464,8.24,Midtown,15,Thursday,4.0,0.0,0.0,2.0,1.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 6.55763560259464, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 4.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Cash,7.80851897757534,6.72,Uptown,18,Saturday,4.0,0.0,0.0,2.0,2.0,5.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 15, indices -> List(0, 2, 3, 4, 5, 6, 7, 14), values -> List(18.0, 7.80851897757534, 18.0, 1.0, 4.0, 1.0, 1.0, 5.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Credit Card,0.549283054336255,5.14,Uptown,15,Friday,4.0,0.0,0.0,0.0,2.0,1.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 0.549283054336255, 15.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0))",7.99974021515221
18,Under $25K,Female,False,Employed,1,Credit Card,0.764162130484375,7.28,Downtown,21,Thursday,4.0,0.0,0.0,0.0,0.0,4.0,"Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 15, values -> List(18.0, 0.0, 0.764162130484375, 21.0, 1.0, 4.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 4.0))",7.99974021515221


In [0]:
display(rfPrediction)



In [0]:
display(gbPrediction)



In [0]:
dtrModelComponent = dtrModel.stages[-1] # Extract decision tree model from the pipeline
importances = dtrModelComponent.featureImportances
print(importances)

(15,[0,2,3,4,5,6,14],[0.13027861841898195,4.825706908597925e-07,2.583699540707302e-06,0.8453881021711181,0.024328458391477598,5.550542883579868e-07,1.1996939024409573e-06])


In [0]:
rdrModelComponent = rdrModel.stages[-1]
importances = rdrModelComponent.featureImportances
print(importances)

(15,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],[0.12066234027994735,0.1795685202638848,0.051173299972855454,0.00011383670335695518,0.6193781347053109,0.02750087675422161,1.0467999968645888e-06,1.9315020098357e-05,0.0008735200192331452,4.375387112245607e-05,4.35545843126981e-06,0.0006337180246874264,1.5245161979396568e-06,1.3530603106199467e-05,1.222700754936505e-05])


In [0]:
gbrModelComponent = gbrModel.stages[-1]
importances = gbrModelComponent.featureImportances
print(importances)

(15,[0,2,3,4,5,6,7,8,9,10,11,12,13,14],[0.15465190372693274,0.0001130302867881418,0.0001255405482027831,0.8157036960369171,0.029043971243292496,8.07860142732491e-06,1.742913256965089e-05,7.3418562950598525e-06,3.070363272215832e-05,1.4382571536139454e-05,4.286495618153089e-05,2.1994473195807536e-05,2.1875714368473345e-05,0.00019718721957045972])


In [0]:
evaluator = RegressionEvaluator(labelCol='purchase_amount',predictionCol='prediction',metricName='rmse')

In [0]:
print(evaluator.evaluate(dtPrediction))

3.323597562740899


In [0]:
print(evaluator.evaluate(rfPrediction))

3.9856135369533434


In [0]:
print(evaluator.evaluate(gbPrediction))

3.1273353751495803


#Hyperparameter Tuning for Random Forest

In [0]:
paramGrid = ParamGridBuilder().addGrid(rdr.maxDepth,[2,4,6]).\
            addGrid(rdr.maxBins,[20,60]).build()

In [0]:
rdr_crossval = CrossValidator(estimator=rdr, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(labelCol='purchase_amount',predictionCol='prediction',metricName='rmse'), numFolds=10)

In [0]:
rdr_cv_model = rdr_crossval.fit(train_sampled_data1)

In [0]:
rdr_cv_prediction = rdr_cv_model.transform(test_sampled_data1)
evaluator.evaluate(rdr_cv_prediction)

Out[128]: 3.78129185966335

In [0]:
print(rdr_cv_model.bestModel._java_obj.getMaxDepth())
print(rdr_cv_model.bestModel._java_obj.getMaxBins())

6
60


In [0]:
print(rdr_cv_model.bestModel)

RandomForestRegressionModel: uid=RandomForestRegressor_9d0b44b5b35d, numTrees=20, numFeatures=15
