In [0]:
from pyspark.sql import SparkSession

In [0]:
# Reuse the active Spark session if there is one; otherwise create one
# registered under the application name 'IMMLDDT'.
session_builder = SparkSession.builder.appName('IMMLDDT')
spark = session_builder.getOrCreate()

In [0]:
# Location and format of the raw car-prices dataset.
file_location = "/FileStore/tables/car_prices.csv"
file_type = "csv"

# CSV reader options: infer column types from the data, treat the first
# row as the header, and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Build the reader as one parenthesised chain and load the file.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Render the loaded DataFrame in the notebook.
display(df)

year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)
2015,Nissan,Altima,2.5 S,Sedan,automatic,1n4al3ap1fn326013,ca,1.0,5554.0,gray,black,enterprise vehicle exchange / tra / rental / tulsa,15350,10900,Tue Dec 30 2014 12:00:00 GMT-0800 (PST)
2014,BMW,M5,Base,Sedan,automatic,wbsfv9c51ed593089,ca,3.4,14943.0,black,black,the hertz corporation,69000,65000,Wed Dec 17 2014 12:30:00 GMT-0800 (PST)
2014,Chevrolet,Cruze,1LT,Sedan,automatic,1g1pc5sb2e7128460,ca,2.0,28617.0,black,black,enterprise vehicle exchange / tra / rental / tulsa,11900,9800,Tue Dec 16 2014 13:00:00 GMT-0800 (PST)
2014,Audi,A4,2.0T Premium Plus quattro,Sedan,automatic,wauffafl3en030343,ca,4.2,9557.0,white,black,audi mission viejo,32100,32250,Thu Dec 18 2014 12:00:00 GMT-0800 (PST)
2014,Chevrolet,Camaro,LT,Convertible,automatic,2g1fb3d37e9218789,ca,3.0,4809.0,red,black,d/m auto sales inc,26300,17500,Tue Jan 20 2015 04:00:00 GMT-0800 (PST)


In [0]:
# Print the column names and the types that inferSchema assigned to them,
# so the numeric vs. string columns can be checked before modelling.
df.printSchema()

In [0]:
# Keep only the label ('sellingprice') and the attributes chosen as
# predictors; every other column of the raw frame is left out.
selected_columns = [
    'year', 'make', 'model', 'trim', 'body',
    'transmission', 'condition', 'odometer', 'color',
    'state', 'interior', 'seller', 'sellingprice',
]

data = df.select(selected_columns)

In [0]:
# Drop every row that has a null in any of the selected columns.
data = data.dropna(how="any")

In [0]:
# Create a reproducible 70-30 train/test split.
# randomSplit assigns rows at random; without a fixed seed each run produces
# different partitions, so the fitted tree and the reported metrics would
# change on every "Restart & Run All".
SPLIT_SEED = 42

train_data, test_data = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

### Building the Decision Tree Regression

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

In [0]:
# Use StringIndexer to convert the categorical columns to numeric indices.
# Every indexer follows the same pattern: read '<column>', write
# '<column>_index', and use handleInvalid='keep' so labels that were not seen
# during fitting do not raise an error at transform time.
#
# NOTE(review): 'condition' appears numeric in the displayed sample
# (e.g. 5.0, 4.5); indexing it treats it as categorical and discards its
# ordering. Confirm this is intended before relying on it.

def _build_indexer(column):
    """Return a StringIndexer mapping `column` to `<column>_index`."""
    return StringIndexer(inputCol=column, outputCol=column + '_index', handleInvalid='keep')

make_indexer = _build_indexer('make')
model_indexer = _build_indexer('model')
trim_indexer = _build_indexer('trim')
body_indexer = _build_indexer('body')
transmission_indexer = _build_indexer('transmission')
state_indexer = _build_indexer('state')
color_indexer = _build_indexer('color')
interior_indexer = _build_indexer('interior')
seller_indexer = _build_indexer('seller')
condition_indexer = _build_indexer('condition')


In [0]:
# VectorAssembler packs the numeric columns and the indexed categorical
# columns into a single vector column named "features". The order of this
# list is the order of the entries in the resulting vector.
feature_columns = [
    'year', 'make_index', 'model_index', 'trim_index', 'body_index',
    'transmission_index', 'state_index', 'condition_index', 'odometer',
    'color_index', 'interior_index', 'seller_index',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [0]:
# Configure (not yet train) a decision-tree regressor that predicts
# 'sellingprice' from the assembled 'features' vector.
# maxBins=11000 is presumably sized to exceed the largest categorical
# cardinality (seller_index reaches 10571 in the displayed results) - verify
# against the training data.
dt = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='sellingprice',
    maxBins=11000,
)

In [0]:
# A Pipeline chains the stages so they run in order: the string indexers
# first, then the vector assembler, then the decision-tree regressor.
# Fitting it once on the training data and reusing the fitted pipeline means
# the test data is pre-processed in exactly the same way as the train data.
indexer_stages = [
    make_indexer, model_indexer, trim_indexer, body_indexer,
    transmission_indexer, state_indexer, color_indexer,
    interior_indexer, seller_indexer, condition_indexer,
]

pipe = Pipeline(stages=indexer_stages + [assembler, dt])

In [0]:
# Fit the whole pipeline (indexers, assembler, decision tree) on the
# training split only; the test split is not seen during fitting.

fit_model=pipe.fit(train_data)

In [0]:
# Run the fitted pipeline on the held-out test split and keep the output.
# The resulting DataFrame has the input columns plus the '*_index' columns,
# the assembled 'features' vector, and the model's 'prediction'.

results = fit_model.transform(test_data)
display(results)

year,make,model,trim,body,transmission,condition,odometer,color,state,interior,seller,sellingprice,make_index,model_index,trim_index,body_index,transmission_index,state_index,color_index,interior_index,seller_index,condition_index,features,prediction
1990,Honda,Accord,LX,Sedan,automatic,1.0,183366.0,gold,nv,—,automotive remarketing inc,400,5.0,6.0,2.0,0.0,0.0,12.0,7.0,4.0,98.0,31.0,"Map(vectorType -> dense, length -> 12, values -> List(1990.0, 5.0, 6.0, 2.0, 0.0, 0.0, 12.0, 31.0, 183366.0, 7.0, 4.0, 98.0))",1764.6133908108698
1990,Toyota,Camry,Deluxe,Sedan,automatic,2.0,214723.0,blue,wa,blue,donate for charity,375,3.0,3.0,155.0,0.0,0.0,18.0,4.0,8.0,1358.0,10.0,"Map(vectorType -> dense, length -> 12, values -> List(1990.0, 3.0, 3.0, 155.0, 0.0, 0.0, 18.0, 10.0, 214723.0, 4.0, 8.0, 1358.0))",1764.6133908108698
1991,Honda,Accord,SE,Sedan,automatic,2.0,186903.0,gray,md,blue,purple heart services inc,275,5.0,6.0,1.0,0.0,0.0,13.0,3.0,8.0,81.0,10.0,"Map(vectorType -> dense, length -> 12, values -> List(1991.0, 5.0, 6.0, 1.0, 0.0, 0.0, 13.0, 10.0, 186903.0, 3.0, 8.0, 81.0))",1764.6133908108698
1991,Mazda,MX-5 Miata,Base,Convertible,automatic,2.0,110154.0,white,nv,—,automotive remarketing inc,800,16.0,282.0,0.0,9.0,0.0,12.0,1.0,4.0,98.0,10.0,"Map(vectorType -> dense, length -> 12, values -> List(1991.0, 16.0, 282.0, 0.0, 9.0, 0.0, 12.0, 10.0, 110154.0, 1.0, 4.0, 98.0))",1764.6133908108698
1991,Toyota,Camry,Deluxe,Sedan,automatic,2.0,252591.0,gray,ca,gray,honda of serramonte,150,3.0,3.0,155.0,0.0,0.0,1.0,3.0,1.0,1816.0,10.0,"Map(vectorType -> dense, length -> 12, values -> List(1991.0, 3.0, 3.0, 155.0, 0.0, 0.0, 1.0, 10.0, 252591.0, 3.0, 1.0, 1816.0))",1764.6133908108698
1992,Buick,Park Avenue,Base,Sedan,automatic,1.0,136859.0,blue,ga,blue,rick hendrick chevrolet,700,21.0,369.0,0.0,0.0,0.0,3.0,4.0,8.0,244.0,31.0,"Map(vectorType -> dense, length -> 12, values -> List(1992.0, 21.0, 369.0, 0.0, 0.0, 0.0, 3.0, 31.0, 136859.0, 4.0, 8.0, 244.0))",1764.6133908108698
1992,Buick,Park Avenue,Base,Sedan,automatic,2.0,145277.0,gold,fl,beige,autonation toyota fort myers,400,21.0,369.0,0.0,0.0,0.0,0.0,7.0,2.0,314.0,10.0,"Map(vectorType -> dense, length -> 12, values -> List(1992.0, 21.0, 369.0, 0.0, 0.0, 0.0, 0.0, 10.0, 145277.0, 7.0, 2.0, 314.0))",1764.6133908108698
1992,Cadillac,DeVille,Base,Sedan,automatic,1.0,155093.0,blue,ca,blue,grand auto sales,600,17.0,165.0,0.0,0.0,0.0,1.0,4.0,8.0,10571.0,31.0,"Map(vectorType -> dense, length -> 12, values -> List(1992.0, 17.0, 165.0, 0.0, 0.0, 0.0, 1.0, 31.0, 155093.0, 4.0, 8.0, 10571.0))",1764.6133908108698
1992,Ford,Explorer,XLT,SUV,automatic,1.0,18069.0,gold,fl,beige,coggin nissan at the avenues,500,0.0,13.0,5.0,1.0,0.0,0.0,7.0,2.0,686.0,31.0,"Map(vectorType -> dense, length -> 12, values -> List(1992.0, 0.0, 13.0, 5.0, 1.0, 0.0, 0.0, 31.0, 18069.0, 7.0, 2.0, 686.0))",19156.121784069204
1992,Honda,Accord,LX,Sedan,automatic,1.0,169012.0,white,fl,blue,autonation honda clearwater,700,5.0,6.0,2.0,0.0,0.0,0.0,1.0,8.0,648.0,31.0,"Map(vectorType -> dense, length -> 12, values -> List(1992.0, 5.0, 6.0, 2.0, 0.0, 0.0, 0.0, 31.0, 169012.0, 1.0, 8.0, 648.0))",1764.6133908108698


In [0]:
# Preview the actual selling price next to the assembled feature vector.
results.select('sellingprice', 'features').show()

##### Evaluating the model

In [0]:
# Evaluate the test-set predictions with four metrics: RMSE, MSE, MAE and R^2.
#
# RegressionEvaluator is already imported in the imports cell above, so it is
# not re-imported here. The evaluator is bound to `evaluator` rather than
# `eval` so that Python's builtin eval() is not shadowed for the rest of the
# notebook. Each metric is requested explicitly through a param-map override,
# so no result depends on the evaluator's configured default metric.
evaluator = RegressionEvaluator(labelCol="sellingprice", predictionCol="prediction")

# Root Mean Square Error
rmse = evaluator.evaluate(results, {evaluator.metricName: "rmse"})
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = evaluator.evaluate(results, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(results, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(results, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

In [0]:
#Interpretation

#The r2 of 0.798 indicates that the model explains approximately 80% of the variance in selling price on the test data.