In [None]:
# data set from : https://www.kaggle.com/code/casper6290/car-purchase-prediction/data

In [1]:
# importing spark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark_session = SparkSession.builder.appName("decision tree").getOrCreate()
spark_session

In [5]:
# reading the data
data = spark_session.read.csv("car_data.csv", inferSchema = True, header = True)
print(data.count())
data.show()

1000
+-------+------+---+------------+---------+
|User ID|Gender|Age|AnnualSalary|Purchased|
+-------+------+---+------------+---------+
|    385|  Male| 35|       20000|        0|
|    681|  Male| 40|       43500|        0|
|    353|  Male| 49|       74000|        0|
|    895|  Male| 40|      107500|        1|
|    661|  Male| 25|       79000|        0|
|    846|Female| 47|       33500|        1|
|    219|Female| 46|      132500|        1|
|    588|  Male| 42|       64000|        0|
|     85|Female| 30|       84500|        0|
|    465|  Male| 41|       52000|        0|
|    686|  Male| 42|       80000|        0|
|    408|  Male| 47|       23000|        1|
|    790|Female| 32|       72500|        0|
|    116|Female| 27|       57000|        0|
|    118|Female| 42|      108000|        1|
|     54|Female| 33|      149000|        1|
|     90|  Male| 35|       75000|        0|
|    372|  Male| 35|       53000|        0|
|    926|  Male| 46|       79000|        1|
|     94|Female| 39|      1

In [7]:
# transforming the gender to int using string indexer
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol="Gender" , outputCol="Gender_indexed")
indexed_data = string_indexer.fit(data).transform(data)
indexed_data.show()

+-------+------+---+------------+---------+--------------+
|User ID|Gender|Age|AnnualSalary|Purchased|Gender_indexed|
+-------+------+---+------------+---------+--------------+
|    385|  Male| 35|       20000|        0|           1.0|
|    681|  Male| 40|       43500|        0|           1.0|
|    353|  Male| 49|       74000|        0|           1.0|
|    895|  Male| 40|      107500|        1|           1.0|
|    661|  Male| 25|       79000|        0|           1.0|
|    846|Female| 47|       33500|        1|           0.0|
|    219|Female| 46|      132500|        1|           0.0|
|    588|  Male| 42|       64000|        0|           1.0|
|     85|Female| 30|       84500|        0|           0.0|
|    465|  Male| 41|       52000|        0|           1.0|
|    686|  Male| 42|       80000|        0|           1.0|
|    408|  Male| 47|       23000|        1|           1.0|
|    790|Female| 32|       72500|        0|           0.0|
|    116|Female| 27|       57000|        0|           0.

In [14]:
# assembling the features using vectorAssembler
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(inputCols = ["User ID","Age","AnnualSalary","Gender_indexed"] , outputCol = "features")
finalized_data = va.transform(indexed_data)
finalized_data = finalized_data.select(["features" , "Purchased"])
finalized_data.show()

+--------------------+---------+
|            features|Purchased|
+--------------------+---------+
|[385.0,35.0,20000...|        0|
|[681.0,40.0,43500...|        0|
|[353.0,49.0,74000...|        0|
|[895.0,40.0,10750...|        1|
|[661.0,25.0,79000...|        0|
|[846.0,47.0,33500...|        1|
|[219.0,46.0,13250...|        1|
|[588.0,42.0,64000...|        0|
|[85.0,30.0,84500....|        0|
|[465.0,41.0,52000...|        0|
|[686.0,42.0,80000...|        0|
|[408.0,47.0,23000...|        1|
|[790.0,32.0,72500...|        0|
|[116.0,27.0,57000...|        0|
|[118.0,42.0,10800...|        1|
|[54.0,33.0,149000...|        1|
|[90.0,35.0,75000....|        0|
|[372.0,35.0,53000...|        0|
|[926.0,46.0,79000...|        1|
|[94.0,39.0,134000...|        1|
+--------------------+---------+
only showing top 20 rows



In [16]:
#spliting the data  : test , train
test_data , train_data = finalized_data.randomSplit([0.3,0.7])
print(test_data.count())
print(train_data.count())

286
714


In [18]:
#creating the model and training it
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(labelCol = "Purchased")
classifier = classifier.fit(train_data)

In [21]:
# getting the predictions
predictions = classifier.transform(test_data)
predictions.show()

+--------------------+---------+-------------+--------------------+----------+
|            features|Purchased|rawPrediction|         probability|prediction|
+--------------------+---------+-------------+--------------------+----------+
|[1.0,32.0,100000....|        1|   [11.0,8.0]|[0.57894736842105...|       0.0|
|[7.0,51.0,134000....|        0|  [10.0,66.0]|[0.13157894736842...|       1.0|
|[8.0,54.0,26000.0...|        1|   [3.0,42.0]|[0.06666666666666...|       1.0|
|[9.0,46.0,33500.0...|        1|   [3.0,42.0]|[0.06666666666666...|       1.0|
|[10.0,24.0,64500....|        0|  [232.0,0.0]|           [1.0,0.0]|       0.0|
|[13.0,29.0,80000....|        0|   [79.0,1.0]|     [0.9875,0.0125]|       0.0|
|[14.0,47.0,60500....|        0|  [24.0,19.0]|[0.55813953488372...|       0.0|
|[15.0,48.0,26500....|        1|   [3.0,42.0]|[0.06666666666666...|       1.0|
|[16.0,55.0,152500...|        1|  [10.0,66.0]|[0.13157894736842...|       1.0|
|[18.0,63.0,44500....|        1|    [0.0,3.0]|      

In [25]:
# eavluating the accuracy 
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bca = BinaryClassificationEvaluator(labelCol = "Purchased")
bca.evaluate(predictions)

0.9347490347490347

# the accuarcy using decision tree algorithm in this case is better than the logistic regression