In [1]:
# data set from : https://www.kaggle.com/code/casper6290/car-purchase-prediction/data

importing the spark session and initializing it

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [3]:
spark_session = SparkSession.builder.appName("cars").getOrCreate()

loding the data

In [4]:
data = spark_session.read.csv("car_data.csv", inferSchema = True, header = True)
data.show()

+-------+------+---+------------+---------+
|User ID|Gender|Age|AnnualSalary|Purchased|
+-------+------+---+------------+---------+
|    385|  Male| 35|       20000|        0|
|    681|  Male| 40|       43500|        0|
|    353|  Male| 49|       74000|        0|
|    895|  Male| 40|      107500|        1|
|    661|  Male| 25|       79000|        0|
|    846|Female| 47|       33500|        1|
|    219|Female| 46|      132500|        1|
|    588|  Male| 42|       64000|        0|
|     85|Female| 30|       84500|        0|
|    465|  Male| 41|       52000|        0|
|    686|  Male| 42|       80000|        0|
|    408|  Male| 47|       23000|        1|
|    790|Female| 32|       72500|        0|
|    116|Female| 27|       57000|        0|
|    118|Female| 42|      108000|        1|
|     54|Female| 33|      149000|        1|
|     90|  Male| 35|       75000|        0|
|    372|  Male| 35|       53000|        0|
|    926|  Male| 46|       79000|        1|
|     94|Female| 39|      134000

transforming the gender to int

In [5]:
from pyspark.ml.feature import StringIndexer
str_indexer = StringIndexer(inputCol="Gender" , outputCol="gender_indexed")

In [6]:
indexed = str_indexer.fit(data).transform(data)
indexed.show()

+-------+------+---+------------+---------+--------------+
|User ID|Gender|Age|AnnualSalary|Purchased|gender_indexed|
+-------+------+---+------------+---------+--------------+
|    385|  Male| 35|       20000|        0|           1.0|
|    681|  Male| 40|       43500|        0|           1.0|
|    353|  Male| 49|       74000|        0|           1.0|
|    895|  Male| 40|      107500|        1|           1.0|
|    661|  Male| 25|       79000|        0|           1.0|
|    846|Female| 47|       33500|        1|           0.0|
|    219|Female| 46|      132500|        1|           0.0|
|    588|  Male| 42|       64000|        0|           1.0|
|     85|Female| 30|       84500|        0|           0.0|
|    465|  Male| 41|       52000|        0|           1.0|
|    686|  Male| 42|       80000|        0|           1.0|
|    408|  Male| 47|       23000|        1|           1.0|
|    790|Female| 32|       72500|        0|           0.0|
|    116|Female| 27|       57000|        0|           0.

assembling features

In [7]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ["User ID","Age","AnnualSalary","gender_indexed"] , outputCol="features")
finalized_data = assembler.transform(indexed)
finalized_data = finalized_data.select("features" , "Purchased")
finalized_data.show(truncate = False)

+-------------------------+---------+
|features                 |Purchased|
+-------------------------+---------+
|[385.0,35.0,20000.0,1.0] |0        |
|[681.0,40.0,43500.0,1.0] |0        |
|[353.0,49.0,74000.0,1.0] |0        |
|[895.0,40.0,107500.0,1.0]|1        |
|[661.0,25.0,79000.0,1.0] |0        |
|[846.0,47.0,33500.0,0.0] |1        |
|[219.0,46.0,132500.0,0.0]|1        |
|[588.0,42.0,64000.0,1.0] |0        |
|[85.0,30.0,84500.0,0.0]  |0        |
|[465.0,41.0,52000.0,1.0] |0        |
|[686.0,42.0,80000.0,1.0] |0        |
|[408.0,47.0,23000.0,1.0] |1        |
|[790.0,32.0,72500.0,0.0] |0        |
|[116.0,27.0,57000.0,0.0] |0        |
|[118.0,42.0,108000.0,0.0]|1        |
|[54.0,33.0,149000.0,0.0] |1        |
|[90.0,35.0,75000.0,1.0]  |0        |
|[372.0,35.0,53000.0,1.0] |0        |
|[926.0,46.0,79000.0,1.0] |1        |
|[94.0,39.0,134000.0,0.0] |1        |
+-------------------------+---------+
only showing top 20 rows



splitting the data into test and train data

In [8]:
test_data,  train_data = finalized_data.randomSplit([0.3,0.7])

creating  the model and using it

In [9]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Purchased")

In [10]:
lr = lr.fit(train_data)
res = lr.evaluate(test_data).predictions

In [11]:
res.show()

+--------------------+---------+--------------------+--------------------+----------+
|            features|Purchased|       rawPrediction|         probability|prediction|
+--------------------+---------+--------------------+--------------------+----------+
|[19.0,48.0,31500....|        1|[0.40336316718364...|[0.59949542830589...|       0.0|
|[20.0,40.0,107000...|        1|[-0.5885614172981...|[0.35696500001117...|       1.0|
|[22.0,56.0,131500...|        1|[-4.8644174296869...|[0.00765723647883...|       1.0|
|[26.0,47.0,47000....|        0|[0.08060364027053...|[0.52014000720184...|       0.0|
|[27.0,40.0,60500....|        0|[1.15175962886747...|[0.75983217356560...|       0.0|
|[29.0,36.0,40500....|        0|[2.62271995905869...|[0.93230956077747...|       0.0|
|[30.0,27.0,58000....|        0|[3.99493241529427...|[0.98192406380064...|       0.0|
|[32.0,47.0,50000....|        1|[-0.0248213863584...|[0.49379497198423...|       1.0|
|[33.0,30.0,62500....|        0|[3.17537249382304...|[

evaluate the accuracy of the model

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="Purchased")
evaluator.evaluate(res)

0.9163201008191556