# Creating SparkSession

In [1]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one named "Practice".
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark



# Loading the data

In [2]:
# Read the CSV with its first row as the header and let Spark infer column types.
df_pyspark = spark.read.csv(
    "car_data.csv",
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
df_pyspark.show(5)

+------+------------+-----+-------+--------+------+--------+
|buying|maintainence|doors|persons|lug_boot|safety|car_type|
+------+------------+-----+-------+--------+------+--------+
| vhigh|       vhigh|    2|      2|   small|   low|   unacc|
| vhigh|       vhigh|    2|      2|   small|   med|   unacc|
| vhigh|       vhigh|    2|      2|   small|  high|   unacc|
| vhigh|       vhigh|    2|      2|     med|   low|   unacc|
| vhigh|       vhigh|    2|      2|     med|   med|   unacc|
+------+------------+-----+-------+--------+------+--------+
only showing top 5 rows



# Schema of this DataFrame

In [3]:
# Print each column's name, inferred type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- buying: string (nullable = true)
 |-- maintainence: string (nullable = true)
 |-- doors: string (nullable = true)
 |-- persons: string (nullable = true)
 |-- lug_boot: string (nullable = true)
 |-- safety: string (nullable = true)
 |-- car_type: string (nullable = true)



# Encoding String Columns into Integers

In [4]:
from pyspark.ml.feature import StringIndexer

# Columns to index; this list includes the car_type label as well as the predictors.
categoricalColumns = ["buying","maintainence","doors","persons","lug_boot","safety","car_type"]

# Accumulate the encoded columns on a separate name instead of reassigning
# df_pyspark. StringIndexer refuses to write to an output column that already
# exists, so mutating df_pyspark made this cell fail when it was run a second time.
indexed_df = df_pyspark
for categoricalCol in categoricalColumns:
    encodedCol = categoricalCol + "_encoded"
    # Fit the string -> index mapping for this column, then apply it.
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=encodedCol).fit(indexed_df)
    indexed_df = stringIndexer.transform(indexed_df)
    # StringIndexer emits doubles; store the indices as integers.
    indexed_df = indexed_df.withColumn(encodedCol, indexed_df[encodedCol].cast('int'))

# Keep only the encoded columns for the modelling steps.
encoded_df = indexed_df.select(
    "buying_encoded",
    "doors_encoded",
    "maintainence_encoded",
    "persons_encoded",
    "lug_boot_encoded",
    "safety_encoded",
    "car_type_encoded",
)
encoded_df.show(5)

+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+
|buying_encoded|doors_encoded|maintainence_encoded|persons_encoded|lug_boot_encoded|safety_encoded|car_type_encoded|
+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+
|             3|            0|                   3|              0|               2|             1|               0|
|             3|            0|                   3|              0|               2|             2|               0|
|             3|            0|                   3|              0|               2|             0|               0|
|             3|            0|                   3|              0|               1|             1|               0|
|             3|            0|                   3|              0|               1|             2|               0|
+--------------+-------------+--------------------+-------------

# Feature extraction 

In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack the six encoded predictor columns into one vector column named "features".
# car_type_encoded is not part of the vector; it is selected alongside it below.
featureAssembler = VectorAssembler(
    inputCols=[
        "buying_encoded",
        "doors_encoded",
        "maintainence_encoded",
        "persons_encoded",
        "lug_boot_encoded",
        "safety_encoded",
    ],
    outputCol="features",
)
output = featureAssembler.transform(encoded_df)
output.select("features", "car_type_encoded").show(5)

+--------------------+----------------+
|            features|car_type_encoded|
+--------------------+----------------+
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
+--------------------+----------------+
only showing top 5 rows



# Splitting the data 

In [6]:
# 80% / 20% random split; the seed is fixed so the split can be repeated.
split_weights = [0.8, 0.2]
train, test = output.randomSplit(split_weights, seed=17)

In [7]:
# Count the rows in each partition, then report both sizes.
n_train_rows = train.count()
n_test_rows = test.count()
print("Size of training data: ", n_train_rows)
print("Size of testing data: ", n_test_rows)

Size of training data:  1377
Size of testing data:  351


# Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector, capped at 10 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="car_type_encoded",
    maxIter=10,
)
# Fit on the training partition only.
lrModel = lr.fit(train)

In [9]:
# Score the held-out test rows with the fitted logistic-regression model.
predictions = lrModel.transform(test)
# Preview the first five scored rows.
predictions.show(5)

+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+-------------------+--------------------+--------------------+----------+
|buying_encoded|doors_encoded|maintainence_encoded|persons_encoded|lug_boot_encoded|safety_encoded|car_type_encoded|           features|       rawPrediction|         probability|prediction|
+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+-------------------+--------------------+--------------------+----------+
|             0|            0|                   0|              0|               0|             0|               0|          (6,[],[])|[1.99198136762971...|[0.76125517518711...|       0.0|
|             0|            0|                   0|              0|               2|             0|               0|      (6,[4],[2.0])|[3.26816128608158...|[0.90473344519298...|       0.0|
|             0|            0|                   0

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# MulticlassClassificationEvaluator does not compute area under ROC; its default
# metric is F1. The metric is named explicitly so the printed label matches what
# is actually measured.
evaluator = MulticlassClassificationEvaluator(
    labelCol="car_type_encoded",
    predictionCol="prediction",
    metricName="f1",
)
# Evaluate once and reuse the value (the original ran the evaluation twice).
lr_f1 = evaluator.evaluate(predictions)
print("Test F1 score: ", lr_f1)

Test Area Under ROC:  0.5901985607326307


# Decision Tree

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a decision tree limited to depth 3 on the training partition.
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'car_type_encoded', maxDepth = 3)
dtModel = dt.fit(train)

# Score the held-out test rows.
predictions = dtModel.transform(test)

# Evaluate the predictions. MulticlassClassificationEvaluator does not compute
# area under ROC; its default metric is F1, so the metric is named explicitly and
# the printed label says what is actually measured.
evaluator = MulticlassClassificationEvaluator(
    labelCol="car_type_encoded",
    predictionCol="prediction",
    metricName="f1",
)
print("Test F1 score: ", evaluator.evaluate(predictions))

Test Area Under ROC:  0.7398590195812419


# Random Forest

In [12]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a forest of 500 trees, each limited to depth 10. A fixed seed (the same
# value used for the train/test split) makes the randomised training repeatable,
# so the reported score can be reproduced on a fresh kernel.
rf = RandomForestClassifier(
    featuresCol = 'features',
    labelCol = 'car_type_encoded',
    numTrees = 500,
    maxDepth = 10,
    seed = 17,
)
rfModel = rf.fit(train)

# Score the held-out test rows.
predictions = rfModel.transform(test)

# Evaluate the predictions. MulticlassClassificationEvaluator does not compute
# area under ROC; its default metric is F1, so the metric is named explicitly and
# the printed label says what is actually measured.
evaluator = MulticlassClassificationEvaluator(
    labelCol="car_type_encoded",
    predictionCol="prediction",
    metricName="f1",
)
print("Test F1 score: ", evaluator.evaluate(predictions))

Test Area Under ROC:  0.9858482629300609
