<a href="https://colab.research.google.com/github/LamiMusa/Car-Transmission-Prediction/blob/main/Car-Transmission-Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one
# with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Sanity check: list the dataset archive so a missing file fails loudly here
# rather than in the unzip step.
# NOTE(review): the path is relative to the working directory and presumably
# requires Google Drive to be mounted at ./drive; no mount cell is visible in
# this notebook -- confirm before relying on Restart & Run All.
!ls drive/MyDrive/DataCollection/ukcars.zip

drive/MyDrive/DataCollection/ukcars.zip


In [None]:
!unzip -q drive/MyDrive/DataCollection/ukcars.zip -d sample_data/uk_cars

In [None]:
# List the extracted files to confirm the archive was unpacked into
# sample_data/uk_cars.
!ls sample_data/uk_cars

 audi.csv     focus.csv    merc.csv    'unclean cclass.csv'   vw.csv
 bmw.csv      ford.csv	   skoda.csv   'unclean focus.csv'
 cclass.csv   hyundi.csv   toyota.csv   vauxhall.csv


In [None]:
from pyspark.sql.types import *

# Glob matching every CSV file extracted from the archive.
file_location = "/content/sample_data/uk_cars/*.csv"

# (column name, Spark type) pairs, in the order the columns are declared.
# NOTE(review): the glob also matches files such as 'unclean cclass.csv' and
# 'unclean focus.csv'. With an explicit schema the CSV reader presumably maps
# columns by position rather than by header name, so any file with a different
# column layout would load misaligned or as nulls -- confirm per file.
_car_columns = [
    ("model", StringType()),
    ("Year", DoubleType()),
    ("Price", DoubleType()),
    ("Transmission", StringType()),
    ("Mileage", DoubleType()),
    ("Fueltype", StringType()),
    ("Tax", DoubleType()),
    ("mpg", DoubleType()),
    ("Engine Size", DoubleType()),
]

# Every field is declared with nullable=False. The null counts reported further
# down in this notebook show that nulls still appear after loading, so this
# flag should not be read as a guarantee that the columns are populated.
carSchema = StructType(
    [StructField(name, dtype, False) for name, dtype in _car_columns]
)

# Read all matched files with the declared schema, treating the first line of
# each file as a header.
data = spark.read.csv(file_location, schema=carSchema, header=True)

# Preview the first rows of the combined dataset.
data.show()

+---------+------+-------+------------+-------+--------+-----+----+-----------+
|    model|  Year|  Price|Transmission|Mileage|Fueltype|  Tax| mpg|Engine Size|
+---------+------+-------+------------+-------+--------+-----+----+-----------+
|   Fiesta|2017.0|12000.0|   Automatic|15944.0|  Petrol|150.0|57.7|        1.0|
|    Focus|2018.0|14000.0|      Manual| 9083.0|  Petrol|150.0|57.7|        1.0|
|    Focus|2017.0|13000.0|      Manual|12456.0|  Petrol|150.0|57.7|        1.0|
|   Fiesta|2019.0|17500.0|      Manual|10460.0|  Petrol|145.0|40.3|        1.5|
|   Fiesta|2019.0|16500.0|   Automatic| 1482.0|  Petrol|145.0|48.7|        1.0|
|   Fiesta|2015.0|10500.0|      Manual|35432.0|  Petrol|145.0|47.9|        1.6|
|     Puma|2019.0|22500.0|      Manual| 2029.0|  Petrol|145.0|50.4|        1.0|
|   Fiesta|2017.0| 9000.0|      Manual|13054.0|  Petrol|145.0|54.3|        1.2|
|     Kuga|2019.0|25500.0|   Automatic| 6894.0|  Diesel|145.0|42.2|        2.0|
|    Focus|2018.0|10000.0|      Manual|4

In [None]:
from pyspark.sql.functions import count, col, when
# Count the null entries in each column.
# The summary DataFrame is stored in nullCount and displayed in a separate
# statement: DataFrame.show() returns None, so chaining it into the assignment
# would leave nullCount empty.
nullCount = data.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in data.columns]
)
nullCount.show()

+-----+----+-----+------------+-------+--------+----+-----+-----------+
|model|Year|Price|Transmission|Mileage|Fueltype| Tax|  mpg|Engine Size|
+-----+----+-----+------------+-------+--------+----+-----+-----------+
|  155| 247| 9610|         155|   9372|     155|8694|10443|      18963|
+-----+----+-----+------------+-------+--------+----+-----+-----------+



In [None]:
# Drop every row that has a null in at least one column.
data = data.dropna(how="any")

# Recount nulls per column to confirm none remain. The summary DataFrame is
# kept in nullCount and shown separately, because DataFrame.show() returns
# None and would otherwise leave the variable empty.
nullCount = data.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in data.columns]
)
nullCount.show()

In [None]:
# Class balance of the target: number of rows per transmission type.
transmission_counts = data.groupBy("Transmission").count()
transmission_counts.show()

+------------+-----+
|Transmission|count|
+------------+-----+
|   Semi-Auto|22677|
|   Automatic|20056|
|       Other|    9|
|      Manual|56445|
+------------+-----+



In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [None]:
# Encode the string target "Transmission" as a numeric label column,
# "Index Transmission". Rows whose label the fitted indexer cannot handle are
# skipped (handleInvalid="skip").
# In the output below, Manual maps to 0.0 and Automatic to 2.0, which is
# consistent with the indexer ordering labels by descending frequency.
indexer = StringIndexer(
    inputCol="Transmission",
    outputCol="Index Transmission",
    handleInvalid="skip",
)

# Learn the label -> index mapping from the data, then apply it to the same data.
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)

indexed.show()

+---------+------+-------+------------+-------+--------+-----+----+-----------+------------------+
|    model|  Year|  Price|Transmission|Mileage|Fueltype|  Tax| mpg|Engine Size|Index Transmission|
+---------+------+-------+------------+-------+--------+-----+----+-----------+------------------+
|   Fiesta|2017.0|12000.0|   Automatic|15944.0|  Petrol|150.0|57.7|        1.0|               2.0|
|    Focus|2018.0|14000.0|      Manual| 9083.0|  Petrol|150.0|57.7|        1.0|               0.0|
|    Focus|2017.0|13000.0|      Manual|12456.0|  Petrol|150.0|57.7|        1.0|               0.0|
|   Fiesta|2019.0|17500.0|      Manual|10460.0|  Petrol|145.0|40.3|        1.5|               0.0|
|   Fiesta|2019.0|16500.0|   Automatic| 1482.0|  Petrol|145.0|48.7|        1.0|               2.0|
|   Fiesta|2015.0|10500.0|      Manual|35432.0|  Petrol|145.0|47.9|        1.6|               0.0|
|     Puma|2019.0|22500.0|      Manual| 2029.0|  Petrol|145.0|50.4|        1.0|               0.0|
|   Fiesta

In [None]:
# Combine the selected independent variables into a single vector column
# named "features", the input format expected by the classifier.
feature_columns = ["Price", "Tax", "mpg", "Engine Size"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Apply the assembler to the indexed data, producing a DataFrame that also
# carries the assembled "features" vector column.
output = assembler.transform(indexed)

In [None]:
# Keep only what the model needs: the feature vector and the indexed target.
model_columns = ["features", "Index Transmission"]
model_data = output.select(model_columns)

model_data.show()

+--------------------+------------------+
|            features|Index Transmission|
+--------------------+------------------+
|[12000.0,150.0,57...|               2.0|
|[14000.0,150.0,57...|               0.0|
|[13000.0,150.0,57...|               0.0|
|[17500.0,145.0,40...|               0.0|
|[16500.0,145.0,48...|               2.0|
|[10500.0,145.0,47...|               0.0|
|[22500.0,145.0,50...|               0.0|
|[9000.0,145.0,54....|               0.0|
|[25500.0,145.0,42...|               2.0|
|[10000.0,145.0,61...|               0.0|
|[11561.0,145.0,56...|               0.0|
|[13500.0,145.0,54...|               0.0|
|[11000.0,0.0,65.7...|               0.0|
|[17999.0,145.0,54...|               1.0|
|[18999.0,145.0,58...|               1.0|
|[14399.0,145.0,64...|               0.0|
|[17999.0,145.0,38...|               0.0|
|[16899.0,145.0,54...|               0.0|
|[10999.0,20.0,62....|               0.0|
|[12500.0,145.0,55...|               0.0|
+--------------------+------------

In [None]:
# Randomly split the model data into training and test sets using weights
# 0.7 and 0.3.
# A fixed seed is passed so that the split, and therefore the metrics computed
# from it, can be reproduced when the notebook is re-run.
trainingData, testData = model_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure a decision tree that learns the indexed transmission label from
# the assembled feature vector, then fit it on the training split.
dtClassifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Index Transmission",
)
dtModel = dtClassifier.fit(trainingData)

In [None]:
# Score the held-out test split with the fitted tree. As the output below
# shows, this adds rawPrediction, probability and prediction columns.
dtPredictions = dtModel.transform(testData)

# Preview the first scored rows.
dtPredictions.show()

+--------------------+------------------+--------------------+--------------------+----------+
|            features|Index Transmission|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+--------------------+----------+
|[450.0,235.0,40.4...|               0.0| [49.0,9.0,41.0,0.0]|[0.49494949494949...|       0.0|
|[590.0,160.0,44.8...|               0.0|[17301.0,634.0,87...|[0.91982561539688...|       0.0|
|[675.0,260.0,36.2...|               0.0|[4893.0,1334.0,17...|[0.61454408440090...|       0.0|
|[850.0,145.0,54.3...|               0.0|[17301.0,634.0,87...|[0.91982561539688...|       0.0|
|[850.0,195.0,43.5...|               0.0|[17301.0,634.0,87...|[0.91982561539688...|       0.0|
|[995.0,200.0,42.8...|               0.0|[17301.0,634.0,87...|[0.91982561539688...|       0.0|
|[1000.0,145.0,48....|               0.0|[17301.0,634.0,87...|[0.91982561539688...|       0.0|
|[1000.0,145.0,56....|               0.0|[17301.0,

In [None]:
# Evaluate the test-set predictions against the indexed label using the
# "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(labelCol="Index Transmission")
evaluator = evaluator.setMetricName("accuracy")

accuracy = evaluator.evaluate(dtPredictions)
accuracy

0.7038645245332176

In [None]:
# Evaluate the same predictions using the "weightedPrecision" metric.
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="Index Transmission",
    metricName="weightedPrecision",
)
precision = precision_evaluator.evaluate(dtPredictions)
precision

0.7127043395895822