# Install pyspark library

In [None]:
!pip install pyspark

In [1]:
#import pyspark library
import pyspark

In [2]:
#import spark session library
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the SparkSession used as the entry point for the rest of the notebook.
session_builder = SparkSession.builder.appName("Decision_Tree_Classification")
spark = session_builder.getOrCreate()

# Create DataFrame

In [4]:
# Read the CSV file, using its first row as the header and letting Spark infer column types.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
data1 = csv_reader.load("Social_Network_Ads.csv")

In [5]:
# Preview the loaded rows to check the columns and the parsed values.
data1.show()

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
|15603246|Female| 27|          57000|        0|
|15804002|  Male| 19|          76000|        0|
|15728773|  Male| 27|          58000|        0|
|15598044|Female| 27|          84000|        0|
|15694829|Female| 32|         150000|        1|
|15600575|  Male| 25|          33000|        0|
|15727311|Female| 35|          65000|        0|
|15570769|Female| 26|          80000|        0|
|15606274|Female| 26|          52000|        0|
|15746139|  Male| 20|          86000|        0|
|15704987|  Male| 32|          18000|        0|
|15628972|  Male| 18|          82000|        0|
|15697686|  Male| 29|          80000|        0|
|15733883|  Male| 47|          25000|        1|
|15617482|  Male| 45|          26000|   

# Vectorize the features

In [6]:
from pyspark.ml.feature import *

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine the two numeric predictors into a single vector column named "features".
vectorizer = VectorAssembler(inputCols=["Age", "EstimatedSalary"], outputCol="features")

# Apply the assembler and print the first 10 rows without truncating cell contents.
data = vectorizer.transform(data1)
data.show(10, truncate=False)

+--------+------+---+---------------+---------+---------------+
|User ID |Gender|Age|EstimatedSalary|Purchased|features       |
+--------+------+---+---------------+---------+---------------+
|15624510|Male  |19 |19000          |0        |[19.0,19000.0] |
|15810944|Male  |35 |20000          |0        |[35.0,20000.0] |
|15668575|Female|26 |43000          |0        |[26.0,43000.0] |
|15603246|Female|27 |57000          |0        |[27.0,57000.0] |
|15804002|Male  |19 |76000          |0        |[19.0,76000.0] |
|15728773|Male  |27 |58000          |0        |[27.0,58000.0] |
|15598044|Female|27 |84000          |0        |[27.0,84000.0] |
|15694829|Female|32 |150000         |1        |[32.0,150000.0]|
|15600575|Male  |25 |33000          |0        |[25.0,33000.0] |
|15727311|Female|35 |65000          |0        |[35.0,65000.0] |
+--------+------+---+---------------+---------+---------------+
only showing top 10 rows



# Data Scaling

In [9]:
#import scaler library
from pyspark.ml.feature import MinMaxScaler

In [10]:
# Configure a min-max scaler that reads "features" and writes "scaledFeatures".
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [11]:
# Fit the scaler on the assembled feature data; the fitted model is applied in the next cell.
scalerModel = scaler.fit(data)

In [12]:
# Apply the fitted scaler; the scaler is configured to write its result to "scaledFeatures".
scaledData = scalerModel.transform(data)

In [13]:
# Preview the data with the scaled feature vectors next to the unscaled ones.
scaledData.show()

+--------+------+---+---------------+---------+---------------+--------------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|       features|      scaledFeatures|
+--------+------+---+---------------+---------+---------------+--------------------+
|15624510|  Male| 19|          19000|        0| [19.0,19000.0]|[0.02380952380952...|
|15810944|  Male| 35|          20000|        0| [35.0,20000.0]|[0.40476190476190...|
|15668575|Female| 26|          43000|        0| [26.0,43000.0]|[0.19047619047619...|
|15603246|Female| 27|          57000|        0| [27.0,57000.0]|[0.21428571428571...|
|15804002|  Male| 19|          76000|        0| [19.0,76000.0]|[0.02380952380952...|
|15728773|  Male| 27|          58000|        0| [27.0,58000.0]|[0.21428571428571...|
|15598044|Female| 27|          84000|        0| [27.0,84000.0]|[0.21428571428571...|
|15694829|Female| 32|         150000|        1|[32.0,150000.0]|[0.33333333333333...|
|15600575|  Male| 25|          33000|        0| [25.0,33000.0]|[0

In [14]:
# Fetch the first two scaled feature vectors to inspect their full values.
scaled_only = scaledData.select("scaledFeatures")
scaled_only.take(2)

[Row(scaledFeatures=DenseVector([0.0238, 0.0296])),
 Row(scaledFeatures=DenseVector([0.4048, 0.037]))]

Without scaling, use the `features` column as the input column.

With scaling, use the `scaledFeatures` column as the input column.

# Model Training

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# Map the raw "Purchased" values to label indices in a new "indexedLabel" column.
# The indexer is fitted on the whole dataset so that every label value receives an index.
label_indexer_spec = StringIndexer(inputCol="Purchased", outputCol="indexedLabel")
labelIndexer = label_indexer_spec.fit(scaledData)

In [17]:
# Detect categorical features inside the scaled vector and index them.
# With maxCategories=4, a feature with more than 4 distinct values is treated as continuous.
feature_indexer_spec = VectorIndexer(
    inputCol="scaledFeatures",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = feature_indexer_spec.fit(scaledData)

In [18]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split — and therefore the metrics computed from it —
# repeatable across kernel restarts for the same input data; change the seed
# value to draw a different split.
trainingData, testData = scaledData.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Configure the decision-tree classifier to learn from the indexed label and feature columns.
dt = DecisionTreeClassifier(
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
)

In [20]:
# Assemble the pipeline: label indexing first, then feature indexing, then the tree.
pipeline_stages = [labelIndexer, featureIndexer, dt]
pipeline = Pipeline(stages=pipeline_stages)

In [21]:
# Train model.  This also runs the indexers.
# Only the training split is passed in; the test split is used later for predictions.
model = pipeline.fit(trainingData)

# Model Testing

In [22]:
# Make predictions.
# Run the fitted pipeline over the held-out test split.
predictions = model.transform(testData)

In [23]:
# Show a few rows: the predicted class next to the raw label and the scaled inputs.
preview_columns = ["prediction", "Purchased", "scaledFeatures"]
predictions.select(*preview_columns).show(5)

+----------+---------+--------------------+
|prediction|Purchased|      scaledFeatures|
+----------+---------+--------------------+
|       0.0|        0|[0.40476190476190...|
|       1.0|        1|[0.95238095238095...|
|       0.0|        0|[0.40476190476190...|
|       1.0|        1|[0.95238095238095...|
|       0.0|        0|[0.11904761904761...|
+----------+---------+--------------------+
only showing top 5 rows



# Model Summary

In [24]:
# Score the predictions against the indexed label and report the error rate (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g " % test_error)

Test Error = 0.0869565 


In [25]:
# The pipeline was built with three stages and the tree is the last one;
# pull out the fitted tree and print its one-line summary.
treeModel = model.stages[-1]
print(treeModel)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eb30250fc606, depth=5, numNodes=31, numClasses=2, numFeatures=2


In [26]:
# Count the test rows for each combination of raw label and predicted class.
outcome_counts = predictions.groupBy("Purchased", "prediction").count()
outcome_counts.show()

+---------+----------+-----+
|Purchased|prediction|count|
+---------+----------+-----+
|        1|       0.0|    8|
|        0|       0.0|   66|
|        1|       1.0|   39|
|        0|       1.0|    2|
+---------+----------+-----+



In [27]:
# Calculate the elements of the confusion matrix.
# The tree is trained on "indexedLabel", so its "prediction" column holds label
# indices. Predictions are therefore compared with "indexedLabel" (the same column
# the MulticlassClassificationEvaluator in this notebook scores against) instead of
# the raw "Purchased" values, which only line up with the indices by coincidence.
# Class index 1 is treated as the positive class.
# A single grouped count replaces four separate filter-and-count jobs.
cell_counts = {
    (row["indexedLabel"], row["prediction"]): row["count"]
    for row in predictions.groupBy("indexedLabel", "prediction").count().collect()
}
TN = cell_counts.get((0.0, 0.0), 0)  # actual 0, predicted 0
TP = cell_counts.get((1.0, 1.0), 0)  # actual 1, predicted 1
FN = cell_counts.get((1.0, 0.0), 0)  # actual 1, predicted 0
FP = cell_counts.get((0.0, 1.0), 0)  # actual 0, predicted 1

# Accuracy measures the proportion of correct predictions.
# Guard against an empty test split so the cell reports NaN instead of raising
# ZeroDivisionError.
total = TN + TP + FN + FP
accuracy = (TN + TP) / total if total else float("nan")
print("Model Accuracy = %g " % (accuracy))

Model Accuracy = 0.913043 
