# Install pyspark library

In [None]:
!pip install pyspark

In [1]:
#import pyspark library
import pyspark

In [2]:
#import spark session library
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession for this notebook; getOrCreate() returns an already
# running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Random_Forest_Classifier")
    .getOrCreate()
)

# Create DataFrame

In [4]:
# Load the ads dataset from CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
data1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Social_Network_Ads.csv")
)

In [5]:
# Print a preview of the loaded rows to check the columns were parsed as expected.
data1.show()

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
|15603246|Female| 27|          57000|        0|
|15804002|  Male| 19|          76000|        0|
|15728773|  Male| 27|          58000|        0|
|15598044|Female| 27|          84000|        0|
|15694829|Female| 32|         150000|        1|
|15600575|  Male| 25|          33000|        0|
|15727311|Female| 35|          65000|        0|
|15570769|Female| 26|          80000|        0|
|15606274|Female| 26|          52000|        0|
|15746139|  Male| 20|          86000|        0|
|15704987|  Male| 32|          18000|        0|
|15628972|  Male| 18|          82000|        0|
|15697686|  Male| 29|          80000|        0|
|15733883|  Male| 47|          25000|        1|
|15617482|  Male| 45|          26000|   

# Vectorize the features

In [6]:
from pyspark.ml.feature import *

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the two numeric predictors into a single vector column named "features",
# which is the input format the Spark ML stages below consume.
vectorizer = VectorAssembler(
    inputCols=["Age", "EstimatedSalary"],
    outputCol="features",
)

data = vectorizer.transform(data1)
# Show the first 10 rows without truncating the vector column.
data.show(n=10, truncate=False)

+--------+------+---+---------------+---------+---------------+
|User ID |Gender|Age|EstimatedSalary|Purchased|features       |
+--------+------+---+---------------+---------+---------------+
|15624510|Male  |19 |19000          |0        |[19.0,19000.0] |
|15810944|Male  |35 |20000          |0        |[35.0,20000.0] |
|15668575|Female|26 |43000          |0        |[26.0,43000.0] |
|15603246|Female|27 |57000          |0        |[27.0,57000.0] |
|15804002|Male  |19 |76000          |0        |[19.0,76000.0] |
|15728773|Male  |27 |58000          |0        |[27.0,58000.0] |
|15598044|Female|27 |84000          |0        |[27.0,84000.0] |
|15694829|Female|32 |150000         |1        |[32.0,150000.0]|
|15600575|Male  |25 |33000          |0        |[25.0,33000.0] |
|15727311|Female|35 |65000          |0        |[35.0,65000.0] |
+--------+------+---+---------------+---------+---------------+
only showing top 10 rows



In [9]:
# Print the assembled DataFrame again using show()'s default row limit and truncation.
data.show()

+--------+------+---+---------------+---------+---------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|       features|
+--------+------+---+---------------+---------+---------------+
|15624510|  Male| 19|          19000|        0| [19.0,19000.0]|
|15810944|  Male| 35|          20000|        0| [35.0,20000.0]|
|15668575|Female| 26|          43000|        0| [26.0,43000.0]|
|15603246|Female| 27|          57000|        0| [27.0,57000.0]|
|15804002|  Male| 19|          76000|        0| [19.0,76000.0]|
|15728773|  Male| 27|          58000|        0| [27.0,58000.0]|
|15598044|Female| 27|          84000|        0| [27.0,84000.0]|
|15694829|Female| 32|         150000|        1|[32.0,150000.0]|
|15600575|  Male| 25|          33000|        0| [25.0,33000.0]|
|15727311|Female| 35|          65000|        0| [35.0,65000.0]|
|15570769|Female| 26|          80000|        0| [26.0,80000.0]|
|15606274|Female| 26|          52000|        0| [26.0,52000.0]|
|15746139|  Male| 20|          86000|   

# Data Scaling

In [10]:
#import scaler library
from pyspark.ml.feature import MinMaxScaler

In [11]:
# Configure a min-max scaler that reads the assembled "features" vector and
# writes the rescaled vector to "scaledFeatures".
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [12]:
# Fit the MinMaxScaler on the assembled feature vectors.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the scaling
# statistics. Consider fitting it on the training split only.
scalerModel = scaler.fit(data)

In [13]:
# Apply the fitted scaler, adding the "scaledFeatures" column next to "features".
scaledData = scalerModel.transform(data)

In [14]:
# Print a preview of the scaled DataFrame (original columns plus both vector columns).
scaledData.show()

+--------+------+---+---------------+---------+---------------+--------------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|       features|      scaledFeatures|
+--------+------+---+---------------+---------+---------------+--------------------+
|15624510|  Male| 19|          19000|        0| [19.0,19000.0]|[0.02380952380952...|
|15810944|  Male| 35|          20000|        0| [35.0,20000.0]|[0.40476190476190...|
|15668575|Female| 26|          43000|        0| [26.0,43000.0]|[0.19047619047619...|
|15603246|Female| 27|          57000|        0| [27.0,57000.0]|[0.21428571428571...|
|15804002|  Male| 19|          76000|        0| [19.0,76000.0]|[0.02380952380952...|
|15728773|  Male| 27|          58000|        0| [27.0,58000.0]|[0.21428571428571...|
|15598044|Female| 27|          84000|        0| [27.0,84000.0]|[0.21428571428571...|
|15694829|Female| 32|         150000|        1|[32.0,150000.0]|[0.33333333333333...|
|15600575|  Male| 25|          33000|        0| [25.0,33000.0]|[0

In [15]:
# Fetch the first two scaled feature vectors as Row objects to inspect their values.
scaledData.select("scaledFeatures").take(2)

[Row(scaledFeatures=DenseVector([0.0238, 0.0296])),
 Row(scaledFeatures=DenseVector([0.4048, 0.037]))]

Without scaling, use the `features` column as the input column.

With scaling, use the `scaledFeatures` column as the input column.

# Model Training

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
# Index the "Purchased" labels into "indexedLabel", adding metadata to the label column.
# The indexer is fitted on the whole dataset so that every label value is in the index.
labelIndexer = StringIndexer(
    inputCol="Purchased",
    outputCol="indexedLabel",
).fit(scaledData)

In [18]:
# Automatically identify categorical features and index them.
# With maxCategories=4, features with more than 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="scaledFeatures",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(scaledData)

In [19]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split repeatable, so the metrics reported below can be
# reproduced after a kernel restart instead of changing on every run.
(trainingData, testData) = scaledData.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Configure the RandomForest classifier (it is trained when the pipeline is fitted).
# The explicit seed makes the forest's random sampling repeatable between runs.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=42,
)

In [21]:
# Translate the numeric "prediction" column back into the original label values,
# using the label list learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [22]:
# Chain the stages in execution order: index labels, index features,
# train/apply the forest, then convert predictions back to label values.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        rf,
        labelConverter,
    ]
)

In [23]:
# Fit the pipeline on the training split only. This runs the indexer stages and
# trains the random forest; the result is a fitted pipeline model.
model = pipeline.fit(trainingData)

# Model Testing

In [24]:
# Run the fitted pipeline on the held-out test split to produce predictions.
predictions = model.transform(testData)

In [25]:
# Show the first 5 test rows: predicted label, actual label ("Purchased") and
# the scaled feature vector the prediction was made from.
predictions.select("predictedLabel", "Purchased", "scaledFeatures").show(5)

+--------------+---------+--------------------+
|predictedLabel|Purchased|      scaledFeatures|
+--------------+---------+--------------------+
|             0|        1|[0.52380952380952...|
|             1|        1|[0.95238095238095...|
|             0|        0|[0.40476190476190...|
|             1|        1|[0.59523809523809...|
|             0|        1|[0.54761904761904...|
+--------------+---------+--------------------+
only showing top 5 rows



# Model Summary

In [26]:
# Score the predictions against the indexed labels using accuracy,
# then report the test error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.179245


In [27]:
# Stage index 2 corresponds to the random forest in the pipeline's stage list
# [labelIndexer, featureIndexer, rf, labelConverter]; print its summary.
rfModel = model.stages[2]
print(rfModel)

RandomForestClassificationModel: uid=RandomForestClassifier_55acfe893d2f, numTrees=10, numClasses=2, numFeatures=2


In [28]:
# Count test rows for each (actual, predicted) label pair, i.e. the confusion matrix in table form.
predictions.groupBy('Purchased', 'predictedLabel').count().show()

+---------+--------------+-----+
|Purchased|predictedLabel|count|
+---------+--------------+-----+
|        1|             0|   11|
|        0|             1|    8|
|        0|             0|   52|
|        1|             1|   35|
+---------+--------------+-----+



In [29]:
# Calculate the elements of the confusion matrix.
# All four cells are computed in a single aggregation (one Spark action) rather
# than four separate filter().count() actions. count(CASE WHEN ... THEN 1 END)
# counts only the rows that satisfy the condition, because non-matching rows
# yield NULL and count() ignores NULLs.
confusion = predictions.selectExpr(
    "count(CASE WHEN predictedLabel = 0 AND Purchased = 0 THEN 1 END) AS TN",
    "count(CASE WHEN predictedLabel = 1 AND Purchased = 1 THEN 1 END) AS TP",
    "count(CASE WHEN predictedLabel = 0 AND Purchased = 1 THEN 1 END) AS FN",
    "count(CASE WHEN predictedLabel = 1 AND Purchased = 0 THEN 1 END) AS FP",
).first()
TN, TP, FN, FP = confusion["TN"], confusion["TP"], confusion["FN"], confusion["FP"]

# Accuracy measures the proportion of correct predictions.
# Guard against an empty test set so the cell reports nan instead of raising
# ZeroDivisionError.
total = TN + TP + FN + FP
accuracy = (TN + TP) / total if total else float("nan")
print("Model Accuracy = %g " % (accuracy))

Model Accuracy = 0.820755 
