## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/data.csv"
file_type = "csv"


In [0]:
# import data
df=spark.read.csv(file_location,header=True, inferSchema=True)
display(df)

id,Time,Is_CH,who CH,Dist_To_CH,ADV_S,ADV_R,JOIN_S,JOIN_R,SCH_S,SCH_R,Rank,DATA_S,DATA_R,Data_Sent_To_BS,dist_CH_To_BS,send_code,Expaned Energy,Attack type
101000,50,1,101000,0.0,1,0,0,25,1,0,0,0,1200,48,130.08535,0,2.4694,Normal
101001,50,0,101044,75.32345,0,4,1,0,0,1,2,38,0,0,0.0,4,0.06957,Normal
101002,50,0,101010,46.95453,0,4,1,0,0,1,19,41,0,0,0.0,3,0.06898,Normal
101003,50,0,101044,64.85231,0,4,1,0,0,1,16,38,0,0,0.0,4,0.06673,Normal
101004,50,0,101010,4.83341,0,4,1,0,0,1,25,41,0,0,0.0,3,0.06534,Normal
101005,50,0,101010,31.91198,0,4,1,0,0,1,18,41,0,0,0.0,3,0.06717,Normal
101006,50,0,101044,24.34167,0,4,1,0,0,1,5,38,0,0,0.0,4,0.06214,Normal
101007,50,0,101010,26.75033,0,4,1,0,0,1,21,41,0,0,0.0,3,0.06662,Normal
101008,50,0,101044,63.66485,0,4,1,0,0,1,17,38,0,0,0.0,4,0.06649,Normal
101009,50,0,101000,32.90217,0,4,1,0,0,1,12,48,0,0,0.0,1,0.07903,Normal


In [0]:
df.printSchema()

root
 |--  id: integer (nullable = true)
 |--  Time: integer (nullable = true)
 |--  Is_CH: integer (nullable = true)
 |--  who CH: integer (nullable = true)
 |--  Dist_To_CH: double (nullable = true)
 |--  ADV_S: integer (nullable = true)
 |--  ADV_R: integer (nullable = true)
 |--  JOIN_S: integer (nullable = true)
 |--  JOIN_R: integer (nullable = true)
 |--  SCH_S: integer (nullable = true)
 |--  SCH_R: integer (nullable = true)
 |-- Rank: integer (nullable = true)
 |--  DATA_S: integer (nullable = true)
 |--  DATA_R: integer (nullable = true)
 |--  Data_Sent_To_BS: integer (nullable = true)
 |--  dist_CH_To_BS: double (nullable = true)
 |--  send_code : integer (nullable = true)
 |-- Expaned Energy: double (nullable = true)
 |-- Attack type: string (nullable = true)



In [0]:
# checking Null value in evry columns

from pyspark.sql.functions import *

col_null_cnt_df =  df.select([count(when(col(c).isNull(),c)).alias(c) for c in df.columns])

display(col_null_cnt_df)
# Output shows that there doesnt have any null vlaue

id,Time,Is_CH,who CH,Dist_To_CH,ADV_S,ADV_R,JOIN_S,JOIN_R,SCH_S,SCH_R,Rank,DATA_S,DATA_R,Data_Sent_To_BS,dist_CH_To_BS,send_code,Expaned Energy,Attack type
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# check categorical columns if have we have to make it numerical value
# I use stringIndexer and one hot encoding to make it numerical
# so Attack type column is categorical

ctg_col= [item[0] for item in df.dtypes if item[1].startswith('string')]
ctg_col

Out[5]: ['Attack type']

In [0]:
# import necessary library 
from pyspark.ml.feature import StringIndexer,MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,LogisticRegression
from pyspark.ml import Pipeline


In [0]:
# split the data set into test and train 
train_df,test_df= df.randomSplit([0.75,0.25])

train_df.count(),test_df.count()

Out[7]: (281375, 93286)

In [0]:
# StringIndexer for creating numerical value of Attack type colom
Indexer=StringIndexer(inputCol='Attack type', outputCol='Target_col')



# grouping the columns for independent variable ,use vectorAssembeler. the columns should be sorting way
Assembler=VectorAssembler(inputCols=[' id',
 ' Time',
 ' Is_CH',
 ' who CH',
 ' Dist_To_CH',
 ' ADV_S',
 ' ADV_R',
 ' JOIN_S',
 ' JOIN_R',
 ' SCH_S',
 ' SCH_R',
 'Rank',
 ' DATA_S',
 ' DATA_R',
 ' Data_Sent_To_BS',
 ' dist_CH_To_BS',
 ' send_code ',
 'Expaned Energy'],outputCol='Independet_features')   

# MinMaxScaler Transformation 
scaler = MinMaxScaler(inputCol="Independet_features", outputCol="Independet_col")

# 1st Model Using decision tree

dt_model= DecisionTreeClassifier(labelCol='Target_col',featuresCol='Independet_col')

# The second model using LogisticRegression
lr_model =  LogisticRegression(featuresCol='Independet_col',labelCol='Target_col')



In [0]:
# predict 1st model 

# Setup the pipeline
pipeline=Pipeline(stages=[Indexer,Assembler,scaler,dt_model])

# Fill the pipeline model
final_pipeline= pipeline.fit(train_df)

# predict on test data
test_prediction = final_pipeline.transform(test_df)
test_prediction.show(10,truncate=False)

+------+-----+------+-------+-----------+------+------+-------+-------+------+------+----+-------+-------+----------------+--------------+-----------+--------------+-----------+----------+------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+-----------------------------------------------------+----------+
| id   | Time| Is_CH| who CH| Dist_To_CH| ADV_S| ADV_R| JOIN_S| JOIN_R| SCH_S| SCH_R|Rank| DATA_S| DATA_R| Data_Sent_To_BS| dist_CH_To_BS| send_code |Expaned Energy|Attack type|Target_col|Independet_features                                                                             |Independet_col                                                                                                   

In [0]:
# display as pandas dataframe 
display(test_prediction)

id,Time,Is_CH,who CH,Dist_To_CH,ADV_S,ADV_R,JOIN_S,JOIN_R,SCH_S,SCH_R,Rank,DATA_S,DATA_R,Data_Sent_To_BS,dist_CH_To_BS,send_code,Expaned Energy,Attack type,Target_col,Independet_features,Independet_col,rawPrediction,probability,prediction
101000,53,0,101030,42.04102,0,2,1,0,0,1,12,19,0,0,0.0,2,45.03302,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101030.0, 42.04102, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 12.0, 19.0, 0.0, 0.0, 0.0, 2.0, 45.03302))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 9.08787979764321E-6, 0.19620158467671067, 0.017094017094017096, 1.0, 1.0, 0.12121212121212122, 0.07883817427385892, 0.13333333333333333, 0.9986490424212211))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101000,53,0,101043,24.35267,0,7,1,0,0,1,16,65,0,0,0.0,6,0.10608,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101043.0, 24.35267, 0.0, 7.0, 1.0, 0.0, 0.0, 1.0, 16.0, 65.0, 0.0, 0.0, 0.0, 6.0, 0.10608))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 1.3025961043288602E-5, 0.11365167745951432, 0.05982905982905984, 1.0, 1.0, 0.16161616161616163, 0.2697095435684647, 0.4, 0.002352422520631375))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101000,53,0,101073,11.24658,0,7,1,0,0,1,1,80,0,0,0.0,5,0.12931,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101073.0, 11.24658, 0.0, 7.0, 1.0, 0.0, 0.0, 1.0, 1.0, 80.0, 0.0, 0.0, 0.0, 5.0, 0.12931))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.211384084093181E-5, 0.05248675741438719, 0.05982905982905984, 1.0, 1.0, 0.010101010101010102, 0.33195020746887965, 0.3333333333333333, 0.002867569345237963))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101000,53,0,101076,37.54837,0,5,1,0,0,1,9,43,0,0,0.0,3,0.07225,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101076.0, 37.54837, 0.0, 5.0, 1.0, 0.0, 0.0, 1.0, 9.0, 43.0, 0.0, 0.0, 0.0, 3.0, 0.07225))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.3022628820696134E-5, 0.17523479915633497, 0.042735042735042736, 1.0, 1.0, 0.09090909090909091, 0.17842323651452283, 0.2, 0.0016022108513915616))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101000,53,0,101094,40.2791,0,6,1,0,0,1,17,42,0,0,0.0,3,0.07062,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101094.0, 40.2791, 0.0, 6.0, 1.0, 0.0, 0.0, 1.0, 17.0, 42.0, 0.0, 0.0, 0.0, 3.0, 0.07062))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.8475356699282058E-5, 0.187978865625803, 0.05128205128205129, 1.0, 1.0, 0.17171717171717174, 0.17427385892116182, 0.2, 0.0015660640875470185))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101000,53,1,101000,0.0,1,0,0,28,28,0,0,0,0,0,0.0,0,0.00229,TDMA,3.0,"Map(vectorType -> sparse, length -> 18, indices -> List(0, 1, 2, 3, 5, 8, 9, 17), values -> List(101000.0, 53.0, 1.0, 101000.0, 1.0, 28.0, 28.0, 0.00229))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 2, 5, 8, 9, 17), values -> List(8.450704225352113E-4, 1.0, 0.010309278350515464, 0.22580645161290322, 0.2828282828282829, 5.0782876812272334E-5))","Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.0, 0.0, 4276.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.0, 0.0, 1.0, 0.0))",3.0
101000,53,1,101000,0.0,1,0,0,78,78,0,0,0,0,0,0.0,0,0.00229,TDMA,3.0,"Map(vectorType -> sparse, length -> 18, indices -> List(0, 1, 2, 3, 5, 8, 9, 17), values -> List(101000.0, 53.0, 1.0, 101000.0, 1.0, 78.0, 78.0, 0.00229))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 2, 5, 8, 9, 17), values -> List(8.450704225352113E-4, 1.0, 0.010309278350515464, 0.6290322580645161, 0.787878787878788, 5.0782876812272334E-5))","Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.0, 0.0, 4276.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.0, 0.0, 1.0, 0.0))",3.0
101001,50,0,101044,75.32345,0,4,1,0,0,1,2,38,0,0,0.0,4,0.06957,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 50.0, 0.0, 101044.0, 75.32345, 0.0, 4.0, 1.0, 0.0, 0.0, 1.0, 2.0, 38.0, 0.0, 0.0, 0.0, 4.0, 0.06957))","Map(vectorType -> sparse, length -> 18, indices -> List(0, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(3.029296936532594E-7, 1.332889036987671E-5, 0.35152763309065715, 0.03418803418803419, 1.0, 1.0, 0.020202020202020204, 0.15767634854771784, 0.26666666666666666, 0.0015427793623710858))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101001,53,0,101003,69.03671,0,3,1,0,0,1,1,20,0,0,0.0,2,0.03734,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 53.0, 0.0, 101003.0, 69.03671, 0.0, 3.0, 1.0, 0.0, 0.0, 1.0, 1.0, 20.0, 0.0, 0.0, 0.0, 2.0, 0.03734))","Map(vectorType -> dense, length -> 18, values -> List(3.029296936532594E-7, 8.450704225352113E-4, 0.0, 9.08787979764321E-7, 0.3221879940797468, 0.0, 0.025641025641025644, 1.0, 0.0, 0.0, 1.0, 0.010101010101010102, 0.08298755186721991, 0.0, 0.0, 0.0, 0.13333333333333333, 8.280491791136458E-4))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0
101001,53,0,101009,56.71397,0,23,1,0,0,1,56,21,0,0,0.0,3,0.03656,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 53.0, 0.0, 101009.0, 56.71397, 0.0, 23.0, 1.0, 0.0, 0.0, 1.0, 56.0, 21.0, 0.0, 0.0, 0.0, 3.0, 0.03656))","Map(vectorType -> dense, length -> 18, values -> List(3.029296936532594E-7, 8.450704225352113E-4, 0.0, 2.726363939292963E-6, 0.2646788966420755, 0.0, 0.1965811965811966, 1.0, 0.0, 0.0, 1.0, 0.5656565656565657, 0.08713692946058091, 0.0, 0.0, 0.0, 0.2, 8.107519546972388E-4))","Map(vectorType -> dense, length -> 5, values -> List(167599.0, 0.0, 0.0, 315.0, 0.0))","Map(vectorType -> dense, length -> 5, values -> List(0.9981240396869826, 0.0, 0.0, 0.001875960313017378, 0.0))",0.0


In [0]:
# Observe predicted value with actual value
summary=test_prediction.select('Target_col','prediction')
display(summary)

Target_col,prediction
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
3.0,3.0
3.0,3.0
0.0,0.0
0.0,0.0
0.0,0.0


In [0]:
# Evaluation of the first model (DecisionTree)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Target_col", predictionCol="prediction")
accuracy = evaluator.evaluate(test_prediction)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.9823854433750878
Test Error = 0.01761455662491218


In [0]:
# Predict 2nd model


# Setup the pipeline
pipeline2=Pipeline(stages=[Indexer,Assembler,scaler,lr_model])

# Fill the pipeline model
final_pipeline2= pipeline2.fit(train_df)

# predict on test data
test_prediction2 = final_pipeline2.transform(test_df)
test_prediction2.show(10,truncate=False)

+------+-----+------+-------+-----------+------+------+-------+-------+------+------+----+-------+-------+----------------+--------------+-----------+--------------+-----------+----------+------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+----------+
| id   | Time| Is_CH| who CH| Dist_To_CH| ADV_S| ADV_R| JOIN_S| JOIN_R| SCH_S| SCH_R|Rank| DATA_S| DATA_R| Data_Sent_To_BS| dist_CH_To_BS| send_code |Expaned Energy|Attack type|Target_col|Independet_features                                                                  

In [0]:
# Display as a Pandas Dataframe
display(test_prediction2)

id,Time,Is_CH,who CH,Dist_To_CH,ADV_S,ADV_R,JOIN_S,JOIN_R,SCH_S,SCH_R,Rank,DATA_S,DATA_R,Data_Sent_To_BS,dist_CH_To_BS,send_code,Expaned Energy,Attack type,Target_col,Independet_features,Independet_col,rawPrediction,probability,prediction
101000,53,0,101030,42.04102,0,2,1,0,0,1,12,19,0,0,0.0,2,45.03302,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101030.0, 42.04102, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 12.0, 19.0, 0.0, 0.0, 0.0, 2.0, 45.03302))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 9.08787979764321E-6, 0.19620158467671067, 0.017094017094017096, 1.0, 1.0, 0.12121212121212122, 0.07883817427385892, 0.13333333333333333, 0.9986490424212211))","Map(vectorType -> dense, length -> 5, values -> List(31.070834468086435, -41.54189240233315, 16.886023216187123, -5.315464401403679, -1.0995008805367341))","Map(vectorType -> dense, length -> 5, values -> List(0.9999993087829325, 2.915366130580392E-32, 6.912170564966165E-7, 1.5762709940564386E-16, 1.0680721154374011E-14))",0.0
101000,53,0,101043,24.35267,0,7,1,0,0,1,16,65,0,0,0.0,6,0.10608,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101043.0, 24.35267, 0.0, 7.0, 1.0, 0.0, 0.0, 1.0, 16.0, 65.0, 0.0, 0.0, 0.0, 6.0, 0.10608))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 1.3025961043288602E-5, 0.11365167745951432, 0.05982905982905984, 1.0, 1.0, 0.16161616161616163, 0.2697095435684647, 0.4, 0.002352422520631375))","Map(vectorType -> dense, length -> 5, values -> List(14.202784880524044, -10.370652554057882, -5.276818591120087, 7.38635083978811, -5.941664575134188))","Map(vectorType -> dense, length -> 5, values -> List(0.9989055729054086, 2.1252800075092085E-11, 3.46449659370437E-9, 0.0010944218268686716, 1.7819733162151591E-9))",0.0
101000,53,0,101073,11.24658,0,7,1,0,0,1,1,80,0,0,0.0,5,0.12931,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101073.0, 11.24658, 0.0, 7.0, 1.0, 0.0, 0.0, 1.0, 1.0, 80.0, 0.0, 0.0, 0.0, 5.0, 0.12931))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.211384084093181E-5, 0.05248675741438719, 0.05982905982905984, 1.0, 1.0, 0.010101010101010102, 0.33195020746887965, 0.3333333333333333, 0.002867569345237963))","Map(vectorType -> dense, length -> 5, values -> List(12.921457287012243, -9.932687762654608, -4.09745599475044, 5.595747508840498, -4.487061038447694))","Map(vectorType -> dense, length -> 5, values -> List(0.999341972841817, 1.186547270658873E-10, 4.059700565023806E-8, 6.579589451827282E-4, 2.7497339862842844E-8))",0.0
101000,53,0,101076,37.54837,0,5,1,0,0,1,9,43,0,0,0.0,3,0.07225,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101076.0, 37.54837, 0.0, 5.0, 1.0, 0.0, 0.0, 1.0, 9.0, 43.0, 0.0, 0.0, 0.0, 3.0, 0.07225))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.3022628820696134E-5, 0.17523479915633497, 0.042735042735042736, 1.0, 1.0, 0.09090909090909091, 0.17842323651452283, 0.2, 0.0016022108513915616))","Map(vectorType -> dense, length -> 5, values -> List(14.478470983752723, -10.180536400266869, -5.886134724104256, 7.435697517098812, -5.847497376480409))","Map(vectorType -> dense, length -> 5, values -> List(0.9991270597307823, 1.9514161063751773E-11, 1.4301606885668303E-9, 8.729373330427916E-4, 1.4864996904420685E-9))",0.0
101000,53,0,101094,40.2791,0,6,1,0,0,1,17,42,0,0,0.0,3,0.07062,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101000.0, 53.0, 0.0, 101094.0, 40.2791, 0.0, 6.0, 1.0, 0.0, 0.0, 1.0, 17.0, 42.0, 0.0, 0.0, 0.0, 3.0, 0.07062))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(8.450704225352113E-4, 2.8475356699282058E-5, 0.187978865625803, 0.05128205128205129, 1.0, 1.0, 0.17171717171717174, 0.17427385892116182, 0.2, 0.0015660640875470185))","Map(vectorType -> dense, length -> 5, values -> List(15.075807548468278, -10.4201493122047, -6.393725344963789, 8.189081295185892, -6.451014186485682))","Map(vectorType -> dense, length -> 5, values -> List(0.9989797892688089, 8.448961409994392E-12, 4.736494499385211E-10, 0.0010202098018151544, 4.4727725046907146E-10))",0.0
101000,53,1,101000,0.0,1,0,0,28,28,0,0,0,0,0,0.0,0,0.00229,TDMA,3.0,"Map(vectorType -> sparse, length -> 18, indices -> List(0, 1, 2, 3, 5, 8, 9, 17), values -> List(101000.0, 53.0, 1.0, 101000.0, 1.0, 28.0, 28.0, 0.00229))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 2, 5, 8, 9, 17), values -> List(8.450704225352113E-4, 1.0, 0.010309278350515464, 0.22580645161290322, 0.2828282828282829, 5.0782876812272334E-5))","Map(vectorType -> dense, length -> 5, values -> List(-170.90120380885412, 40.36112630775328, -34.107260497361345, 172.98437462581327, -8.337036627351111))","Map(vectorType -> dense, length -> 5, values -> List(4.491494577650254E-150, 2.5261264336732647E-58, 1.1514514146465433E-90, 1.0, 1.7910662150073498E-79))",3.0
101000,53,1,101000,0.0,1,0,0,78,78,0,0,0,0,0,0.0,0,0.00229,TDMA,3.0,"Map(vectorType -> sparse, length -> 18, indices -> List(0, 1, 2, 3, 5, 8, 9, 17), values -> List(101000.0, 53.0, 1.0, 101000.0, 1.0, 78.0, 78.0, 0.00229))","Map(vectorType -> sparse, length -> 18, indices -> List(1, 2, 5, 8, 9, 17), values -> List(8.450704225352113E-4, 1.0, 0.010309278350515464, 0.6290322580645161, 0.787878787878788, 5.0782876812272334E-5))","Map(vectorType -> dense, length -> 5, values -> List(-480.17828721343614, 108.17429228669545, -99.26961661192576, 488.7722562167408, -17.498644678074406))","Map(vectorType -> dense, length -> 5, values -> List(0.0, 5.1098063487251053E-166, 4.136752161118699E-256, 1.0, 1.3469187331551676E-220))",3.0
101001,50,0,101044,75.32345,0,4,1,0,0,1,2,38,0,0,0.0,4,0.06957,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 50.0, 0.0, 101044.0, 75.32345, 0.0, 4.0, 1.0, 0.0, 0.0, 1.0, 2.0, 38.0, 0.0, 0.0, 0.0, 4.0, 0.06957))","Map(vectorType -> sparse, length -> 18, indices -> List(0, 3, 4, 6, 7, 10, 11, 12, 16, 17), values -> List(3.029296936532594E-7, 1.332889036987671E-5, 0.35152763309065715, 0.03418803418803419, 1.0, 1.0, 0.020202020202020204, 0.15767634854771784, 0.26666666666666666, 0.0015427793623710858))","Map(vectorType -> dense, length -> 5, values -> List(13.59147704862243, -9.41836947185725, -5.906328218508367, 6.615561343682018, -4.882340701938831))","Map(vectorType -> dense, length -> 5, values -> List(0.9990667481767334, 1.0151848578657085E-10, 3.4025558922998553E-9, 9.332388455409594E-4, 9.47365130930714E-9))",0.0
101001,53,0,101003,69.03671,0,3,1,0,0,1,1,20,0,0,0.0,2,0.03734,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 53.0, 0.0, 101003.0, 69.03671, 0.0, 3.0, 1.0, 0.0, 0.0, 1.0, 1.0, 20.0, 0.0, 0.0, 0.0, 2.0, 0.03734))","Map(vectorType -> dense, length -> 18, values -> List(3.029296936532594E-7, 8.450704225352113E-4, 0.0, 9.08787979764321E-7, 0.3221879940797468, 0.0, 0.025641025641025644, 1.0, 0.0, 0.0, 1.0, 0.010101010101010102, 0.08298755186721991, 0.0, 0.0, 0.0, 0.13333333333333333, 8.280491791136458E-4))","Map(vectorType -> dense, length -> 5, values -> List(14.241613217464648, -9.640514696219373, -6.394432191551899, 7.142277179327946, -5.348943509021323))","Map(vectorType -> dense, length -> 5, values -> List(0.9991750239663405, 4.243900572669392E-11, 1.0902370083045848E-9, 8.249717994898586E-4, 3.10149357529824E-9))",0.0
101001,53,0,101009,56.71397,0,23,1,0,0,1,56,21,0,0,0.0,3,0.03656,Normal,0.0,"Map(vectorType -> dense, length -> 18, values -> List(101001.0, 53.0, 0.0, 101009.0, 56.71397, 0.0, 23.0, 1.0, 0.0, 0.0, 1.0, 56.0, 21.0, 0.0, 0.0, 0.0, 3.0, 0.03656))","Map(vectorType -> dense, length -> 18, values -> List(3.029296936532594E-7, 8.450704225352113E-4, 0.0, 2.726363939292963E-6, 0.2646788966420755, 0.0, 0.1965811965811966, 1.0, 0.0, 0.0, 1.0, 0.5656565656565657, 0.08713692946058091, 0.0, 0.0, 0.0, 0.2, 8.107519546972388E-4))","Map(vectorType -> dense, length -> 5, values -> List(17.28148157837379, -11.136026382067007, -8.934529889955947, 10.31119802790206, -7.522123334252896))","Map(vectorType -> dense, length -> 5, values -> List(0.9990614950928084, 4.550145920568104E-13, 4.11266280035391E-12, 9.385048857379702E-4, 1.6885871354092032E-11))",0.0


In [0]:
summary2=test_prediction2.select('Target_col','prediction')
display(summary2)

Target_col,prediction
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
3.0,3.0
3.0,3.0
0.0,0.0
0.0,0.0
0.0,0.0


In [0]:
# Evaluation of the 2nd model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Target_col", predictionCol="prediction")
accuracy = evaluator.evaluate(test_prediction2)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.9729280823073438
Test Error = 0.027071917692656156


In [0]:
# result of DicicsionTree
#Accuracy = 0.9823854433750878
#Test Error = 0.01761455662491218

# result of LogisticRegression
#Accuracy = 0.9729280823073438
#Test Error = 0.027071917692656156

# DecisionTree is the best model than LogisticResgression for this dataset because DicisionTree has more accuracy and less error.
