In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Read dataset

In [15]:
spark = SparkSession.builder.appName('dt').getOrCreate()
spark
df = spark.read.csv('Datasets/covtype.data',inferSchema=True,header=False)
#df.show(5)
colnames = ["Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology",\
           "Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_noon","Hillshade_3pm",\
           "Horizontal_Distance_To_Fire_Points"] + \
           [f"Wilderness_Area_{i}" for i in range(4)] + [f"Soil_Type_{i}" for i in range(40)] + ["Cover_Type"]
df = df.toDF(*colnames)
df.show(5)
df.printSchema()

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wilderness

Display stats

In [16]:
df.describe().show()

+-------+-----------------+------------------+------------------+--------------------------------+------------------------------+-------------------------------+------------------+------------------+------------------+----------------------------------+------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------

Null Values and Imputation

In [26]:
colnames  
null_cond = col(colnames[0]).isNull()
for col_name in colnames[1:]:
    null_cond = null_cond | col(col_name).isNull()
filtered_df = df.filter(null_cond)
filtered_df.show()
num_features = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points'
]
cat_features = [
    'Wilderness_Area_0','Wilderness_Area_1','Wilderness_Area_2','Wilderness_Area_3','Soil_Type_0', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6',
    'Soil_Type_7', 'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12', 'Soil_Type_13',
    'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20',
    'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27',
    'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34',
    'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39', 'Cover_Type'
]

num_imputer = Imputer(inputCols=num_features,outputCols=[f'{col_name}_imputed' for col_name in num_features])
num_imputed = num_imputer.fit(df)
df_num_imp = num_imputed.transform(df)
cat_imputer = Imputer(inputCols=cat_features,outputCols=[f"{col_name}_imputed" for col_name in cat_features])
cat_imputed = cat_imputer.fit(df_num_imp)
df_imp = cat_imputed.transform(df_num_imp)
print(f"imputed categorical and numerical features")
df_imp.show(5)


+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wilderness

Imputation

In [31]:
train,test = df.randomSplit([0.9,0.1])
#assembler = VectorAssembler(inputCols=df.col)
col_names = df.columns
col_names = col_names[:-1]
assembler = VectorAssembler(inputCols=col_names,outputCol='features') 
train = assembler.transform(df)
test = assembler.transform(df)
train.show(3)
test.show(3)



+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+--------------------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_noon|Hillshade_3pm|Horizontal_Distance_To_F

In [34]:
dt_model = DecisionTreeClassifier(labelCol='Cover_Type',featuresCol='features')
dt = dt_model.fit(train)
print(dt.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8323d46e5d38, depth=5, numNodes=45, numClasses=8, numFeatures=54
  If (feature 0 <= 3038.5)
   If (feature 0 <= 2560.5)
    If (feature 10 <= 0.5)
     If (feature 0 <= 2446.5)
      If (feature 3 <= 15.0)
       Predict: 4.0
      Else (feature 3 > 15.0)
       Predict: 3.0
     Else (feature 0 > 2446.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     If (feature 9 <= 5445.0)
      Predict: 2.0
     Else (feature 9 > 5445.0)
      If (feature 5 <= 569.5)
       Predict: 2.0
      Else (feature 5 > 569.5)
       Predict: 5.0
   Else (feature 0 > 2560.5)
    If (feature 0 <= 2942.5)
     If (feature 15 <= 0.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
     Else (feature 15 > 0.5)
      Predict: 3.0
    Else (feature 0 > 2942.5)
     If (feature 3 <= 157.5)
      If (feature 36 <= 0.5)
    

In [35]:
preds = dt.transform(test)
preds.show(5) 

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+--------------------+--------------------+--------------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|

In [41]:
evaluator_acc = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Cover_Type',metricName='accuracy')
accuracy = evaluator_acc.evaluate(preds)
print(f"Accuracy: {accuracy}")
evaluator_prec = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Cover_Type',metricName='weightedPrecision') 
precision = evaluator_prec.evaluate(preds)
print(f"Precision: {precision}")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(preds)
print(f"Recall: {recall}")

Accuracy: 0.699326347820699
Precision: 0.6785237607087015
Recall: 0.699326347820699
