In [20]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark_session = SparkSession.Builder().getOrCreate()
train = spark_session.read.csv('/data/covertype2/train.csv', header=True)
test = spark_session.read.csv('/data/covertype2/test.csv', header=True)

In [21]:
train1 = train.select(
    train.Elevation.astype('int'),
    train.Aspect.astype('int'),
    train.Slope.astype('int'),
    train.Horizontal_Distance_To_Hydrology.astype('int'),
    train.Vertical_Distance_To_Hydrology.astype('int'),
    train.Horizontal_Distance_To_Roadways.astype('int'),
    train.Hillshade_9am.astype('int'),
    train.Hillshade_Noon.astype('int'),
    train.Hillshade_3pm.astype('int'),
    train.Horizontal_Distance_To_Fire_Points.astype('int'),
    train.Wild_Type.astype('string'),
    train.Soil_Type.astype('string'),
    train.Target.astype('int').alias('label')
)

test1 = train.select(
    train.Elevation.astype('int'),
    train.Aspect.astype('int'),
    train.Slope.astype('int'),
    train.Horizontal_Distance_To_Hydrology.astype('int'),
    train.Vertical_Distance_To_Hydrology.astype('int'),
    train.Horizontal_Distance_To_Roadways.astype('int'),
    train.Hillshade_9am.astype('int'),
    train.Hillshade_Noon.astype('int'),
    train.Hillshade_3pm.astype('int'),
    train.Horizontal_Distance_To_Fire_Points.astype('int'),
    train.Wild_Type.astype('string'),
    train.Soil_Type.astype('string'),
    train.Target.astype('int').alias('label')
)

In [22]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer

stringIndexer = StringIndexer(inputCol='Wild_Type', outputCol='Wild_Type_Index')
st_model1 = stringIndexer.fit(train1)
indexed_train1 = st_model1.transform(train1)
indexed_test1 = st_model1.transform(test1)

stringIndexer = StringIndexer(inputCol='Soil_Type', outputCol='Soil_Type_Index')
st_model2 = stringIndexer.fit(indexed_train1)
indexed2_train1 = st_model2.transform(indexed_train1)
indexed2_test1 = st_model2.transform(indexed_test1)

encoder = OneHotEncoder(inputCol='Wild_Type_Index', outputCol='Wild_Type_vec')
encoder.setDropLast(False)
encoded_train1 = encoder.transform(indexed2_train1)
encoded_test1 = encoder.transform(indexed2_test1)

encoder2 = OneHotEncoder(inputCol='Soil_Type_Index', outputCol='Soil_Type_vec')
encoder2.setDropLast(False)
encoded2_train1 = encoder2.transform(encoded_train1)
encoded2_test1 = encoder2.transform(encoded_test1)

In [23]:

assembler = VectorAssembler().setInputCols(['Elevation',
                        'Aspect',
                        'Slope',
                        'Horizontal_Distance_To_Hydrology',
                        'Vertical_Distance_To_Hydrology',
                        'Horizontal_Distance_To_Roadways',
                        'Hillshade_9am',
                        'Hillshade_Noon',
                        'Hillshade_3pm',
                        'Horizontal_Distance_To_Fire_Points',
                         'Wild_Type_vec', 'Soil_Type_vec'
                         ]).setOutputCol("features")


In [24]:
train2 = assembler.transform(encoded2_train1)
train3 = train2.select('features','label')

test2 = assembler.transform(encoded2_test1)
test3 = test2.select('features','label')

In [25]:
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier(labelCol="label", numTrees=100, maxDepth=9)
rfModel = rfClassifier.fit(train3)
predictions = rfModel.transform(test3)
#predictions.select("label", "prediction").show()

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [27]:
print(accuracy)

0.7267147781128824


In [None]:
from pyspark.ml import Pipeline
rfClassifier = RandomForestClassifier(labelCol="label")
pipeline = Pipeline(stages=[rfClassifier])

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
param_grid = ParamGridBuilder()\
            .addGrid(rfClassifier.numTrees, [75 , 100, 150, 200])\
            .addGrid(rfClassifier.maxDepth, [4, 5, 6, 7, 8])\
            .build()
cross_val = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=10)
cvModel = cross_val.fit(train3)

In [None]:
cvModel.avgMetrics

In [None]:
#print(cvModel.bestModel.stages[0])