In [1]:
from pyspark import SparkContext
from math import sin, cos, sqrt, atan2, radians
from geopy.distance import geodesic, great_circle
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql.types import FloatType
from pyspark.ml.classification import MultilayerPerceptronClassifier, LogisticRegression, \
    DecisionTreeClassifier, RandomForestClassifier, NaiveBayes, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder, CrossValidator

In [2]:
# Create (or reuse) the SparkSession for this notebook; the application name is "HW2".
spark = SparkSession.builder.appName("HW2").getOrCreate()

# Checking The Data

In [3]:
# Load the leaf data set from a comma-separated file and let Spark infer
# the column types. No header option is set, so columns get Spark's
# positional default names.
csv_reader = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
)
df = csv_reader.csv('leaf/leaf.csv')

In [4]:
# Preview the first 10 rows of the freshly loaded data.
df.show(10)

+---+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|_c0|_c1|    _c2|   _c3|    _c4|    _c5|    _c6|    _c7|      _c8|      _c9|     _c10|    _c11|     _c12|     _c13|     _c14|   _c15|
+---+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|  1|  1|0.72694|1.4742|0.32396|0.98535|    1.0|0.83592|0.0046566|0.0039465|  0.04779| 0.12795| 0.016108|0.0052323|2.7477E-4| 1.1756|
|  1|  2|0.74173|1.5257|0.36116|0.98152|0.99825|0.79867|0.0052423|0.0050016|  0.02416|0.090476|0.0081195| 0.002708|7.4846E-5|0.69659|
|  1|  3|0.76722|1.5725|0.38998|0.97755|    1.0|0.80812|0.0074573| 0.010121| 0.011897|0.057445|0.0032891|9.2068E-4|3.7886E-5|0.44348|
|  1|  4|0.73797|1.4597|0.35376|0.97566|    1.0|0.81697|0.0068768|0.0086068|  0.01595|0.065491|0.0042707|0.0011544|6.6272E-5|0.58785|
|  1|  5|0.82301|1.7707|0.44462|0.97698|    1.0|0.75493| 0.007

#### According to the data set description, the `_c1` column is the specimen number, i.e. which leaf of that plant the sample is — essentially a row number within each class. Therefore, I decided to drop that column, since it is not a parameter of the leaf structure.

In [5]:
# Remove the `_c1` column, then preview the first 10 rows of what is left.
df = df.drop('_c1')
df.show(10)

+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|_c0|    _c2|   _c3|    _c4|    _c5|    _c6|    _c7|      _c8|      _c9|     _c10|    _c11|     _c12|     _c13|     _c14|   _c15|
+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|  1|0.72694|1.4742|0.32396|0.98535|    1.0|0.83592|0.0046566|0.0039465|  0.04779| 0.12795| 0.016108|0.0052323|2.7477E-4| 1.1756|
|  1|0.74173|1.5257|0.36116|0.98152|0.99825|0.79867|0.0052423|0.0050016|  0.02416|0.090476|0.0081195| 0.002708|7.4846E-5|0.69659|
|  1|0.76722|1.5725|0.38998|0.97755|    1.0|0.80812|0.0074573| 0.010121| 0.011897|0.057445|0.0032891|9.2068E-4|3.7886E-5|0.44348|
|  1|0.73797|1.4597|0.35376|0.97566|    1.0|0.81697|0.0068768|0.0086068|  0.01595|0.065491|0.0042707|0.0011544|6.6272E-5|0.58785|
|  1|0.82301|1.7707|0.44462|0.97698|    1.0|0.75493| 0.007428| 0.010042|0.0079379|0.045339

# Checking For Null Values

In [6]:
 columns = df.columns

for col in columns:
    print(col + " number of nulls")
    print(df.filter(col + ' is null').count())

_c0 number of nulls
0
_c2 number of nulls
0
_c3 number of nulls
0
_c4 number of nulls
0
_c5 number of nulls
0
_c6 number of nulls
0
_c7 number of nulls
0
_c8 number of nulls
0
_c9 number of nulls
0
_c10 number of nulls
0
_c11 number of nulls
0
_c12 number of nulls
0
_c13 number of nulls
0
_c14 number of nulls
0
_c15 number of nulls
0


#### It seems there are no null values

### Casting Columns to Decimals

In [7]:
# Cast `_c14` and `_c13` (in that order) to a fixed-point decimal type.
# NOTE(review): Decimal(10,10) leaves no integer digits, so it only suits
# values below 1 — confirm that holds for every row of both columns.
for decimal_column in ("_c14", "_c13"):
    df = df.withColumn(decimal_column, df[decimal_column].cast("Decimal(10,10)"))

In [8]:
# Display `_c13` and `_c14` to inspect how the values render after the cast.
df.select('_c13', '_c14').show()

+------------+------------+
|        _c13|        _c14|
+------------+------------+
|0.0052323000|0.0002747700|
|0.0027080000|0.0000748460|
|0.0009206800|0.0000378860|
|0.0011544000|0.0000662720|
|0.0005598600|0.0000235040|
|0.0011248000|0.0000247980|
|0.0022713000|0.0000414950|
|0.0024664000|0.0001467600|
|0.0003881200|0.0000328630|
|0.0004588900|0.0000282510|
|0.0003087200|0.0000318390|
|0.0008164800|0.0001385500|
|0.0020648000|0.0002388300|
|0.0014887000|0.0000832710|
|0.0022383000|0.0002016600|
|0.0022541000|0.0001985400|
|0.0018929000|0.0001245200|
|0.0021199000|0.0002772900|
|0.0012274000|0.0001492900|
|0.0018832000|0.0002434500|
+------------+------------+
only showing top 20 rows



In [9]:
# Print every column name together with its data type and nullability.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: decimal(10,10) (nullable = true)
 |-- _c14: decimal(10,10) (nullable = true)
 |-- _c15: double (nullable = true)



### First we need to change parameters data types to float

In [10]:
# NOTE(review): this float cast is disabled (commented out); the
# VectorAssembler cell below runs on the columns without it. Kept for reference.
#for col in columns[1:]:
#    df = df.withColumn(col, df[col].cast(FloatType()))
#df.schema

## Vector Assembler 

### Now we can combine the parameters into a single feature vector with VectorAssembler

In [11]:
# Every column except the first one is a model input; the first column
# (`_c0`) is kept out because it is the class label.
feature_columns = df.columns[1:]

# Pack the inputs into a single vector column named "features".
vecA = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = vecA.transform(df)

In [12]:
# Preview the assembled feature vectors.
df.select('features').show()

+--------------------+
|            features|
+--------------------+
|[0.72694,1.4742,0...|
|[0.74173,1.5257,0...|
|[0.76722,1.5725,0...|
|[0.73797,1.4597,0...|
|[0.82301,1.7707,0...|
|[0.72997,1.4892,0...|
|[0.82063,1.7529,0...|
|[0.77982,1.6215,0...|
|[0.83089,1.8199,0...|
|[0.90631,2.3906,0...|
|[0.7459,1.4927,0....|
|[0.79606,1.6934,0...|
|[0.93361,2.7582,0...|
|[0.91186,2.4994,0...|
|[0.89063,2.2927,0...|
|[0.86755,2.009,0....|
|[0.91852,2.5247,0...|
|[0.88795,2.2038,0...|
|[0.85121,1.9548,0...|
|[0.89084,2.2979,0...|
+--------------------+
only showing top 20 rows



## We do not need to string index our label since it is already an integer value

In [13]:
# Rename `_c0` to 'label' (the classifiers below are created without an
# explicit label column name) and preview the result.
df = df.withColumnRenamed('_c0', 'label')
df.select('label').show()

+-----+
|label|
+-----+
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    2|
|    2|
|    2|
|    2|
|    2|
|    2|
|    2|
|    2|
+-----+
only showing top 20 rows



## Since there is no categorical parameter in the data set we do not need to use dummification or one hot encoding

### Divide into train and test sets

In [14]:
# Randomly split the rows into roughly 75% training / 25% test data;
# the fixed seed keeps the split repeatable.
train_fraction, test_fraction = 0.75, 0.25
trainDF, testDF = df.randomSplit([train_fraction, test_fraction], seed=1234)

## Random Forest Classifier

In [15]:
# Random forest estimator with default parameters; its hyper-parameters are
# tuned through the parameter grids in the cells below.
rfC = RandomForestClassifier()

In [16]:
# Scores predictions by accuracy; shared by every model-selection run below.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

#### Train Validation Split

In [17]:
# Candidate hyper-parameters for the random forest:
# 2 tree counts x 3 depths x 1 seed x 3 bin counts = 18 combinations.
myParamsTVS = (
    ParamGridBuilder()
    .addGrid(rfC.numTrees, [15, 20])
    .addGrid(rfC.maxDepth, [6, 8, 10])
    .addGrid(rfC.seed, [1234])
    .addGrid(rfC.maxBins, [15, 35, 45])
    .build()
)

# Single train/validation split: 75% of trainDF fits each candidate and the
# rest scores it with `evaluator`; the seed fixes the split.
validatorTVS = TrainValidationSplit(
    estimator=rfC,
    estimatorParamMaps=myParamsTVS,
    evaluator=evaluator,
    trainRatio=0.75,
    seed=1234,
)

bestModelRF = validatorTVS.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelRF.transform(testDF)
result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# Read the winning hyper-parameters from the underlying JVM model object.
rfJavaModel = bestModelRF.bestModel._java_obj
print("Num Trees : ",rfJavaModel.getNumTrees())
print("Max Depth : ",rfJavaModel.getMaxDepth())
print("Max Bins : ",rfJavaModel.getMaxBins())
print("Impurity :  ",rfJavaModel.getImpurity())

Accuracy =  0.7285714285714285
Num Trees :  20
Max Depth :  8
Max Bins :  45
Impurity :   gini


#### Cross Validation

In [18]:
# Candidate hyper-parameters for the random forest:
# 3 tree counts x 3 depths x 1 seed = 9 combinations.
myParamsCV = (
    ParamGridBuilder()
    .addGrid(rfC.numTrees, [19, 21, 23])
    .addGrid(rfC.maxDepth, [12, 15, 17])
    .addGrid(rfC.seed, [1234])
    .build()
)

# 4-fold cross-validation over trainDF; the seed fixes the fold assignment.
validatorCV = CrossValidator(
    estimator=rfC,
    estimatorParamMaps=myParamsCV,
    evaluator=evaluator,
    numFolds=4,
    seed=1234,
)

bestModelRF = validatorCV.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelRF.transform(testDF)
result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# Read the winning hyper-parameters from the underlying JVM model object.
rfJavaModel = bestModelRF.bestModel._java_obj
print("Num Trees : ",rfJavaModel.getNumTrees())
print("Max Depth : ",rfJavaModel.getMaxDepth())
print("Max Bins : ",rfJavaModel.getMaxBins())
print("Impurity :  ",rfJavaModel.getImpurity())

Accuracy =  0.7714285714285715
Num Trees :  23
Max Depth :  12
Max Bins :  32
Impurity :   gini


## Decision Tree

In [19]:
# Decision tree estimator with default parameters; tuned in the cells below.
dt = DecisionTreeClassifier()

## Train Validation Split

In [20]:
# Candidate hyper-parameters for the decision tree:
# 3 depths x 3 bin counts = 9 combinations.
myParamsTVS = ParamGridBuilder()\
            .addGrid(dt.maxDepth,[8,12,28])\
            .addGrid(dt.maxBins,[10,15,25])\
            .build()

# Single train/validation split: 60% of trainDF fits each candidate and the
# rest scores it. The seed makes the split repeatable between runs, in line
# with the seeded random-forest validators.
validatorTVS = TrainValidationSplit( estimator=dt,
                                  estimatorParamMaps=myParamsTVS,
                                  trainRatio=0.60,
                                   evaluator = evaluator, seed=1234)

bestModelDT = validatorTVS.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelDT.transform(testDF)

result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# A single tree has no tree count; the value reported first is its depth.
print("Max Depth : ",bestModelDT.bestModel._java_obj.getMaxDepth())
print("Max Bins : ",bestModelDT.bestModel._java_obj.getMaxBins())
print("Impurity :  ",bestModelDT.bestModel._java_obj.getImpurity())

Accuracy =  0.6571428571428571
Num Trees :  12
Max Bins :  25
Impurity :   gini


## Cross Validation

In [21]:
# Candidate hyper-parameters for the decision tree:
# 3 depths x 3 bin counts = 9 combinations.
myParamsCV = ParamGridBuilder()\
            .addGrid(dt.maxDepth,[12,15,20])\
            .addGrid(dt.maxBins,[10,15,25])\
            .build()

# 10-fold cross-validation over trainDF. The seed makes the fold assignment
# repeatable between runs, in line with the seeded random-forest validators.
validatorCV = CrossValidator(estimator=dt,
                                  estimatorParamMaps=myParamsCV,
                                   evaluator = evaluator,
                                    numFolds=10,
                                    seed=1234
                                    )

bestModelDT = validatorCV.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelDT.transform(testDF)

result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# A single tree has no tree count; the value reported first is its depth.
print("Max Depth : ",bestModelDT.bestModel._java_obj.getMaxDepth())
print("Max Bins : ",bestModelDT.bestModel._java_obj.getMaxBins())
print("Impurity :  ",bestModelDT.bestModel._java_obj.getImpurity())

Accuracy =  0.5857142857142857
Num Trees :  15
Max Bins :  15
Impurity :   gini


# Logistic Regression

In [22]:
# Logistic regression estimator with default parameters; tuned in the cells below.
lr = LogisticRegression()

## Train Validation Split

In [23]:
# Candidate hyper-parameters for logistic regression:
# 3 iteration caps x 3 regularisation strengths x 3 elastic-net mixes = 27 combinations.
myParamsTVS = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 50, 100])
    .addGrid(lr.regParam, [0.0, 0.3, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 0.8])
    .build()
)

# Single train/validation split: 75% of trainDF fits each candidate and the
# rest scores it with `evaluator`; the seed fixes the split.
validatorTVS = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=myParamsTVS,
    evaluator=evaluator,
    trainRatio=0.75,
    seed=1234,
)

bestModelLR = validatorTVS.fit(trainDF)

# Score the selected model on the held-out test set.
resultLR = bestModelLR.transform(testDF)
result = evaluator.evaluate(resultLR)
print("Accuracy = ",result)

# Read the winning hyper-parameters from the underlying JVM model object.
lrJavaModel = bestModelLR.bestModel._java_obj
print("Max Iter : ",lrJavaModel.getMaxIter())
print("Reg Param : ",lrJavaModel.getRegParam())
print("Elastic Net Param : ",lrJavaModel.getElasticNetParam())

Accuracy =  0.6428571428571429
Max Iter :  50
Reg Param :  0.0
Elastic Net Param :  0.0


## Cross Validation

In [24]:
# Candidate hyper-parameters for logistic regression:
# 3 iteration caps x 3 regularisation strengths x 3 elastic-net mixes = 27 combinations.
myParamsCV = ParamGridBuilder()\
            .addGrid(lr.maxIter,[10,50,100])\
            .addGrid(lr.regParam,[0.0, 0.3, 0.5])\
            .addGrid(lr.elasticNetParam,[0.0,0.5,0.8])\
            .build()

# 4-fold cross-validation over trainDF. The seed makes the fold assignment
# repeatable between runs, in line with the other seeded validators.
validatorCV = CrossValidator(estimator=lr,
                                  estimatorParamMaps=myParamsCV,
                                   evaluator = evaluator,
                                    numFolds=4, seed=1234)

bestModelLR = validatorCV.fit(trainDF)

# Score the selected model on the held-out test set.
resultLR = bestModelLR.transform(testDF)

result = evaluator.evaluate(resultLR)
print("Accuracy = ",result)


# Read the winning hyper-parameters from the underlying JVM model object.
print("Max Iter : ",bestModelLR.bestModel._java_obj.getMaxIter())
print("Reg Param : ",bestModelLR.bestModel._java_obj.getRegParam())
print("Elastic Net Param : ",bestModelLR.bestModel._java_obj.getElasticNetParam())

Accuracy =  0.7142857142857143
Max Iter :  100
Reg Param :  0.0
Elastic Net Param :  0.0


# Naive Bayes

In [25]:
# Naive Bayes estimator with default parameters; only smoothing is tuned below.
nb = NaiveBayes()

## Train Validation Split

In [26]:
# Candidate smoothing values for naive Bayes (2 combinations).
# NOTE(review): the recorded test accuracy for this cell is about 0.014,
# i.e. essentially no correct predictions — the default NaiveBayes setup
# looks unsuited to these continuous features; confirm the model type.
myParamsTVS = ParamGridBuilder()\
            .addGrid(nb.smoothing,[0.0, 0.2])\
            .build()

# Single train/validation split: 70% of trainDF fits each candidate and the
# rest scores it. The seed makes the split repeatable between runs, in line
# with the other seeded validators.
validatorTVS = TrainValidationSplit( estimator=nb,
                                  estimatorParamMaps=myParamsTVS,
                                  trainRatio=0.70,
                                   evaluator = evaluator, seed=1234)

bestModelNB = validatorTVS.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelNB.transform(testDF)

result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# Read the winning smoothing value from the underlying JVM model object.
print("Smoothing : ",bestModelNB.bestModel._java_obj.getSmoothing())

Accuracy =  0.014285714285714285
Smooting :  0.0


## Cross Validation

In [27]:
# Candidate smoothing values for naive Bayes (2 combinations).
# NOTE(review): the recorded test accuracy for this cell is about 0.014,
# i.e. essentially no correct predictions — the default NaiveBayes setup
# looks unsuited to these continuous features; confirm the model type.
myParamsCV = ParamGridBuilder()\
            .addGrid(nb.smoothing,[0.0, 0.2])\
            .build()

# 4-fold cross-validation over trainDF. The seed makes the fold assignment
# repeatable between runs, in line with the other seeded validators.
validatorCV = CrossValidator(estimator=nb,
                                  estimatorParamMaps=myParamsCV,
                                   evaluator = evaluator,
                                    numFolds=4, seed=1234)

bestModelNB = validatorCV.fit(trainDF)

# Score the selected model on the held-out test set.
resultDF = bestModelNB.transform(testDF)

result = evaluator.evaluate(resultDF)
print("Accuracy = ",result)

# Read the winning smoothing value from the underlying JVM model object.
print("Smoothing : ",bestModelNB.bestModel._java_obj.getSmoothing())

Accuracy =  0.014285714285714285
Smooting :  0.0
