In [2]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('gurpreet722').getOrCreate()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
data = spark.read.csv('cleaneddata.csv', header=True, inferSchema=True)

In [4]:
data.head

<bound method DataFrame.head of DataFrame[BloodPressure: int, Pregnancies: double, Glucose: int, BMI: double, DiabetesPedigreeFunction: double, Age: int, Outcome: int]>

In [5]:
data.show(5)

+-------------+-----------+-------+----+------------------------+---+-------+
|BloodPressure|Pregnancies|Glucose| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-------------+-----------+-------+----+------------------------+---+-------+
|           72|        6.0|    148|33.6|                   0.627| 50|      1|
|           66|        1.0|     85|26.6|                   0.351| 31|      0|
|           64|        8.0|    183|23.3|                   0.672| 32|      1|
|           66|        1.0|     89|28.1|                   0.167| 21|      0|
|           72|        0.0|    137|43.1|                  0.3725| 33|      1|
+-------------+-----------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [6]:
data.columns

['BloodPressure',
 'Pregnancies',
 'Glucose',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [7]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns: "label" and "features".

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine all features into one vector named features.
assembler = VectorAssembler(
  inputCols=['BloodPressure',
 'Pregnancies',
 'Glucose',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age'],
              outputCol="features")

In [9]:
# Let's transform the data. 
data = assembler.transform(data)

In [10]:
# Let's start off with binary classification.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Note that the label column isn't named label, it's named PrivateIndex in this case.
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'Outcome')

In [11]:
# Let's import the evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
# Select (prediction, true label) and compute test error. 
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")

In [13]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)

# 7.2

# Split 70:30

In [15]:
train_data,test_data = data.randomSplit([0.7,0.3])

Decision tree

In [16]:
dtc = DecisionTreeClassifier(labelCol='Outcome',featuresCol='features')
dtc_model = dtc.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
print("DTC")
print(my_binary_eval.evaluate(dtc_predictions))
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
accuracy = evaluator.evaluate(dtc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

DTC
0.7739967239967239
Test Error = 0.259414
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 74.06%


Random Forest

In [17]:
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features', seed=1)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.8436936936936926
Test Error = 0.238494
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 76.15%


Gradient Bossting Tree

In [18]:
gbt = GBTClassifier(labelCol='Outcome',featuresCol='features',seed=1)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='Outcome', rawPredictionCol='prediction')
print("GBT")
print(my_binary_gbt_eval.evaluate(gbt_predictions))
gbt_acc = acc_evaluator.evaluate(gbt_predictions)
accuracy = evaluator.evaluate(gbt_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

GBT
0.7310810810810812
Test Error = 0.242678
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 75.73%


# 80:20 split

In [19]:
train_data,test_data = data.randomSplit([0.8,0.2])

Decision Tree

In [20]:
dtc = DecisionTreeClassifier(labelCol='Outcome',featuresCol='features')
dtc_model = dtc.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
print("DTC")
print(my_binary_eval.evaluate(dtc_predictions))
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
accuracy = evaluator.evaluate(dtc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

DTC
0.7728851832551676
Test Error = 0.234177
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 76.58%


Random Forest

In [21]:
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features')
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.8547854785478544
Test Error = 0.21519
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 78.48%


In [22]:
gbt = GBTClassifier(labelCol='Outcome',featuresCol='features',seed=1)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='Outcome', rawPredictionCol='prediction')
print("GBT")
print(my_binary_gbt_eval.evaluate(gbt_predictions))
gbt_acc = acc_evaluator.evaluate(gbt_predictions)
accuracy = evaluator.evaluate(gbt_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

GBT
0.7365815528921313
Test Error = 0.234177
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 76.58%


# 75:25 Split

In [23]:
train_data,test_data = data.randomSplit([0.75,0.25])

Decision Tree

In [24]:
dtc = DecisionTreeClassifier(labelCol='Outcome',featuresCol='features')
dtc_model = dtc.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
print("DTC")
print(my_binary_eval.evaluate(dtc_predictions))
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
accuracy = evaluator.evaluate(dtc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

DTC
0.7399749373433584
Test Error = 0.302439
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 69.76%


Random Forest

In [25]:
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features', seed=1)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.7698412698412699
Test Error = 0.263415
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 73.66%


Gradient Boosting Tree 

In [26]:
gbt = GBTClassifier(labelCol='Outcome',featuresCol='features',seed=1)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
my_binary_gbt_eval = BinaryClassificationEvaluator(labelCol='Outcome', rawPredictionCol='prediction')
print("GBT")
print(my_binary_gbt_eval.evaluate(gbt_predictions))
gbt_acc = acc_evaluator.evaluate(gbt_predictions)
accuracy = evaluator.evaluate(gbt_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(gbt_acc*100))

GBT
0.6869778613199665
Test Error = 0.278049
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 72.20%


# 7.3

In [27]:
#Split 70:30
#Random Forest
train_data,test_data = data.randomSplit([0.70,0.30])
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features', seed=1)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.8100645161290325
Test Error = 0.231373
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 76.86%


# 8.5

# Re-Iteration 2 (On reduced data with 70:30 split)

In [28]:
data1= spark.read.csv('cleaneddatareduced.csv', header=True, inferSchema=True)

In [29]:
data1.show(5)

+-------+----+------------------------+---+-------+
|Glucose| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-------+----+------------------------+---+-------+
|    148|33.6|                   0.627| 50|      1|
|     85|26.6|                   0.351| 31|      0|
|    183|23.3|                   0.672| 32|      1|
|     89|28.1|                   0.167| 21|      0|
|    137|43.1|                  0.3725| 33|      1|
+-------+----+------------------------+---+-------+
only showing top 5 rows



In [30]:
data1.columns

['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

In [31]:
# Combine all features into one vector named features.
assembler = VectorAssembler(
  inputCols=['Glucose',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age'],
              outputCol="features")

In [32]:
# Let's transform the data. 
data1 = assembler.transform(data1)

In [33]:
#Split 70:30
#Random Forest
train_data,test_data = data1.randomSplit([0.70,0.30])
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features', seed=1)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.8110639802050188
Test Error = 0.268182
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 73.18%


# Re-Iteration 3 (On reduced data with 80:20 split)

In [34]:
#Split 80:20
#Random Forest
train_data,test_data = data1.randomSplit([0.80,0.20])
rfc = RandomForestClassifier(labelCol='Outcome',featuresCol='features', seed=1)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

RFC
0.8227638015373869
Test Error = 0.23125
Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 76.88%
