Random Forest and Linear Regression

In [129]:
import pyspark
import findspark
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, count, isnan, when

In [50]:
findspark.init()
findspark.find()

'C:\\Users\\gamsi\\Documents\\Spark'

In [51]:
spark = SparkSession.builder.appName("ML Testing").getOrCreate()

In [52]:
df = spark.read.csv("recruitment.csv", header = True)

In [53]:
type(df)

pyspark.sql.dataframe.DataFrame

In [54]:
df.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit: string (nullable = true)



In [55]:
df.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

In [56]:
float_df = df.select(*(col(c).cast("float").alias(c) for c in df.columns))

In [57]:
float_df.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



In [59]:
float_df.select([count(when(col(c).isNull(), c)).alias(c) for c in float_df.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [62]:
imputer = Imputer(inputCols = ["GRE Score", "TOEFL Score", "University Rating"], 
                  outputCols = ["GRE Score", "TOEFL Score", "University Rating"])

model = imputer.fit(float_df)

imputed = model.transform(float_df)
imputed

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR: float, CGPA: float, Research: float, Chance of Admit: float]

In [63]:
imputed.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [64]:
features = imputed_data.drop('Chance of Admit')
features

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR: float, CGPA: float, Research: float]

In [80]:
assembler = VectorAssembler(inputCols = features.columns, outputCol = "features")

In [81]:
data = assembler.transform(imputed)

In [82]:
data = data.select("features", "Chance of Admit")

In [70]:
train_df, test_df = output.randomSplit([0.7, 0.3])

In [71]:
train_df.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,1.0,2...|           0.37|
|[295.0,99.0,2.0,2...|           0.57|
|[295.0,101.0,2.0,...|           0.69|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,97.0,2.0,1...|           0.49|
|[296.0,99.0,2.0,3...|           0.47|
|[296.0,101.0,1.0,...|            0.6|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,99.0,4.0,3...|           0.54|
|[297.0,100.0,1.0,...|           0.52|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,100.0,3.0,...|           0.58|
|[298.0,101.0,4.0,...|           0.53|
+--------------------+---------------+
only showing top 20 rows



In [72]:
test_df.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[290.0,104.0,4.0,...|           0.45|
|[293.0,97.0,2.0,2...|           0.64|
|[296.0,99.0,2.0,2...|           0.61|
|[297.0,96.0,2.0,2...|           0.34|
|[298.0,92.0,1.0,2...|           0.51|
|[298.0,97.0,3.121...|           0.45|
|[298.0,98.0,2.0,1...|           0.44|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,101.0,2.0,...|           0.54|
|[299.0,100.0,1.0,...|           0.59|
|[299.0,100.0,2.0,...|           0.51|
|[299.0,100.0,3.0,...|           0.63|
|[299.0,100.0,3.0,...|           0.42|
|[300.0,99.0,1.0,3...|           0.36|
|[300.0,100.0,3.12...|           0.62|
|[300.0,104.0,3.0,...|           0.71|
|[301.0,97.0,2.0,3...|           0.44|
|[301.0,104.0,3.0,...|           0.68|
|[302.0,99.0,2.0,1...|           0.56|
|[302.0,101.0,2.0,...|           0.46|
+--------------------+---------------+
only showing top 20 rows



In [73]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol = 'Chance of Admit')
linear_model = lin_reg.fit(train_df)

In [74]:
print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))

Coefficients: [0.0014194034689669263,0.002199854326592215,0.002873833686401836,0.004274291562135011,0.019527468223737143,0.12936750799742042,0.024247480323427984]
Intercept: -1.1793019566947878


In [75]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("r2: %f" % trainSummary.r2)

RMSE: 0.060772
r2: 0.816559


In [76]:
predictions = linear_model.transform(test_df)
predictions.select('prediction', 'Chance of Admit', 'features').show()

+-------------------+---------------+--------------------+
|         prediction|Chance of Admit|            features|
+-------------------+---------------+--------------------+
|0.49505410229616653|           0.45|[290.0,104.0,4.0,...|
|   0.57568932016219|           0.64|[293.0,97.0,2.0,2...|
|   0.56270016996397|           0.61|[296.0,99.0,2.0,2...|
|0.47013578388028643|           0.34|[297.0,96.0,2.0,2...|
| 0.5159602061865953|           0.51|[298.0,92.0,1.0,2...|
| 0.4659079736295577|           0.45|[298.0,97.0,3.121...|
| 0.5147475666428234|           0.44|[298.0,98.0,2.0,1...|
| 0.5795142940303828|           0.34|[298.0,98.0,2.0,4...|
| 0.5339082353388005|           0.54|[298.0,101.0,2.0,...|
| 0.5341349414898444|           0.59|[299.0,100.0,1.0,...|
| 0.5378522779547017|           0.51|[299.0,100.0,2.0,...|
| 0.5588376071755516|           0.63|[299.0,100.0,3.0,...|
| 0.5007445212510488|           0.42|[299.0,100.0,3.0,...|
|0.42300286652898156|           0.36|[300.0,99.0,1.0,3..

In [77]:
pred_evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'Chance of Admit', metricName = 'r2')
print("R Squared (R2) on test data = ", pred_evaluator.evaluate(predictions))

R Squared (R2) on test data =  0.8254785936230882


In [78]:
featureIndexer = VectorIndexer(inputCol = 'features', outputCol = 'indexedFeatures', maxCategories = 4).fit(data)

In [79]:
featureIndexer = featureIndexer.transform(data)

In [83]:
new_indexed_data = featureIndexer.select('indexedFeatures', 'Chance of Admit')

In [84]:
training, test = new_indexed_data.randomSplit([0.7, 0.3])

In [85]:
training.show()

+--------------------+---------------+
|     indexedFeatures|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[290.0,104.0,4.0,...|           0.45|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,99.0,2.0,2...|           0.57|
|[296.0,97.0,2.0,1...|           0.49|
|[296.0,99.0,2.0,3...|           0.47|
|[296.0,101.0,1.0,...|            0.6|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,99.0,4.0,3...|           0.54|
|[297.0,100.0,1.0,...|           0.52|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,92.0,1.0,2...|           0.51|
|[298.0,98.0,2.0,1...|           0.44|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,100.0,3.0,...|           0.58|
|[298.0,101.0,2.0,...|           0.54|
+--------------------+---------------+
only showing top 20 rows



In [86]:
test.show()

+--------------------+---------------+
|     indexedFeatures|Chance of Admit|
+--------------------+---------------+
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,1.0,2...|           0.37|
|[295.0,101.0,2.0,...|           0.69|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,99.0,2.0,2...|           0.61|
|[298.0,97.0,3.121...|           0.45|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,101.0,4.0,...|           0.53|
|[298.0,107.187751...|           0.46|
|[299.0,94.0,1.0,1...|           0.42|
|[299.0,100.0,1.0,...|           0.59|
|[299.0,100.0,2.0,...|           0.51|
|[299.0,100.0,3.0,...|           0.42|
|[300.0,98.0,1.0,2...|           0.61|
|[300.0,99.0,1.0,1...|           0.58|
|[300.0,99.0,1.0,3...|           0.36|
|[301.0,97.0,2.0,3...|           0.44|
|[301.0,99.0,2.0,3...|           0.64|
|[301.0,99.0,3.0,2...|           0.68|
+--------------------+---------------+
only showing top 20 rows



In [87]:
random_forest_reg = RandomForestRegressor(featuresCol = 'indexedFeatures', labelCol = 'Chance of Admit' )

In [88]:
model = random_forest_reg.fit(training)

In [89]:
predictions = model.transform(test)

In [90]:
predictions.show()

+--------------------+---------------+-------------------+
|     indexedFeatures|Chance of Admit|         prediction|
+--------------------+---------------+-------------------+
|[295.0,93.0,1.0,2...|           0.46|0.47653483613604297|
|[295.0,96.0,2.0,1...|           0.47| 0.4636263062324808|
|[295.0,99.0,1.0,2...|           0.37|0.49465438908586706|
|[295.0,101.0,2.0,...|           0.69| 0.5327098335814625|
|[296.0,95.0,2.0,3...|           0.44| 0.5034169315172099|
|[296.0,99.0,2.0,2...|           0.61| 0.5754368963760154|
|[298.0,97.0,3.121...|           0.45|0.47095523504650466|
|[298.0,98.0,2.0,4...|           0.34| 0.5977897521816082|
|[298.0,101.0,4.0,...|           0.53| 0.5981700015488112|
|[298.0,107.187751...|           0.46| 0.5590280084876056|
|[299.0,94.0,1.0,1...|           0.42| 0.4847285612474656|
|[299.0,100.0,1.0,...|           0.59| 0.5362772259169296|
|[299.0,100.0,2.0,...|           0.51| 0.5317752413824107|
|[299.0,100.0,3.0,...|           0.42| 0.535363861726057

In [91]:
evaluator = RegressionEvaluator(labelCol = 'Chance of Admit', predictionCol = 'prediction', metricName = 'rmse')
print ('Root Mean Squared Error (RMSE) on test data = ', evaluator.evaluate(predictions))

Root Mean Squared Error (RMSE) on test data =  0.06531402920070192


In [93]:
evaluator = RegressionEvaluator(labelCol = 'Chance of Admit', predictionCol = 'prediction', metricName = 'r2')
print('R Squared (R2) on test data = ', evaluator.evaluate(predictions))

R Squared (R2) on test data =  0.8082184090112785


Classification

In [94]:
diabetes = spark.read.csv('diabetes.csv', header = True)

In [96]:
diabetes.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [95]:
diabetes.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [99]:
float_diabetes = diabetes.select(*(col(c).cast("float").alias(c) for c in diabetes.columns))

In [100]:
float_diabetes.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [101]:
float_diabetes.select([count(when(col(c).isNull(), c)).alias(c) for c in float_diabetes.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [107]:
cols = float_diabetes.columns
cols.remove('Outcome')

In [108]:
assembler = VectorAssembler(inputCols = cols, outputCol = 'features')

In [110]:
data = assembler.transform(float_diabetes)

In [119]:
data.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome',
 'features',
 'Scaled_features']

In [111]:
data.select('features', 'Outcome').show(truncate = False)

+-----------------------------------------------------------------------+-------+
|features                                                               |Outcome|
+-----------------------------------------------------------------------+-------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]   |1.0    |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.35100001096725464,31.0]   |0.0    |
|[8.0,183.0,64.0,0.0,0.0,23.299999237060547,0.671999990940094,32.0]     |1.0    |
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0]  |0.0    |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0] |1.0    |
|[5.0,116.0,74.0,0.0,0.0,25.600000381469727,0.20100000500679016,30.0]   |0.0    |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.24799999594688416,26.0]                |1.0    |
|[10.0,115.0,0.0,0.0,0.0,35.29999923706055,0.1340000033378601,29.0]     |0.0    |
|[2.0,197.0,70.0,45.0,543.0,30.5,0.15800000727176666,53.0]              |1.0    |
|[8.0,125.0,96.0

In [121]:
standardscaler = StandardScaler().setInputCol('features').setOutputCol('scaled_features')
data = standardscaler.fit(data).transform(data)

In [122]:
data.select('features', 'Outcome', 'scaled_features').show(truncate = False)

+-----------------------------------------------------------------------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                               |Outcome|scaled_features                                                                                                                                          |
+-----------------------------------------------------------------------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]   |1.0    |[1.7806383732194306,4.628960915766174,3.7198138711154307,2.1940523222807116,0.0,4.261709202425419,1.8923810993699686,4.251616970894646]                  |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.3510000109

In [123]:
assembled_data = data.select('scaled_features', 'Outcome')
assembled_data.show()

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|[1.78063837321943...|    1.0|
|[0.29677306220323...|    0.0|
|[2.37418449762590...|    1.0|
|[0.29677306220323...|    0.0|
|[0.0,4.2849165233...|    1.0|
|[1.48386531101619...|    0.0|
|[0.89031918660971...|    1.0|
|[2.96773062203238...|    0.0|
|[0.59354612440647...|    1.0|
|[2.37418449762590...|    1.0|
|[1.18709224881295...|    0.0|
|[2.96773062203238...|    1.0|
|[2.96773062203238...|    0.0|
|[0.29677306220323...|    1.0|
|[1.48386531101619...|    1.0|
|[2.07741143542266...|    1.0|
|[0.0,3.6906580274...|    1.0|
|[2.07741143542266...|    1.0|
|[0.29677306220323...|    0.0|
|[0.29677306220323...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [124]:
train, test = assembled_data.randomSplit([0.7, 0.3])

In [125]:
train.show()

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.8...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
|(8,[0,1,6,7],[2.9...|    1.0|
|(8,[1,5,6,7],[2.2...|    0.0|
|(8,[1,5,6,7],[3.0...|    0.0|
|(8,[1,5,6,7],[3.6...|    0.0|
|(8,[1,5,6,7],[3.7...|    1.0|
|(8,[1,5,6,7],[4.4...|    1.0|
|(8,[1,5,6,7],[5.2...|    1.0|
|[0.0,2.0955431172...|    0.0|
|[0.0,2.3144804578...|    0.0|
|[0.0,2.4395875096...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6898016132...|    0.0|
|[0.0,2.8461854279...|    0.0|
|[0.0,2.8461854279...|    0.0|
|[0.0,2.9087389538...|    0.0|
+--------------------+-------+
only showing top 20 rows



In [126]:
test.show()

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[1,5,6,7],[4.0...|    1.0|
|(8,[1,5,6,7],[4.3...|    1.0|
|(8,[1,5,6,7],[4.5...|    1.0|
|(8,[1,6,7],[2.940...|    0.0|
|[0.0,1.7827754878...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.9087389538...|    0.0|
|[0.0,2.9712924797...|    1.0|
|[0.0,3.1589530573...|    0.0|
|[0.0,3.1902298203...|    0.0|
|[0.0,3.2527833462...|    0.0|
|[0.0,3.2840601091...|    0.0|
|[0.0,3.2840601091...|    1.0|
|[0.0,3.3153368721...|    0.0|
|[0.0,3.3466136350...|    1.0|
|[0.0,3.5342742127...|    1.0|
|[0.0,3.5342742127...|    0.0|
|[0.0,3.5655509756...|    0.0|
|[0.0,3.6593812644...|    0.0|
|[0.0,3.6906580274...|    1.0|
+--------------------+-------+
only showing top 20 rows



Logistic Regression

In [131]:
log_reg = LogisticRegression(labelCol = 'Outcome', featuresCol = 'scaled_features', maxIter = 40)
model = log_reg.fit(train)

In [136]:
prediction_test_log_reg = model.transform(test)

In [137]:
prediction_test_log_reg.show()

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[4.0...|    1.0|[-0.5959681143009...|[0.35526666646492...|       1.0|
|(8,[1,5,6,7],[4.3...|    1.0|[-0.6755272474826...|[0.33726030458151...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-1.5241579745523...|[0.17885005236040...|       1.0|
|(8,[1,6,7],[2.940...|    0.0|[5.03779244869133...|[0.99355376827183...|       0.0|
|[0.0,1.7827754878...|    0.0|[4.83049445540611...|[0.99208064357935...|       0.0|
|[0.0,2.6272480873...|    0.0|[2.92066231178300...|[0.94885844804270...|       0.0|
|[0.0,2.9087389538...|    0.0|[2.95477108662740...|[0.95048849950547...|       0.0|
|[0.0,2.9712924797...|    1.0|[2.39497080633475...|[0.91644299753529...|       0.0|
|[0.0,3.1589530573...|    0.0|[3.41295661961563...|[0.96810701701975...|    

In [138]:
prediction_test_log_reg.select('Outcome', 'prediction').show(15)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    1.0|       1.0|
|    1.0|       1.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
+-------+----------+
only showing top 15 rows



In [140]:
prediction_labels = prediction_test_log_reg.select('Outcome', 'prediction').rdd

In [147]:
prediction_labels.collect()

[Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, pr

In [148]:
metrics = BinaryClassificationMetrics(prediction_labels)
print('Area under ROC = %s' % metrics.areaUnderROC)



Area under ROC = 0.7007786429365962


In [149]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'Outcome', predictionCol = 'prediction', metricName = 'accuracy')
accuracy_LR = evaluator.evaluate(prediction_test)
print('Accuracy = ', accuracy_LR)

Accuracy =  0.711864406779661


Gradient Boost

In [150]:
gb_clf = GBTClassifier(labelCol = 'Outcome', featuresCol = 'scaled_features')

In [151]:
model = gb_clf.fit(train)

In [152]:
prediction_test_gb = model.transform(test)

In [153]:
prediction_test_gb.show()

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[4.0...|    1.0|[0.49604578242862...|[0.72950084756614...|       0.0|
|(8,[1,5,6,7],[4.3...|    1.0|[-0.1499047895267...|[0.42560403383118...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-0.5069952591068...|[0.26619962646524...|       1.0|
|(8,[1,6,7],[2.940...|    0.0|[1.54632454612416...|[0.95658850688445...|       0.0|
|[0.0,1.7827754878...|    0.0|[-0.4657815538001...|[0.28260770149330...|       1.0|
|[0.0,2.6272480873...|    0.0|[1.02132385547847...|[0.88520260011588...|       0.0|
|[0.0,2.9087389538...|    0.0|[1.59850478571742...|[0.96072158687273...|       0.0|
|[0.0,2.9712924797...|    1.0|[1.59127532321922...|[0.96017232099541...|       0.0|
|[0.0,3.1589530573...|    0.0|[1.59154470273746...|[0.96019291880693...|    

In [154]:
prediction_test_gb.select('Outcome', 'prediction').show(15)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    1.0|       0.0|
|    1.0|       1.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    1.0|       0.0|
+-------+----------+
only showing top 15 rows



In [155]:
prediction_labels = prediction_test.select('Outcome', 'prediction').rdd

In [156]:
metrics = BinaryClassificationMetrics(prediction_labels)
print('Area under ROC = %s' % metrics.areaUnderROC)



Area under ROC = 0.7007786429365962


In [159]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'Outcome', predictionCol = 'prediction', metricName = 'accuracy')
accuracy_GBT = evaluator.evaluate(prediction_test_gb)
print('Accuracy ', accuracy_GBT)

Accuracy  0.6610169491525424


Naive Bayes

In [160]:
naive_bayes = NaiveBayes(featuresCol = 'scaled_features', labelCol = 'Outcome', smoothing = 1.0)

In [161]:
model = naive_bayes.fit(train) 

In [162]:
prediction_test_nb = model.transform(test)

In [163]:
prediction_test_nb.show()

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[4.0...|    1.0|[-21.723474825323...|[0.64723675932463...|       0.0|
|(8,[1,5,6,7],[4.3...|    1.0|[-25.780138557954...|[0.62017108295523...|       0.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-26.253452790036...|[0.63532257554766...|       0.0|
|(8,[1,6,7],[2.940...|    0.0|[-11.223506982693...|[0.62533187895436...|       0.0|
|[0.0,1.7827754878...|    0.0|[-29.017479389648...|[0.75836133512652...|       0.0|
|[0.0,2.6272480873...|    0.0|[-33.225548428520...|[0.76254303538341...|       0.0|
|[0.0,2.9087389538...|    0.0|[-30.237888547788...|[0.72163005246824...|       0.0|
|[0.0,2.9712924797...|    1.0|[-30.554813171244...|[0.77895193844511...|       0.0|
|[0.0,3.1589530573...|    0.0|[-23.119943091638...|[0.74126457253940...|    

In [164]:
prediction_test_nb.select('Outcome', 'prediction').show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    1.0|       0.0|
|    1.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 10 rows



In [165]:
prediction_labels = prediction_test_nb.select("Outcome","prediction").rdd

In [166]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'Outcome', predictionCol = 'prediction', metricName = 'accuracy')
accuracy_NB = evaluator.evaluate(prediction_test_nb)
print('Accuracy ', accuracy_NB)

Accuracy  0.6398305084745762


In [167]:
metrics = BinaryClassificationMetrics(prediction_labels)
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under ROC = 0.7060695056289771


Random Forest Classifier

In [168]:
random_forest_classifier = RandomForestClassifier(labelCol = 'Outcome', featuresCol = 'scaled_features', numTrees = 40)

In [169]:
model = random_forest_classifier.fit(train)

In [170]:
prediction_test_rf = model.transform(test)

In [171]:
prediction_test_rf.show()

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[4.0...|    1.0|[19.0681334045313...|[0.47670333511328...|       1.0|
|(8,[1,5,6,7],[4.3...|    1.0|[18.3820445652025...|[0.45955111413006...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[18.0500676152413...|[0.45125169038103...|       1.0|
|(8,[1,6,7],[2.940...|    0.0|[38.7049278787575...|[0.96762319696893...|       0.0|
|[0.0,1.7827754878...|    0.0|[37.5459262882659...|[0.93864815720664...|       0.0|
|[0.0,2.6272480873...|    0.0|[33.8587381329806...|[0.84646845332451...|       0.0|
|[0.0,2.9087389538...|    0.0|[37.6224050114483...|[0.94056012528620...|       0.0|
|[0.0,2.9712924797...|    1.0|[33.5983297111408...|[0.83995824277852...|       0.0|
|[0.0,3.1589530573...|    0.0|[38.7551486293094...|[0.96887871573273...|    

In [172]:
prediction_test_rf.select('Outcome', 'prediction').show(15)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    1.0|       1.0|
|    1.0|       1.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
+-------+----------+
only showing top 15 rows



In [173]:
prediction_labels = prediction_test_rf.select('Outcome', 'prediction').rdd

In [174]:
metrics = BinaryClassificationMetrics(prediction_labels)
print('Area under ROC = %s' % metrics.areaUnderROC)

Area under ROC = 0.696953781512605


In [176]:
evaluator = MulticlassClassificationEvaluator( labelCol = 'Outcome', predictionCol = 'prediction', metricName = 'accuracy')
accuracy_RF = evaluator.evaluate(prediction_test_rf)
print('Accuracy ', accuracy_RF)

Accuracy  0.711864406779661


In [178]:
print('Accuracy of Logistic Regression ', accuracy_LR)
print('Accuracy of Gradeinr Boost ', accuracy_GBT)
print('Accuracy of Naive Bayes ', accuracy_NB)
print('Accuracy of Random Forest ', accuracy_RF)

Accuracy of Logistic Regression  0.711864406779661
Accuracy of Gradeinr Boost  0.6610169491525424
Accuracy of Naive Bayes  0.6398305084745762
Accuracy of Random Forest  0.711864406779661
