# GBT and Random Forest Models: 
    

In [1]:
#read in file as dataframe 
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd
import os
import pyspark.sql.types as typ
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import OneHotEncoder, StringIndexer, Bucketizer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import SparseVector, DenseVector
import functools 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.sql import SparkSession 
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("app") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '2') \
    .config('spark.cores.max', '2') \
    .config("spark.driver.memory",'4g') \
    .getOrCreate()

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [3]:
path_to_data = os.path.join("/home/jovyan/FlightDelay/clean_data_no_hot_2.dms")

In [4]:
df = spark.read.format("csv") \
    .option("header", "true").option("inferschema","true").load(path_to_data)

In [5]:
#final_df.count()

In [6]:
pd_df = df.toPandas()

In [7]:
pd_df.head()

Unnamed: 0,_c0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,B_SCHEDULED_ARRIVAL,B_ARRIVAL_TIME,B_SCHEDULED_DEPARTURE,B_DEPARTURE_TIME
0,0,2015,1,1,4,US,840,SFO,CLT,20,...,2296,806,811,5,0,0,2.0,2.0,0.0,0.0
1,1,2015,1,1,4,AA,1674,LAS,MIA,35,...,2174,803,753,-10,0,0,2.0,2.0,0.0,0.0
2,2,2015,1,1,4,AS,136,ANC,SEA,135,...,1448,600,1476,4,0,1,2.0,4.0,0.0,4.0
3,3,2015,1,1,4,AA,2459,PHX,DFW,200,...,868,500,1476,4,0,1,1.0,4.0,0.0,4.0
4,4,2015,1,1,4,B6,1990,SJU,EWR,206,...,1608,512,516,4,0,0,1.0,1.0,0.0,0.0


# One Hot Encoder 

since vector type didn't transfer

In [8]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_Index")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="AIRLINE_Index", outputCol="AIRLINE_Vec")
encoded = encoder.transform(indexed)


In [9]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer2 = StringIndexer(inputCol="ORIGIN_AIRPORT", outputCol="ORIGIN_AIRPORT_Index")
model2 = stringIndexer2.fit(encoded)
indexed2 = model2.transform(encoded)

encoder2 = OneHotEncoder(inputCol="ORIGIN_AIRPORT_Index", outputCol="ORIGIN_AIRPORT_Vec")
encoded2 = encoder2.transform(indexed2)



In [10]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer3 = StringIndexer(inputCol="DESTINATION_AIRPORT", outputCol="DESTINATION_AIRPORT_Index")
model3 = stringIndexer3.fit(encoded2)
indexed3 = model3.transform(encoded2)

encoder3 = OneHotEncoder(inputCol="DESTINATION_AIRPORT_Index", outputCol="DESTINATION_AIRPORT_Vec")
encoded3 = encoder3.transform(indexed3)
encoded3.select('DESTINATION_AIRPORT','DESTINATION_AIRPORT_Index', "DESTINATION_AIRPORT_Vec").show()
#encoded3.cache()b

+-------------------+-------------------------+-----------------------+
|DESTINATION_AIRPORT|DESTINATION_AIRPORT_Index|DESTINATION_AIRPORT_Vec|
+-------------------+-------------------------+-----------------------+
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MIA|                     24.0|       (618,[24],[1.0])|
|                SEA|                     10.0|       (618,[10],[1.0])|
|                DFW|                      2.0|        (618,[2],[1.0])|
|                EWR|                     15.0|       (618,[15],[1.0])|
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MCO|                     11.0|       (618,[11],[1.0])|
|                JFK|                     18.0|       (618,[18],[1.0])|
|                DEN|                      3.0|        (618,[3],[1.0])|
|                ATL|                      0.0|        (618,[0],[1.0])|
|                LAX|                      4.0|        (618,[4],

In [11]:
new_cols_to_drop = ['AIRLINE_Index', 'AIRLINE', 'ORIGIN_AIRPORT_Index', 
                    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT_Index',
                    'DESTINATION_AIRPORT', 'FLIGHT_NUMBER']

final_encoded = encoded3.drop(*new_cols_to_drop)

In [12]:
final_encoded.columns

['_c0',
 'YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_ARRIVAL_TIME',
 'B_SCHEDULED_DEPARTURE',
 'B_DEPARTURE_TIME',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

In [13]:
#final_df = final_encoded.withColumn('CANCELLED2', final_encoded.CANCELLED)

In [14]:
final_df = final_encoded.drop('_c0', 'YEAR', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'ARRIVAL_TIME', 
                              'B_ARRIVAL_TIME', 'ELAPSED_TIME', 'B_ARRIVAL_TIME', 
                              'B_DEPARTURE_TIME', 'DEPARTURE_TIME')

In [15]:
final_df.printSchema()

root
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- DIVERTED: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- B_SCHEDULED_ARRIVAL: double (nullable = true)
 |-- B_SCHEDULED_DEPARTURE: double (nullable = true)
 |-- AIRLINE_Vec: vector (nullable = true)
 |-- ORIGIN_AIRPORT_Vec: vector (nullable = true)
 |-- DESTINATION_AIRPORT_Vec: vector (nullable = true)



# Scale Data

In [16]:
#convery final_encoded to rdd 
# we cannpot scale bucketized or vec columns, so we omit those form the scaling process
input_data = final_df.rdd.map(lambda x: (x[8], DenseVector(x[1:8])))

In [17]:
input_data.take(3)

[(0, DenseVector([1.0, 4.0, 20.0, 286.0, 2296.0, 806.0, 0.0])),
 (0, DenseVector([1.0, 4.0, 35.0, 268.0, 2174.0, 803.0, 0.0])),
 (1, DenseVector([1.0, 4.0, 135.0, 205.0, 1448.0, 600.0, 0.0]))]

In [18]:
df2 = sqlCtx.createDataFrame(input_data, ["label","features2"])

In [19]:
SS = StandardScaler(inputCol = "features2", outputCol = "features_scaled")

scaler = SS.fit(df2)

In [20]:
#transform the data in df2 with our scaler 
scaled_df = scaler.transform(df2)
#join scalable feature with columns  'AIRLINE_Vec', 'ORIGIN_AIRPORT_Vec', 
#'DESTINATION_AIRPORT_Vec', 'B_SCHEDULED_ARRIVAL', 'B_ARRIVAL_TIME', 
#'B_SCHEDULED_DEPARTURE','B_DEPARTURE_TIME'


In [21]:
scaled_df.show(4)

+-----+--------------------+--------------------+
|label|           features2|     features_scaled|
+-----+--------------------+--------------------+
|    0|[1.0,4.0,20.0,286...|[0.11379250643083...|
|    0|[1.0,4.0,35.0,268...|[0.11379250643083...|
|    1|[1.0,4.0,135.0,20...|[0.11379250643083...|
|    1|[1.0,4.0,200.0,12...|[0.11379250643083...|
+-----+--------------------+--------------------+
only showing top 4 rows



In [22]:
# since there is no common column between these two dataframes add row_index so that it can be joined
scaled_df = scaled_df.withColumn('row_index', F.monotonically_increasing_id())
final_df = final_df.withColumn('row_index', F.monotonically_increasing_id())

In [23]:
# combine scaled df and final_df
total_df = scaled_df.join(final_df, scaled_df.row_index == final_df.row_index)

In [24]:
total_df.count()

278016

In [25]:
total_df.columns

['label',
 'features2',
 'features_scaled',
 'row_index',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'SCHEDULED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'row_index']

In [26]:
#drop columns of already scaled predictors 
total_df = total_df.drop(*['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DISTANCE',
                           'SCHEDULED_ARRIVAL', 'DIVERTED', 
                           'CANCELLED2', 'row_index']) 

In [27]:
total_df.count()

278016

In [28]:
#final scaled dataframe for predicting Cancelled flights 
total_df.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Scaled: split data  

In [29]:
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs)

In [30]:
(train_data_scaled, test_data_scaled) = total_df.randomSplit([0.8, 0.2], seed = 314)

In [31]:
c = train_data_scaled.where(total_df.label == 1)

In [32]:
## How to filter out large portion of non-canclelled flights in training data to make data more even 
not_c = train_data_scaled.where(total_df.label == 0).sample(False, .018, 454) 

In [33]:
not_c.count()

KeyboardInterrupt: 

In [None]:
c.count()

In [34]:
training_data_scaled = unionAll([c, not_c])
training_data_scaled.show(4) 

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    1|[2.0,1.0,1800.0,7...|[0.22758501286166...|        1|                6.0|                  6.0|(13,[8],[1.0])|  (614,[16],[1.0])|       (618,[12],[1.0])|
|    1|[3.0,2.0,825.0,12...|[0.34137751929250...|        1|                3.0|                  2.0|(13,[0],[1.0])|   (614,[0],[1.0])|       (618,[20],[1.0])|
|    1|[6.0,2.0,900.0,83...|[0.68275503858500...|        1|                3.0|                  3.0|(13,[8],[1.0])|  (614,[16],[1.0])|       (618,[21],[1.0])|
|    1|[8.0,4.0,1710.0,1...|[0.910340051

In [35]:
training_data_scaled.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Gradient Boosted Trees

In [36]:
vars_to_keep = [
 'features_scaled',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec'
]

In [37]:
#training_data_scaled_no_hot.show(2)

In [38]:
# class = 2 for benign (negative class, 4 for malignant (positive class)
target = 'label'
positive_label = 1
negative_label = 0

SEED = 314
ITERS = 10
FOLDS = 5

In [39]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')

In [40]:
train = (assembler.transform(training_data_scaled).select(target, "features"))

In [41]:
train

DataFrame[label: bigint, features: vector]

In [42]:
train.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 5 rows



In [43]:
gbt = GBTClassifier(labelCol=target, featuresCol="features", maxIter=ITERS)

In [44]:
evaluator = BinaryClassificationEvaluator(labelCol=target)

In [45]:
paramGrid = ParamGridBuilder() \
            .addGrid(gbt.maxIter, [1, 10]) \
            .addGrid(gbt.maxDepth, [1, 2]) \
            .build()

In [46]:
# k-fold cross validation
crossval = CrossValidator(
            estimator=gbt, 
            estimatorParamMaps=paramGrid, 
            evaluator=evaluator, 
            numFolds=FOLDS)

In [48]:
# part that returns error
model = crossval.fit(train)

In [49]:
model

CrossValidatorModel_d5b8ef056e93

In [50]:
predictions = model.transform(assembler.transform(test_data_scaled.select(vars_to_keep)).select("features"))

In [51]:
test_data_scaled.show(2)

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    0|[1.0,2.0,705.0,19...|[0.11379250643083...|        0|                4.0|                  2.0|(13,[2],[1.0])|   (614,[4],[1.0])|       (618,[14],[1.0])|
|    0|[1.0,2.0,1022.0,1...|[0.11379250643083...|        0|                4.0|                  3.0|(13,[4],[1.0])|  (614,[11],[1.0])|      (618,[126],[1.0])|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
only showing top 2 rows



Convert the dataframe to an rdd. Then select only the prediction and feature fields

In [52]:
predictions.show(5)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(1254,[0,1,2,3,4,...|[0.12936935904792...|[0.56432621553613...|       0.0|
|(1254,[0,1,2,3,4,...|[-0.0260722828258...|[0.48696681161706...|       1.0|
|(1254,[0,1,2,3,4,...|[0.60117963212024...|[0.76894421757660...|       0.0|
|(1254,[0,1,2,3,4,...|[0.06796111079743...|[0.53392833639751...|       0.0|
|(1254,[0,1,2,3,4,...|[-0.0701705668954...|[0.46497218893408...|       1.0|
+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [53]:
predictions = predictions.withColumn('row_index', F.monotonically_increasing_id())
test_data_scaled = test_data_scaled.withColumn('row_index', F.monotonically_increasing_id())

In [54]:
# combine scaled df and final_df
tot_preds = predictions.join(test_data_scaled, predictions.row_index == test_data_scaled.row_index)

In [56]:
tot_preds.show(5)

+--------------------+--------------------+--------------------+----------+-----------+-----+--------------------+--------------------+---------+-------------------+---------------------+---------------+------------------+-----------------------+-----------+
|            features|       rawPrediction|         probability|prediction|  row_index|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|    AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|  row_index|
+--------------------+--------------------+--------------------+----------+-----------+-----+--------------------+--------------------+---------+-------------------+---------------------+---------------+------------------+-----------------------+-----------+
|(1254,[0,1,2,3,4,...|[-0.1138986851704...|[0.44329565241512...|       1.0|         26|    0|[5.0,1.0,1907.0,6...|[0.56896253215417...|        0|                6.0|                  6.0| (13,[4],[1.0])| (614,[390],[1.0])| 

# Model Evaliuation for GBT method 

Model Evaluation: For classification, this would include: i. accuracy ii. precision, recall, F1 score ii. confusion matrix iv. area under ROC curve (AUROC)

Test Error: 

In [57]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(tot_preds)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.190571


Area under the curve:

In [58]:
print("evaluation (area under ROC): %f" % accuracy)

evaluation (area under ROC): 0.809429


Precision = TP / (TP+FP) 

Recall = TP / (TP+FN)

In [59]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 1)].count()
tn = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 0)].count()
fp = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 1)].count()
fn = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 0)].count()

Confusion Matrix

In [60]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

True Positives: 432
True Negatives: 44467
False Positives: 10082
False Negatives: 489
Total 278016


Precision and Recall

In [61]:
# Calculate Precision and recall: 
precision = tp / (tp + fp)
recall = tp / (tp + fn) 

In [62]:
print("Precision = %g" % (precision))

Precision = 0.0410881


In [63]:
print("Recall = %g" % (recall))

Recall = 0.469055


In [64]:
model.bestModel

GBTClassificationModel (uid=GBTClassifier_b5b67696efe6) with 10 trees

# Random Forest

Add cross validation to this model 

In [65]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="CANCELLED", outputCol="indexedLabel").fit(final_df)

In [66]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')
data = (assembler.transform(total_df).select(target, "features"))


In [67]:
scaled_df.show(3)

+-----+--------------------+--------------------+---------+
|label|           features2|     features_scaled|row_index|
+-----+--------------------+--------------------+---------+
|    0|[1.0,4.0,20.0,286...|[0.11379250643083...|        0|
|    0|[1.0,4.0,35.0,268...|[0.11379250643083...|        1|
|    1|[1.0,4.0,135.0,20...|[0.11379250643083...|        2|
+-----+--------------------+--------------------+---------+
only showing top 3 rows



In [68]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

In [69]:
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 314)

In [70]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 5 rows



In [71]:
c3 = trainingData.where(data.label == 1)
not_c3 = trainingData.where(data.label == 0).sample(False, 0.018, 99)

trainingData2 = unionAll([c3, not_c3])

In [72]:
testData.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 3 rows



In [73]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="indexedFeatures", numTrees=10)

In [74]:
pipeline = Pipeline(stages=[featureIndexer, rf])

In [75]:
# class = 2 for benign (negative class, 4 for malignant (positive class)
target = 'label'
positive_label = 1
negative_label = 0

SEED = 314
ITERS = 10
FOLDS = 5

In [76]:
evaluator = BinaryClassificationEvaluator(labelCol=target)


In [80]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=FOLDS)

In [81]:
# Train model.  This also runs the indexers.
model = crossval.fit(trainingData2)

In [82]:
# Make predictions.
predictions = model.transform(testData)

In [83]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
+----------+-----+--------------------+
only showing top 5 rows



# Model Evaluation: Random Forest

In [84]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.175717


Area Under the Curve: 

In [85]:
print("evaluation (area under ROC): %f" % accuracy)

evaluation (area under ROC): 0.824283


In [86]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.label == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()

Confusion matrix: 

In [87]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

True Positives: 371
True Negatives: 45352
False Positives: 9197
False Negatives: 550
Total 278016


Precision and Recall: 

In [88]:
# Calculate Precision and recall: 
precision = tp / (tp + fp)
recall = tp / (tp + fn) 

In [89]:
print("Precision = %g" % (precision))

Precision = 0.0387751


In [90]:
print("Recall = %g" % (recall))

Recall = 0.402823
