# GBT and Random Forest Models: 
    

In [50]:
#read in file as dataframe 
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd
import os
import pyspark.sql.types as typ
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import OneHotEncoder, StringIndexer, Bucketizer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import SparseVector, DenseVector
import functools 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [51]:
from pyspark.sql import SparkSession 
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("app") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '2') \
    .config('spark.cores.max', '2') \
    .config("spark.driver.memory",'4g') \
    .getOrCreate()

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [3]:
path_to_data = os.path.join("/home/jovyan/FlightDelay/clean_data_no_hot_2")

In [4]:
df = spark.read.format("csv") \
    .option("header", "true").option("inferschema","true").load(path_to_data)

In [5]:
a= df.count()
b = df.filter(df.CANCELLED == 1).count()
b/a

0.016178924953959485

In [6]:
#final_df.count()

In [7]:
pd_df = df.toPandas()

In [8]:
pd_df.head()

Unnamed: 0,_c0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,B_SCHEDULED_ARRIVAL,B_ARRIVAL_TIME,B_SCHEDULED_DEPARTURE,B_DEPARTURE_TIME
0,0,2015,1,1,4,US,840,SFO,CLT,20,...,2296,806,811,5,0,0,2.0,2.0,0.0,0.0
1,1,2015,1,1,4,AA,1674,LAS,MIA,35,...,2174,803,753,-10,0,0,2.0,2.0,0.0,0.0
2,2,2015,1,1,4,AS,136,ANC,SEA,135,...,1448,600,1476,4,0,1,2.0,4.0,0.0,4.0
3,3,2015,1,1,4,AA,2459,PHX,DFW,200,...,868,500,1476,4,0,1,1.0,4.0,0.0,4.0
4,4,2015,1,1,4,B6,1990,SJU,EWR,206,...,1608,512,516,4,0,0,1.0,1.0,0.0,0.0


# One Hot Encoder 

since vector type didn't transfer

In [9]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_Index")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="AIRLINE_Index", outputCol="AIRLINE_Vec")
encoded = encoder.transform(indexed)


In [10]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer2 = StringIndexer(inputCol="ORIGIN_AIRPORT", outputCol="ORIGIN_AIRPORT_Index")
model2 = stringIndexer2.fit(encoded)
indexed2 = model2.transform(encoded)

encoder2 = OneHotEncoder(inputCol="ORIGIN_AIRPORT_Index", outputCol="ORIGIN_AIRPORT_Vec")
encoded2 = encoder2.transform(indexed2)



In [11]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer3 = StringIndexer(inputCol="DESTINATION_AIRPORT", outputCol="DESTINATION_AIRPORT_Index")
model3 = stringIndexer3.fit(encoded2)
indexed3 = model3.transform(encoded2)

encoder3 = OneHotEncoder(inputCol="DESTINATION_AIRPORT_Index", outputCol="DESTINATION_AIRPORT_Vec")
encoded3 = encoder3.transform(indexed3)
encoded3.select('DESTINATION_AIRPORT','DESTINATION_AIRPORT_Index', "DESTINATION_AIRPORT_Vec").show()
#encoded3.cache()b

+-------------------+-------------------------+-----------------------+
|DESTINATION_AIRPORT|DESTINATION_AIRPORT_Index|DESTINATION_AIRPORT_Vec|
+-------------------+-------------------------+-----------------------+
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MIA|                     24.0|       (618,[24],[1.0])|
|                SEA|                     10.0|       (618,[10],[1.0])|
|                DFW|                      2.0|        (618,[2],[1.0])|
|                EWR|                     15.0|       (618,[15],[1.0])|
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MCO|                     11.0|       (618,[11],[1.0])|
|                JFK|                     18.0|       (618,[18],[1.0])|
|                DEN|                      3.0|        (618,[3],[1.0])|
|                ATL|                      0.0|        (618,[0],[1.0])|
|                LAX|                      4.0|        (618,[4],

In [12]:
new_cols_to_drop = ['AIRLINE_Index', 'AIRLINE', 'ORIGIN_AIRPORT_Index', 
                    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT_Index',
                    'DESTINATION_AIRPORT', 'FLIGHT_NUMBER']

final_encoded = encoded3.drop(*new_cols_to_drop)

In [13]:
final_encoded.columns

['_c0',
 'YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_ARRIVAL_TIME',
 'B_SCHEDULED_DEPARTURE',
 'B_DEPARTURE_TIME',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

In [14]:
#final_df = final_encoded.withColumn('CANCELLED2', final_encoded.CANCELLED)

In [15]:
final_df = final_encoded.drop('_c0', 'YEAR', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'ARRIVAL_TIME', 
                              'B_ARRIVAL_TIME', 'ELAPSED_TIME', 'B_ARRIVAL_TIME', 
                              'B_DEPARTURE_TIME', 'DEPARTURE_TIME')

In [16]:
final_df.printSchema()

root
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- DIVERTED: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- B_SCHEDULED_ARRIVAL: double (nullable = true)
 |-- B_SCHEDULED_DEPARTURE: double (nullable = true)
 |-- AIRLINE_Vec: vector (nullable = true)
 |-- ORIGIN_AIRPORT_Vec: vector (nullable = true)
 |-- DESTINATION_AIRPORT_Vec: vector (nullable = true)



In [17]:
final_df.columns

['MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'SCHEDULED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Scale Data

In [18]:
#convery final_encoded to rdd 
# we cannpot scale bucketized or vec columns, so we omit those form the scaling process
input_data = final_df.rdd.map(lambda x: (x[8], DenseVector(x[1:8])))

In [19]:
input_data.take(3)

[(0, DenseVector([1.0, 4.0, 20.0, 286.0, 2296.0, 806.0, 0.0])),
 (0, DenseVector([1.0, 4.0, 35.0, 268.0, 2174.0, 803.0, 0.0])),
 (1, DenseVector([1.0, 4.0, 135.0, 205.0, 1448.0, 600.0, 0.0]))]

In [20]:
df2 = sqlCtx.createDataFrame(input_data, ["label","features2"])

In [21]:
SS = StandardScaler(inputCol = "features2", outputCol = "features_scaled")

scaler = SS.fit(df2)

In [22]:
#transform the data in df2 with our scaler 
scaled_df = scaler.transform(df2)
#join scalable feature with columns  'AIRLINE_Vec', 'ORIGIN_AIRPORT_Vec', 
#'DESTINATION_AIRPORT_Vec', 'B_SCHEDULED_ARRIVAL', 'B_ARRIVAL_TIME', 
#'B_SCHEDULED_DEPARTURE','B_DEPARTURE_TIME'


In [23]:
scaled_df.show(4)

+-----+--------------------+--------------------+
|label|           features2|     features_scaled|
+-----+--------------------+--------------------+
|    0|[1.0,4.0,20.0,286...|[0.11379250643083...|
|    0|[1.0,4.0,35.0,268...|[0.11379250643083...|
|    1|[1.0,4.0,135.0,20...|[0.11379250643083...|
|    1|[1.0,4.0,200.0,12...|[0.11379250643083...|
+-----+--------------------+--------------------+
only showing top 4 rows



In [24]:
# since there is no common column between these two dataframes add row_index so that it can be joined
scaled_df = scaled_df.withColumn('row_index', F.monotonically_increasing_id())
final_df = final_df.withColumn('row_index', F.monotonically_increasing_id())

In [25]:
# combine scaled df and final_df
total_df = scaled_df.join(final_df, scaled_df.row_index == final_df.row_index)

In [26]:
total_df.count()

278016

In [27]:
total_df.columns

['label',
 'features2',
 'features_scaled',
 'row_index',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'SCHEDULED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'row_index']

In [28]:
#drop columns of already scaled predictors 
total_df = total_df.drop(*['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DISTANCE',
                           'SCHEDULED_ARRIVAL', 'DIVERTED', 
                           'CANCELLED2', 'row_index']) 

In [29]:
total_df.count()

278016

In [30]:
#final scaled dataframe for predicting Cancelled flights 
total_df.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Scaled: split data  

In [31]:
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs)

In [32]:
(train_data_scaled, test_data_scaled) = total_df.randomSplit([0.7, 0.3], seed = 314)

In [33]:
c = train_data_scaled.where(train_data_scaled.label == 1)

In [34]:
#Filter out large portion of non-canclelled flights in training data to make data more even 
not_c = train_data_scaled.where(train_data_scaled.label == 0).sample(False, .018, 454) 

In [35]:
not_c.count()

3422

In [36]:
c.count()

3125

In [37]:
training_data_scaled = unionAll([c, not_c])
training_data_scaled.show(4) 

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    1|[2.0,1.0,1800.0,7...|[0.22758501286166...|        1|                6.0|                  6.0|(13,[8],[1.0])|  (614,[16],[1.0])|       (618,[12],[1.0])|
|    1|[3.0,2.0,825.0,12...|[0.34137751929250...|        1|                3.0|                  2.0|(13,[0],[1.0])|   (614,[0],[1.0])|       (618,[20],[1.0])|
|    1|[8.0,4.0,1710.0,1...|[0.91034005144667...|        1|                6.0|                  5.0|(13,[0],[1.0])|  (614,[44],[1.0])|       (618,[20],[1.0])|
|    1|[8.0,7.0,1716.0,9...|[0.910340051

In [38]:
training_data_scaled.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Gradient Boosted Trees

In [116]:
vars_to_keep = [
 'features_scaled',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec'
]

In [117]:
training_data_scaled.show(2)

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    1|[2.0,1.0,1800.0,7...|[0.22758501286166...|        1|                6.0|                  6.0|(13,[8],[1.0])|  (614,[16],[1.0])|       (618,[12],[1.0])|
|    1|[3.0,2.0,825.0,12...|[0.34137751929250...|        1|                3.0|                  2.0|(13,[0],[1.0])|   (614,[0],[1.0])|       (618,[20],[1.0])|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
only showing top 2 rows



In [118]:
# class = 2 for benign (negative class, 4 for malignant (positive class)
target = 'label'
positive_label = 1
negative_label = 0

SEED = 314
ITERS = 10
FOLDS = 5

In [119]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')

In [120]:
train = (assembler.transform(training_data_scaled).select(target, "features"))

In [121]:
train

DataFrame[label: bigint, features: vector]

In [122]:
train.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
|    1|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 5 rows



In [123]:
gbt = GBTClassifier(labelCol=target, featuresCol="features", maxIter=ITERS)

In [124]:
evaluator = BinaryClassificationEvaluator(labelCol=target)

In [125]:
paramGrid = ParamGridBuilder() \
            .addGrid(gbt.maxIter, [1, 10]) \
            .addGrid(gbt.maxDepth, [1, 2]) \
            .build()

In [126]:
# k-fold cross validation
crossval = CrossValidator(
            estimator=gbt, 
            estimatorParamMaps=paramGrid, 
            evaluator=evaluator, 
            numFolds=FOLDS)

In [127]:
model = crossval.fit(train)

In [128]:
model

CrossValidatorModel_48e08d0a8ee9

In [129]:
predictions = model.transform(assembler.transform(test_data_scaled.select(vars_to_keep)).select("features"))

In [130]:
test_data_scaled.show(2)

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    0|[1.0,2.0,705.0,19...|[0.11379250643083...|        0|                4.0|                  2.0|(13,[2],[1.0])|   (614,[4],[1.0])|       (618,[14],[1.0])|
|    0|[1.0,2.0,1022.0,1...|[0.11379250643083...|        0|                4.0|                  3.0|(13,[4],[1.0])|  (614,[11],[1.0])|      (618,[126],[1.0])|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
only showing top 2 rows



Convert the dataframe to an rdd. Then select only the prediction and feature fields

In [131]:
predictions.show(5)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(1254,[0,1,2,3,4,...|[0.16321724349500...|[0.58089158054675...|       0.0|
|(1254,[0,1,2,3,4,...|[0.00828064407615...|[0.50414022740800...|       0.0|
|(1254,[0,1,2,3,4,...|[-0.0022010289368...|[0.49889948730871...|       1.0|
|(1254,[0,1,2,3,4,...|[0.64720937076465...|[0.78489416792349...|       0.0|
|(1254,[0,1,2,3,4,...|[0.12511903371143...|[0.56223509652493...|       0.0|
+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [132]:
predictions = predictions.withColumn('row_index', F.monotonically_increasing_id())
test_data_scaled = test_data_scaled.withColumn('row_index', F.monotonically_increasing_id())

In [133]:
# combine scaled df and final_df
tot_preds = predictions.join(test_data_scaled, predictions.row_index == test_data_scaled.row_index)

In [134]:
tot_preds.show(5)

+--------------------+--------------------+--------------------+----------+-----------+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+-----------+
|            features|       rawPrediction|         probability|prediction|  row_index|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|  row_index|
+--------------------+--------------------+--------------------+----------+-----------+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+-----------+
|(1254,[0,1,2,3,4,...|[0.00828064407615...|[0.50414022740800...|       0.0|         26|    0|[4.0,4.0,1018.0,9...|[0.45517002572333...|        0|                3.0|                  3.0|(13,[4],[1.0])|   (614,[7],[1.0])|     

In [None]:
feat_imp = DenseVector(model.bestModel.featureImportances)
feat_imp

# Model Evaliuation for GBT method 

Model Evaluation: For classification, this would include: i. accuracy ii. precision, recall, F1 score ii. confusion matrix iv. area under ROC curve (AUROC)

Test Error: 

In [58]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(tot_preds)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.180315


In [59]:
print('Accuracy = %g' % accuracy)

Accuracy = 0.819685


In [60]:
evaluator2 = BinaryClassificationEvaluator(
    labelCol="CANCELLED")
AUC = evaluator2.evaluate(tot_preds)
print("AUC = %g" % (AUC))  

AUC = 0.692591


Area under the curve:

In [61]:
evaluator3 = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction")
F1 = evaluator3.evaluate(tot_preds)
#print("F1 Score %f" (F1))

In [62]:
F1

0.8865472951109932

Precision = TP / (TP+FP) 

Recall = TP / (TP+FN)

In [63]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 1)].count()
tn = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 0)].count()
fp = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 1)].count()
fn = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 0)].count()

Confusion Matrix

In [64]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

True Positives: 590
True Negatives: 67957
False Positives: 14296
False Negatives: 783
Total 278016


Precision and Recall

In [65]:
# Calculate Precision and recall: 
precision = tp / (tp + fp)
recall = tp / (tp + fn) 

In [66]:
print("Precision = %g" % (precision))

Precision = 0.0396346


In [67]:
print("Recall = %g" % (recall))

Recall = 0.429716


In [68]:
model.bestModel

GBTClassificationModel (uid=GBTClassifier_f4c1589c2de9) with 10 trees

# Random Forest

Add cross validation to this model 

In [69]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="CANCELLED", outputCol="indexedLabel").fit(final_df)

In [70]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')
data = (assembler.transform(total_df).select(target, "features"))


In [71]:
scaled_df.show(3)

+-----+--------------------+--------------------+---------+
|label|           features2|     features_scaled|row_index|
+-----+--------------------+--------------------+---------+
|    0|[1.0,4.0,20.0,286...|[0.11379250643083...|        0|
|    0|[1.0,4.0,35.0,268...|[0.11379250643083...|        1|
|    1|[1.0,4.0,135.0,20...|[0.11379250643083...|        2|
+-----+--------------------+--------------------+---------+
only showing top 3 rows



In [72]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

In [73]:
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 314)

In [74]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 5 rows



In [75]:
c3 = trainingData.where(trainingData.label == 1)
not_c3 = trainingData.where(trainingData.label == 0).sample(False, 0.018, 99)

trainingData2 = unionAll([c3, not_c3])

In [76]:
testData.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
|    0|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 3 rows



In [77]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="indexedFeatures", numTrees=10)

In [78]:
pipeline = Pipeline(stages=[featureIndexer, rf])

In [79]:
# class = 2 for benign (negative class, 4 for malignant (positive class)
target = 'label'
positive_label = 1
negative_label = 0

SEED = 314
ITERS = 10
FOLDS = 5

In [99]:
paramGrid = ParamGridBuilder() \
            .addGrid(rf.maxDepth, [1, 4])\
            .addGrid(rf.numTrees, [1,4,5,7,8,10,12,13,14])\
            .build()

In [96]:
evaluator = BinaryClassificationEvaluator(labelCol=target)

In [100]:
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=FOLDS)

In [98]:
# Train model.  This also runs the indexers.
model = crossval.fit(trainingData2)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 53920)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _ha

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38129)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializ

Py4JError: An error occurred while calling o17381.fit

In [84]:
# Make predictions.
predictions = model.transform(testData)

In [85]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
|       0.0|    0|(1254,[0,1,2,3,4,...|
+----------+-----+--------------------+
only showing top 5 rows



# Model Evaluation: Random Forest

In [86]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.163332


Area Under the Curve: 

In [87]:
print("Accuracy: %f" % accuracy)

Accuracy: 0.836668


In [88]:
evaluator2 = BinaryClassificationEvaluator(
    labelCol="label")
AUC = evaluator2.evaluate(predictions)
print("AUC = %g" % (AUC))  

AUC = 0.668702


In [89]:
evaluator2_ = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")
F1 = evaluator2_.evaluate(predictions)
print("F1 Score = %g" % (F1)) 

F1 Score = 0.896482


In [90]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.label == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()

Confusion matrix: 

### Confusion matrix 


Precision and Recall: 

In [91]:
# Calculate Precision and recall: 

precision = tp / (tp + fp)
recall = tp / (tp + fn) 

In [92]:
print("Precision = %g" % (precision))

Precision = 0.0360278


In [93]:
# true positive rate is recall! 
print("Recall = %g" % (recall))

Recall = 0.343105


In [None]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

## Baseline Linear Regression

In [66]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression

In [76]:
#train_data_base = training_data_scaled.select(["ORIGIN_AIRPORT_Vec", "CANCELLED"])
#test_data_base =  test_data_scaled.select(["ORIGIN_AIRPORT_Vec", "CANCELLED"])

vars_to_keep2 = ['ORIGIN_AIRPORT_Vec']

In [73]:
target = 'CANCELLED'
SEED = 314
ITERS = 10

assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep2],
                            outputCol='features')

train = (assembler.transform(train_data_scaled).select(target, "features"))

train.show(3)

+---------+----------------+
|CANCELLED|        features|
+---------+----------------+
|        0|(614,[31],[1.0])|
|        0| (614,[1],[1.0])|
|        0| (614,[3],[1.0])|
+---------+----------------+
only showing top 3 rows



In [75]:
test_data_scaled.show(4)

+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|label|           features2|     features_scaled|CANCELLED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|
+-----+--------------------+--------------------+---------+-------------------+---------------------+--------------+------------------+-----------------------+
|    0|[1.0,2.0,705.0,19...|[0.11379250643083...|        0|                4.0|                  2.0|(13,[2],[1.0])|   (614,[4],[1.0])|       (618,[14],[1.0])|
|    0|[1.0,2.0,1022.0,1...|[0.11379250643083...|        0|                4.0|                  3.0|(13,[4],[1.0])|  (614,[11],[1.0])|      (618,[126],[1.0])|
|    0|[1.0,2.0,1315.0,7...|[0.11379250643083...|        0|                4.0|                  4.0|(13,[3],[1.0])| (614,[281],[1.0])|        (618,[3],[1.0])|
|    0|[1.0,4.0,610.0,15...|[0.113792506

In [None]:
lr = LogisticRegression(featuresCol = 'ORIGIN_AIRPORT_Vec', labelCol='CANCELLED', maxIter=10, regParam=0.3, 
                        elasticNetParam=0.8)
logModel = lr.fit(train_data_base)

In [93]:
test = assembler.transform(test_data_scaled.select("ORIGIN_AIRPORT_Vec", "CANCELLED"))

In [94]:
test.show(4)

+------------------+---------+-----------------+
|ORIGIN_AIRPORT_Vec|CANCELLED|         features|
+------------------+---------+-----------------+
|   (614,[4],[1.0])|        0|  (614,[4],[1.0])|
|  (614,[11],[1.0])|        0| (614,[11],[1.0])|
| (614,[281],[1.0])|        0|(614,[281],[1.0])|
|  (614,[26],[1.0])|        0| (614,[26],[1.0])|
+------------------+---------+-----------------+
only showing top 4 rows



In [95]:
predicted = logModel.transform(test)

In [108]:
predicted.filter(predicted.CANCELLED == 1).show(7)

+------------------+---------+----------------+--------------------+--------------------+----------+
|ORIGIN_AIRPORT_Vec|CANCELLED|        features|       rawPrediction|         probability|prediction|
+------------------+---------+----------------+--------------------+--------------------+----------+
|  (614,[16],[1.0])|        1|(614,[16],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|  (614,[15],[1.0])|        1|(614,[15],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|  (614,[11],[1.0])|        1|(614,[11],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|   (614,[5],[1.0])|        1| (614,[5],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|  (614,[22],[1.0])|        1|(614,[22],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|  (614,[54],[1.0])|        1|(614,[54],[1.0])|[0.09079089228163...|[0.52268214449366...|       0.0|
|  (614,[26],[1.0])|        1|(614,[26],[1.0])|[0.09079089228163...|[0.52268214449366...|  

In [97]:
evaluator11 = BinaryClassificationEvaluator(labelCol="CANCELLED")
AUC = evaluator11.evaluate(predicted)
print("AUC: %f" % AUC)

AUC: 0.500000


In [106]:
evaluator22 = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", metricName="accuracy")
accuracy = evaluator22.evaluate(predicted)
print("accuracy: %f" % accuracy)

accuracy: 0.983582


In [99]:
evaluator33 = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction")
f1 = evaluator33.evaluate(predicted)
print("f1: %f" % f1)

f1: 0.975440


In [100]:
#evaluator44 = MulticlassClassificationEvaluator(
    #labelCol="CANCELLED", predictionCol="weightedRecall")
evaluator44 = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="CANCELLED", predictionCol="prediction")
recall = evaluator44.evaluate(predicted)
print("recall: %f" % recall)


recall: 0.983582


In [101]:
evaluator55 = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", metricName="weightedPrecision", predictionCol = "prediction")
pre = evaluator55.evaluate(predicted)
print("Weighted Precison: %f" % pre) 

Weighted Precison: 0.967433


In [110]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 1)].count()
tn = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 0)].count()
fp = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 1)].count()
fn = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 0)].count()

In [111]:
recall = tp/ (fn + tp)

precision = tp / (tp + fp)

recall 

ZeroDivisionError: division by zero

In [112]:
tp

0

In [113]:
tn

82253

In [114]:
fp

0

In [115]:
fn

1373