## Initialise SparkSession and Load Data

In [1]:
import pyspark
conf = pyspark.SparkConf()

conf.set('spark.local.dir', 'path')
sc = pyspark.SparkContext(conf=conf)

## Connection to Google BigQuery

In [2]:
from pythonbq import pythonbq

myProject=pythonbq(
  bq_key_path='json_file',
  project_id='project_id'
)

In [2]:
SQL_CODE="""
SELECT Country, MarketSegment, ArrivalDateMonth, DepositType, CustomerType, LeadTime, ArrivalDateYear, ArrivalDateWeekNumber, ArrivalDateDayOfMonth, RequiredCarParkingSpaces, IsCanceled FROM `table.H2data`
"""
output=myProject.query(sql=SQL_CODE)
output

Unnamed: 0,Country,MarketSegment,ArrivalDateMonth,DepositType,CustomerType,LeadTime,ArrivalDateYear,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,RequiredCarParkingSpaces,IsCanceled
0,PRT,Offline TA/TO,July,No Deposit,Transient,6,2015,27,1,0,0
1,PRT,Online TA,July,No Deposit,Transient,88,2015,27,1,0,1
2,PRT,Online TA,July,No Deposit,Transient,65,2015,27,1,0,1
3,PRT,Online TA,July,No Deposit,Transient,92,2015,27,1,0,1
4,PRT,Online TA,July,No Deposit,Transient,100,2015,27,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...
29991,PRT,Groups,June,Non Refund,Transient,178,2017,23,9,0,1
29992,PRT,Groups,June,Non Refund,Transient,178,2017,23,9,0,1
29993,PRT,Groups,June,Non Refund,Transient,178,2017,23,9,0,1
29994,PRT,Groups,June,Non Refund,Transient,178,2017,23,9,0,1


In [5]:
# import pyspark
import pyarrow
import pandas as pd

In [7]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dataset = sqlContext.createDataFrame(output)

# If reading from CSV:
# dataset= spark.read.load("H2.csv", format="csv", header="true", inferSchema=True)

dataset
cols = dataset.columns

In [8]:
type(dataset)

pyspark.sql.dataframe.DataFrame

## Pipeline and Data Transformation

In [10]:
# https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
# https://docs.databricks.com/applications/machine-learning/mllib/binary-classification-mllib-pipelines.html

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
categoricalColumns = ["Country", "MarketSegment", "ArrivalDateMonth", "DepositType", "CustomerType"]

stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

## Convert label into label indices using the StringIndexer

In [11]:
label_stringIdx = StringIndexer(inputCol="IsCanceled", outputCol="label")
stages += [label_stringIdx]

## Transform all features into a vector using VectorAssembler

In [12]:
numericCols = ["LeadTime", "ArrivalDateYear", "ArrivalDateWeekNumber", "ArrivalDateDayOfMonth", "RequiredCarParkingSpaces"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

## GBTClassifier

In [13]:
from pyspark.ml.classification import GBTClassifier
  
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)

In [14]:
gbtClassifier = GBTClassifier()
trainedModel = gbtClassifier.fit(preppedDataDF)

In [None]:
selectedcols = ["label", "features"] + cols
dataset = preppedDataDF.select(selectedcols)

## Train-Test Split

In [17]:
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)
print(trainingData.count())
print(testData.count())

24002
5994


In [18]:
gbtModel = gbtClassifier.fit(trainingData)

In [None]:
predictions = gbtModel.transform(testData)

In [20]:
selected = predictions.select("label", "prediction", "probability")
display(selected)

DataFrame[label: double, prediction: double, probability: vector]

## Model Evaluation

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.9804772438666579

In [22]:
evaluator.getMetricName()

'areaUnderROC'

In [24]:
predictions = gbtModel.transform(testData)

In [25]:
evaluator.evaluate(predictions)

0.9804772438666579

In [26]:
selected = predictions.select("label", "prediction", "probability")
display(selected)

DataFrame[label: double, prediction: double, probability: vector]

In [27]:
type(selected)

pyspark.sql.dataframe.DataFrame

In [28]:
selected.toPandas().to_csv('h2predictions.csv')

## Predictions on Test Data

In [29]:
mydata=pd.read_csv("h2predictions.csv")
mydata

Unnamed: 0.1,Unnamed: 0,label,prediction,probability
0,0,0.0,0.0,"[0.9314425575701684,0.06855744242983164]"
1,1,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
2,2,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
3,3,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
4,4,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
...,...,...,...,...
5989,5989,1.0,1.0,"[0.04929192444983682,0.9507080755501631]"
5990,5990,1.0,1.0,"[0.05568750145988222,0.9443124985401178]"
5991,5991,1.0,1.0,"[0.04689273875807077,0.9531072612419292]"
5992,5992,1.0,1.0,"[0.11509902109403011,0.8849009789059699]"


In [30]:
import numpy as np
from numpy.random import seed
seed(1)
from sklearn.metrics import classification_report,confusion_matrix

In [31]:
label=mydata['label']
prediction=mydata['prediction']

In [32]:
print(confusion_matrix(label,prediction))
print(classification_report(label,prediction))

[[4701  157]
 [ 236  900]]
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96      4858
         1.0       0.85      0.79      0.82      1136

    accuracy                           0.93      5994
   macro avg       0.90      0.88      0.89      5994
weighted avg       0.93      0.93      0.93      5994

