## Initialise SparkSession and Load Data

In [1]:
import pyspark
conf = pyspark.SparkConf()

conf.set('spark.local.dir', 'path')
sc = pyspark.SparkContext(conf=conf)

## Connection to Google BigQuery

In [2]:
from pythonbq import pythonbq

myProject=pythonbq(
  bq_key_path='json_file',
  project_id='project_id'
)

In [3]:
SQL_CODE="""
SELECT * FROM `mghotels.H2data`
"""
output=myProject.query(sql=SQL_CODE)

Downloading: 100%|██████████| 29996/29996 [00:07<00:00, 3966.95rows/s]


In [4]:
output

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,3,2015,July,27,2,0,3,1,0,...,No Deposit,1,,0,Transient-Party,58.67,0,0,Check-Out,2015-07-05
1,0,43,2015,July,27,3,0,2,2,0,...,No Deposit,1,,0,Transient-Party,86.00,0,0,Check-Out,2015-07-05
2,0,43,2015,July,27,3,0,2,2,0,...,No Deposit,1,,0,Transient-Party,43.00,0,0,Check-Out,2015-07-05
3,0,43,2015,July,27,3,0,2,2,0,...,No Deposit,1,,0,Transient-Party,86.00,0,0,Check-Out,2015-07-05
4,0,4,2015,July,27,3,0,2,1,0,...,No Deposit,1,,0,Transient-Party,63.00,0,0,Check-Out,2015-07-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29991,1,0,2016,December,50,6,0,0,0,0,...,No Deposit,,279,0,Transient,0.00,0,0,Canceled,2016-12-06
29992,1,0,2017,April,15,10,0,0,0,0,...,No Deposit,,279,0,Transient,0.00,0,0,Canceled,2017-04-10
29993,1,0,2017,April,15,10,0,0,0,0,...,No Deposit,,279,0,Transient,0.00,0,0,Canceled,2017-04-10
29994,1,0,2017,April,15,10,0,0,0,0,...,No Deposit,,279,0,Transient,0.00,0,0,Canceled,2017-04-10


In [5]:
# import pyspark
import pyarrow
import pandas as pd

In [6]:
type(output)

pandas.core.frame.DataFrame

In [7]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dataset = sqlContext.createDataFrame(output)

# If reading from CSV:
# dataset= spark.read.load("H2full.csv", format="csv", header="true", inferSchema=True)

dataset
cols = dataset.columns

In [8]:
type(dataset)

pyspark.sql.dataframe.DataFrame

## Print Schema

In [9]:
dataset.printSchema()

root
 |-- IsCanceled: long (nullable = true)
 |-- LeadTime: long (nullable = true)
 |-- ArrivalDateYear: long (nullable = true)
 |-- ArrivalDateMonth: string (nullable = true)
 |-- ArrivalDateWeekNumber: long (nullable = true)
 |-- ArrivalDateDayOfMonth: long (nullable = true)
 |-- StaysInWeekendNights: long (nullable = true)
 |-- StaysInWeekNights: long (nullable = true)
 |-- Adults: long (nullable = true)
 |-- Children: long (nullable = true)
 |-- Babies: long (nullable = true)
 |-- Meal: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- MarketSegment: string (nullable = true)
 |-- DistributionChannel: string (nullable = true)
 |-- IsRepeatedGuest: long (nullable = true)
 |-- PreviousCancellations: long (nullable = true)
 |-- PreviousBookingsNotCanceled: long (nullable = true)
 |-- ReservedRoomType: string (nullable = true)
 |-- AssignedRoomType: string (nullable = true)
 |-- BookingChanges: long (nullable = true)
 |-- DepositType: string (nullable = true)
 |-- Age

## Pipeline and Data Transformation

In [10]:
# https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
# https://docs.databricks.com/applications/machine-learning/mllib/binary-classification-mllib-pipelines.html

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
categoricalColumns = ["Country", "MarketSegment", "ArrivalDateMonth", "DepositType", "CustomerType"]

stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

## Convert label into label indices using the StringIndexer

In [11]:
label_stringIdx = StringIndexer(inputCol="IsCanceled", outputCol="label")
stages += [label_stringIdx]

## Transform all features into a vector using VectorAssembler

In [12]:
numericCols = ["LeadTime", "ArrivalDateYear", "ArrivalDateWeekNumber", "ArrivalDateDayOfMonth", "RequiredCarParkingSpaces"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

## GBTClassifier

In [13]:
from pyspark.ml.classification import GBTClassifier
  
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)

In [14]:
gbtClassifier = GBTClassifier()
trainedModel = gbtClassifier.fit(preppedDataDF)

In [15]:
display(gbtClassifier, preppedDataDF)

GBTClassifier_2fd3b609b6b8

DataFrame[IsCanceled: bigint, LeadTime: bigint, ArrivalDateYear: bigint, ArrivalDateMonth: string, ArrivalDateWeekNumber: bigint, ArrivalDateDayOfMonth: bigint, StaysInWeekendNights: bigint, StaysInWeekNights: bigint, Adults: bigint, Children: bigint, Babies: bigint, Meal: string, Country: string, MarketSegment: string, DistributionChannel: string, IsRepeatedGuest: bigint, PreviousCancellations: bigint, PreviousBookingsNotCanceled: bigint, ReservedRoomType: string, AssignedRoomType: string, BookingChanges: bigint, DepositType: string, Agent: string, Company: string, DaysInWaitingList: bigint, CustomerType: string, ADR: double, RequiredCarParkingSpaces: bigint, TotalOfSpecialRequests: bigint, ReservationStatus: string, ReservationStatusDate: timestamp, CountryIndex: double, CountryclassVec: vector, MarketSegmentIndex: double, MarketSegmentclassVec: vector, ArrivalDateMonthIndex: double, ArrivalDateMonthclassVec: vector, DepositTypeIndex: double, DepositTypeclassVec: vector, CustomerType

In [16]:
selectedcols = ["label", "features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)

DataFrame[label: double, features: vector, IsCanceled: bigint, LeadTime: bigint, ArrivalDateYear: bigint, ArrivalDateMonth: string, ArrivalDateWeekNumber: bigint, ArrivalDateDayOfMonth: bigint, StaysInWeekendNights: bigint, StaysInWeekNights: bigint, Adults: bigint, Children: bigint, Babies: bigint, Meal: string, Country: string, MarketSegment: string, DistributionChannel: string, IsRepeatedGuest: bigint, PreviousCancellations: bigint, PreviousBookingsNotCanceled: bigint, ReservedRoomType: string, AssignedRoomType: string, BookingChanges: bigint, DepositType: string, Agent: string, Company: string, DaysInWaitingList: bigint, CustomerType: string, ADR: double, RequiredCarParkingSpaces: bigint, TotalOfSpecialRequests: bigint, ReservationStatus: string, ReservationStatusDate: timestamp]

## Train-Test Split

In [17]:
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)
print(trainingData.count())
print(testData.count())

24002
5994


In [18]:
gbtModel = gbtClassifier.fit(trainingData)

In [19]:
predictions = gbtModel.transform(testData)
predictions

DataFrame[label: double, features: vector, IsCanceled: bigint, LeadTime: bigint, ArrivalDateYear: bigint, ArrivalDateMonth: string, ArrivalDateWeekNumber: bigint, ArrivalDateDayOfMonth: bigint, StaysInWeekendNights: bigint, StaysInWeekNights: bigint, Adults: bigint, Children: bigint, Babies: bigint, Meal: string, Country: string, MarketSegment: string, DistributionChannel: string, IsRepeatedGuest: bigint, PreviousCancellations: bigint, PreviousBookingsNotCanceled: bigint, ReservedRoomType: string, AssignedRoomType: string, BookingChanges: bigint, DepositType: string, Agent: string, Company: string, DaysInWaitingList: bigint, CustomerType: string, ADR: double, RequiredCarParkingSpaces: bigint, TotalOfSpecialRequests: bigint, ReservationStatus: string, ReservationStatusDate: timestamp, rawPrediction: vector, probability: vector, prediction: double]

In [20]:
selected = predictions.select("label", "prediction", "probability")
display(selected)

DataFrame[label: double, prediction: double, probability: vector]

## Model Evaluation

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.9804772438666579

In [22]:
evaluator.getMetricName()

'areaUnderROC'

In [23]:
print(gbtModel.explainParams())

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]. (default: all)
featuresCol: features column name (default: features)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (undefined)
labelCol: label column name (default: label)
lossType: Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic (default: logistic)
ma

In [24]:
predictions = gbtModel.transform(testData)

In [25]:
evaluator.evaluate(predictions)

0.9804772438666579

In [26]:
selected = predictions.select("label", "prediction", "probability")
display(selected)

DataFrame[label: double, prediction: double, probability: vector]

In [27]:
type(selected)

pyspark.sql.dataframe.DataFrame

In [28]:
selected.toPandas().to_csv('h2predictions.csv')

## Predictions on Test Data

In [29]:
mydata=pd.read_csv("h2predictions.csv")
mydata

Unnamed: 0.1,Unnamed: 0,label,prediction,probability
0,0,0.0,0.0,"[0.9314425575701684,0.06855744242983164]"
1,1,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
2,2,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
3,3,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
4,4,0.0,0.0,"[0.8090487269487285,0.19095127305127146]"
...,...,...,...,...
5989,5989,1.0,1.0,"[0.04929192444983682,0.9507080755501631]"
5990,5990,1.0,1.0,"[0.05568750145988222,0.9443124985401178]"
5991,5991,1.0,1.0,"[0.04689273875807077,0.9531072612419292]"
5992,5992,1.0,1.0,"[0.11509902109403011,0.8849009789059699]"


In [30]:
import numpy as np
from numpy.random import seed
seed(1)
from sklearn.metrics import classification_report,confusion_matrix

In [31]:
label=mydata['label']
prediction=mydata['prediction']

In [32]:
print(confusion_matrix(label,prediction))
print(classification_report(label,prediction))

[[4701  157]
 [ 236  900]]
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96      4858
         1.0       0.85      0.79      0.82      1136

    accuracy                           0.93      5994
   macro avg       0.90      0.88      0.89      5994
weighted avg       0.93      0.93      0.93      5994

