## Load the csv file and convert features to numeric

In [1]:
# Finds the spark path 
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
     .master("local") \
     .appName("hotels") \
     .getOrCreate()

df = spark.read.csv("../input/Hotels_data_Changed.csv", header=True)
# Transform string values to numeric
indexers = [StringIndexer(inputCol="WeekDay", outputCol="WeekDayIndex"),
            StringIndexer(inputCol="Hotel Name", outputCol="HotelNameIndex"),]
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

In [2]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, IntegerType
import pandas as pd

#Transform date values to numeric
dateFormatter = udf(lambda x:  pd.to_datetime(x).toordinal(), IntegerType())

indexed_df = indexed_df.withColumn('SnapshotDateIndex', dateFormatter(col('Snapshot Date')))
indexed_df = indexed_df.withColumn('CheckinDateIndex', dateFormatter(col('Checkin Date')))

print (indexed_df.head())

Row(_c0='0', Snapshot ID='1', Snapshot Date='2015-07-17', Checkin Date='2015-08-12', Days='5', Original Price='1178', Discount Price='1040', Discount Code='1', Available Rooms='6', Hotel Name='Best Western Plus Seaport Inn Downtown', Hotel Stars='3', DayDiff='26', WeekDay='Wed', DiscountDiff='138', DiscountPerc='11.714770797962649', WeekDayIndex=0.0, HotelNameIndex=153.0, SnapshotDateIndex=735796, CheckinDateIndex=735822)


### Get the highest discount code

In [3]:
def rowToKeyValue(row):
    key = (row['WeekDayIndex'], row["SnapshotDateIndex"], row["CheckinDateIndex"], float(row["DayDiff"]), row["HotelNameIndex"])
    val = ([row["Discount Code"]], row['DiscountPerc'])
    return (key,val)

def reduceToMaxDiscountPerKey(val1, val2):
    codes1, discount1 = val1
    codes2, discount2 = val2
    if (discount1 > discount2):
        return val1
    elif(discount2 > discount1):
        return val2
    else: # In case the discounts are equals, merge the prices to same array
        return (codes1+ codes2, discount1)

def flatMapDiscountCodes(row):
    key, val = row
    codes = val[0]
    # Return list of key & code
    return [(key, code) for code in codes]
    
rdd = indexed_df.rdd.map(rowToKeyValue)\
    .reduceByKey(reduceToMaxDiscountPerKey)\
    .flatMap(flatMapDiscountCodes)


## Create test & training data

In [4]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint

def mapToLabeledPoint(tup):
    key, val = tup
    # Change range of values from 1-4 to 0-3
    return LabeledPoint(int(val) -1, list(key))

# Split into test and train data
test_data, training_data = rdd.map(mapToLabeledPoint).randomSplit(weights=[0.3, 0.7], seed=1)


## Run decision tree

In [5]:
tree_model = DecisionTree.trainClassifier(training_data, numClasses=4, 
                                          categoricalFeaturesInfo={},
                                          impurity='gini', maxDepth=20, maxBins=200)

# Print results
print('features 0: Week Day')
print('features 1: Snapshot Date')
print('features 2: Checkin Date')
print('features 3: Day Diff')
print('features 4: Hotel Name')
# print(tree_model.toDebugString())

features 0: Week Day
features 1: Snapshot Date
features 2: Checkin Date
features 3: Day Diff
features 4: Hotel Name


## Print decision tree statistics

In [6]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictions = tree_model.predict(test_data.map(lambda p: p.features))
predictionAndLabels = predictions.zip(test_data.map(lambda p: p.label))

metrics = MulticlassMetrics(predictionAndLabels)

print('Accuracy {}'.format(metrics.accuracy))
print('False positive rate {}'.format(metrics.weightedFalsePositiveRate))
print(metrics.confusionMatrix())

Accuracy 0.6326970620364515
False positive rate 0.1335184927564076
DenseMatrix([[ 4813.,  1375.,   876.,   670.],
             [ 1342.,  7542.,  1572.,   702.],
             [  910.,  1957.,  6118.,   789.],
             [  744.,   952.,   888.,  3536.]])


## Run naive bayes

In [7]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train a naive Bayes model.
model = NaiveBayes.train(training_data, 1.0)

# Make prediction
NaiveBayes_predictionAndLabel = test_data.map(lambda p: (float(model.predict(p.features)), p.label))

naive_metrics = MulticlassMetrics(NaiveBayes_predictionAndLabel)

print('Accuracy {}'.format(naive_metrics.accuracy))
print('False positive rate {}'.format(naive_metrics.weightedFalsePositiveRate))
print(naive_metrics.confusionMatrix())

Accuracy 0.25142298625883974
False positive rate 0.21618546897854687
DenseMatrix([[  130.,  2424.,   230.,  4950.],
             [  148.,  3547.,   649.,  6814.],
             [  136.,  2872.,   656.,  6110.],
             [   44.,  1426.,   237.,  4413.]])
