## Load the csv file and convert features to numeric

In [2]:
# Finds the spark path 
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
     .master("local") \
     .appName("hotels") \
     .getOrCreate()

df = spark.read.csv("../input/Hotels_data_Changed.csv", header=True)
# Transform string values to numeric
indexers = [StringIndexer(inputCol="WeekDay", outputCol="WeekDayIndex"),
            StringIndexer(inputCol="Hotel Name", outputCol="HotelNameIndex"),]
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, IntegerType
import pandas as pd

#Transform date values to numeric
dateFormatter = udf(lambda x:  pd.to_datetime(x).toordinal(), IntegerType())

indexed_df = indexed_df.withColumn('SnapshotDateIndex', dateFormatter(col('Snapshot Date')))
indexed_df = indexed_df.withColumn('CheckinDateIndex', dateFormatter(col('Checkin Date')))

print (indexed_df.head())

Row(_c0='0', Snapshot ID='1', Snapshot Date='2015-07-17', Checkin Date='2015-08-12', Days='5', Original Price='1178', Discount Price='1040', Discount Code='1', Available Rooms='6', Hotel Name='Best Western Plus Seaport Inn Downtown', Hotel Stars='3', DayDiff='26', WeekDay='Wed', DiscountDiff='138', DiscountPerc='11.714770797962649', WeekDayIndex=0.0, HotelNameIndex=153.0, SnapshotDateIndex=735796, CheckinDateIndex=735822)


### Get the highest discount code

In [4]:
def rowToKeyValue(row):
    key = (row['WeekDayIndex'], row["SnapshotDateIndex"], row["CheckinDateIndex"], float(row["DayDiff"]), row["HotelNameIndex"])
    val = (row['DiscountPerc'], row["Discount Code"])
    return (key,val)

def reduceToMaxPrice(val1, val2):
    price1, code1 = val1
    price2, code2 = val2
    if (price1 > price2):
        return (price1, code1)
    else:
        return (price2, code2)
    
rdd = indexed_df.rdd.map(rowToKeyValue)\
    .reduceByKey(reduceToMaxPrice)\
    .mapValues(lambda val: val[1])

## Create test & training data

In [5]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint

def mapToLabeledPoint(tup):
    key, val = tup
    # Change range of values from 1-4 to 0-3
    return LabeledPoint(int(val) -1, list(key))

# Split into test and train data
test_data, training_data = rdd.map(mapToLabeledPoint).randomSplit(weights=[0.3, 0.7], seed=1)


## Run decision tree

In [31]:
tree_model = DecisionTree.trainClassifier(training_data, numClasses=4, 
                                          categoricalFeaturesInfo={},
                                          impurity='gini', maxDepth=20, maxBins=200)

# Print results
print('features 0: Week Day')
print('features 1: Snapshot Date')
print('features 2: Checkin Date')
print('features 3: Day Diff')
print('features 4: Hotel Name')
# print(tree_model.toDebugString())

features 0: Week Day
features 1: Snapshot Date
features 2: Checkin Date
features 3: Day Diff
features 4: Hotel Name


## Print decision tree statistics

In [32]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictions = tree_model.predict(test_data.map(lambda p: p.features))
predictionAndLabels = predictions.zip(test_data.map(lambda p: p.label))

metrics = MulticlassMetrics(predictionAndLabels)

print('Accuracy {}'.format(metrics.accuracy))
print('False positive rate {}'.format(metrics.weightedFalsePositiveRate))
print(metrics.confusionMatrix())

Accuracy 0.677765376984127
False positive rate 0.11711902113715371
DenseMatrix([[ 4487.,  1076.,   824.,   480.],
             [  938.,  7263.,  1361.,   522.],
             [  748.,  1445.,  6352.,   697.],
             [  513.,   881.,   909.,  3760.]])


## Run naive bayes

In [37]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train a naive Bayes model.
training_data.toDF().toPandas().to_csv('training.csv')
model = NaiveBayes.train(training_data, 1.0)

# Make prediction
NaiveBayes_predictionAndLabel = test_data.map(lambda p: (float(model.predict(p.features)), p.label))

naive_metrics = MulticlassMetrics(NaiveBayes_predictionAndLabel)

print('Accuracy {}'.format(naive_metrics.accuracy))
print('False positive rate {}'.format(naive_metrics.weightedFalsePositiveRate))
print(naive_metrics.confusionMatrix())

Accuracy 0.25806051587301587
False positive rate 0.22261778824380216
DenseMatrix([[   82.,  2216.,   230.,  4339.],
             [  116.,  3312.,   566.,  6090.],
             [   88.,  2847.,   597.,  5710.],
             [   42.,  1470.,   218.,  4333.]])
