## Load the csv file and convert features to numeric

In [48]:
# Finds the spark path 
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
     .master("local") \
     .appName("hotels") \
     .getOrCreate()

df = spark.read.csv("../input/Hotels_data_Changed.csv", header=True)
# Transform string values to numeric
indexers = [StringIndexer(inputCol="WeekDay", outputCol="WeekDayIndex"),
            StringIndexer(inputCol="Hotel Name", outputCol="HotelNameIndex"),]
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

In [49]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, IntegerType
import pandas as pd

#Transform date values to numeric
dateFormatter = udf(lambda x:  pd.to_datetime(x).toordinal(), IntegerType())

indexed_df = indexed_df.withColumn('SnapshotDateIndex', dateFormatter(col('Snapshot Date')))
indexed_df = indexed_df.withColumn('CheckinDateIndex', dateFormatter(col('Checkin Date')))

### Get the highest discount code

In [50]:
def rowToKeyValue(row):
    key = (row['WeekDayIndex'], row["SnapshotDateIndex"], row["CheckinDateIndex"], row["DayDiff"], row["HotelNameIndex"])
    val = (row['DiscountPerc'], row["Discount Code"])
    return (key,val)

def reduceToMaxPrice(val1, val2):
    price1, code1 = val1
    price2, code2 = val2
    if (price1 > price2):
        return (price1, code1)
    else:
        return (price2, code2)
    
rdd = indexed_df.rdd.map(rowToKeyValue)\
    .reduceByKey(reduceToMaxPrice)\
    .mapValues(lambda val: val[1])

## Create test & training data

In [51]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint

def mapToLabeledPoint(tup):
    key, val = tup
    return LabeledPoint(val, list(key))

# Split into test and train data
test_data, training_data = rdd.map(mapToLabeledPoint).randomSplit(weights=[0.3, 0.7], seed=1)

## Run decision tree

In [53]:
tree_model = DecisionTree.trainClassifier(training_data, numClasses=5, 
                                          categoricalFeaturesInfo={},
                                          impurity='entropy', maxDepth=5, maxBins=30)

predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
test_accuracy = labels_and_preds.filter(lambda zipped: zipped[0] == zipped[1]).count() / float(test_data.count())
print(test_accuracy)

0.35379464285714285


## Run naive bayes

In [55]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train a naive Bayes model.
model = NaiveBayes.train(training_data, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test_data.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))

model accuracy 0.259734623015873
