## Load the csv file and convert features to numeric

In [1]:
# Finds the spark path 
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
     .master("local") \
     .appName("hotels") \
     .getOrCreate()

df = spark.read.csv("../input/Hotels_data_Changed.csv", header=True)
# Transform string values to numeric
indexers = [StringIndexer(inputCol="WeekDay", outputCol="WeekDayIndex"),
            StringIndexer(inputCol="Hotel Name", outputCol="HotelNameIndex"),]
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

In [2]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, IntegerType
import pandas as pd

#Transform date values to numeric
dateFormatter = udf(lambda x:  pd.to_datetime(x).toordinal(), IntegerType())

indexed_df = indexed_df.withColumn('SnapshotDateIndex', dateFormatter(col('Snapshot Date')))
indexed_df = indexed_df.withColumn('CheckinDateIndex', dateFormatter(col('Checkin Date')))

### Get the highest discount code

In [23]:
def rowToKeyValue(row):
    key = (row['WeekDayIndex'], row["SnapshotDateIndex"], row["CheckinDateIndex"], float(row["DayDiff"]), row["HotelNameIndex"])
    val = ([row["Discount Code"]], row['DiscountPerc'])
    return (key,val)

def reduceToMaxDiscountPerKey(val1, val2):
    codes1, discount1 = val1
    codes2, discount2 = val2
    if (discount1 > discount2):
        return val1
    elif(discount2 > discount1):
        return val2
    else: # In case the discounts are equals, merge the prices to same array
        return (codes1+ codes2, discount1)

def flatMapDiscountCodes(row):
    key, val = row
    codes = val[0]
    # Return list of key & code
    return [(key, code) for code in codes]
    
rdd = indexed_df.rdd.map(rowToKeyValue)\
    .reduceByKey(reduceToMaxDiscountPerKey)\
    .flatMap(flatMapDiscountCodes)


PythonRDD[143] at RDD at PythonRDD.scala:48


## Create test & training data

In [5]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint

def mapToLabeledPoint(tup):
    key, val = tup
    # Change range of values from 1-4 to 0-3
    return LabeledPoint(int(val) -1, list(key))

# Split into test and train data
test_data, training_data = rdd.map(mapToLabeledPoint).randomSplit(weights=[0.3, 0.7], seed=1)

## Run decision tree

In [6]:
tree_model = DecisionTree.trainClassifier(training_data, numClasses=4, 
                                          categoricalFeaturesInfo={},
                                          impurity='entropy', maxDepth=5, maxBins=30)

# Print results
print('features 0: Week Day')
print('features 1: Snapshot Date')
print('features 2: Checkin Date')
print('features 3: Day Diff')
print('features 4: Hotel Name')
print(tree_model.toDebugString())

features 0: Week Day
features 1: Snapshot Date
features 2: Checkin Date
features 3: Day Diff
features 4: Hotel Name
DecisionTreeModel classifier of depth 5 with 63 nodes
  If (feature 0 <= 5.0)
   If (feature 0 <= 4.0)
    If (feature 0 <= 1.0)
     If (feature 4 <= 84.0)
      If (feature 0 <= 0.0)
       Predict: 0.0
      Else (feature 0 > 0.0)
       Predict: 1.0
     Else (feature 4 > 84.0)
      If (feature 2 <= 735961.0)
       Predict: 1.0
      Else (feature 2 > 735961.0)
       Predict: 1.0
    Else (feature 0 > 1.0)
     If (feature 0 <= 3.0)
      If (feature 4 <= 13.0)
       Predict: 1.0
      Else (feature 4 > 13.0)
       Predict: 1.0
     Else (feature 0 > 3.0)
      If (feature 4 <= 28.0)
       Predict: 1.0
      Else (feature 4 > 28.0)
       Predict: 1.0
   Else (feature 0 > 4.0)
    If (feature 4 <= 7.0)
     If (feature 4 <= 1.0)
      If (feature 2 <= 735947.0)
       Predict: 3.0
      Else (feature 2 > 735947.0)
       Predict: 2.0
     Else (feature 4 > 1.0)


## Print decision tree statistics

In [8]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictions = tree_model.predict(test_data.map(lambda p: p.features))
predictionAndLabels = predictions.zip(test_data.map(lambda p: p.label))

metrics = MulticlassMetrics(predictionAndLabels)

print('Accuracy {}'.format(metrics.accuracy))
print('False positive rate {}'.format(metrics.weightedFalsePositiveRate))
print(metrics.confusionMatrix())

Accuracy 0.35315931696659575
False positive rate 0.266797804895072
DenseMatrix([[2552., 4528.,  513.,  127.],
             [2058., 8213.,  671.,   87.],
             [1727., 6729., 1286.,  151.],
             [1493., 3669.,  748.,  234.]])


## Run naive bayes

In [34]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

# Train a naive Bayes model.
model = NaiveBayes.train(training_data, 1.0)

# Make prediction
NaiveBayes_predictionAndLabel = test_data.map(lambda p: (float(model.predict(p.features)), p.label))

naive_metrics = MulticlassMetrics(NaiveBayes_predictionAndLabel)

print('Accuracy {}'.format(naive_metrics.accuracy))
print('False positive rate {}'.format(naive_metrics.weightedFalsePositiveRate))
print(naive_metrics.confusionMatrix())

Accuracy 0.259734623015873
False positive rate 0.2240254536536272
DenseMatrix([[  84., 2222.,  208., 4262.],
             [ 103., 3363.,  514., 6248.],
             [  85., 2804.,  551., 5717.],
             [  35., 1479.,  201., 4380.]])
