In [307]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

# Build (or reuse) a Spark session running on a single local worker,
# registered under the application name 'hotels'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('hotels')
    .getOrCreate()
)

# Keep a direct handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [308]:
def mapByHotelName(row):
    """Turn a row into a ``(hotel name, 1)`` pair for counting rows per hotel.

    ``row`` must support lookup by the ``'Hotel Name'`` key.
    """
    return (row['Hotel Name'], 1)

def reduceByCount(val1, val2):
    """Combine two partial counts into one by adding them together."""
    combined = val1 + val2
    return combined
    
# Load the hotels CSV (its first line is the header) and work on the RDD of rows.
hotels_rdd = spark.read.csv('../input/Hotels_Data_Changed.csv',header=True).rdd

# Names of the 150 hotels that appear in the most rows, most frequent first.
hotel_names_top150 = hotels_rdd \
            .map(mapByHotelName) \
            .reduceByKey(reduceByCount) \
            .sortBy(lambda x: x[1], ascending = False) \
            .map(lambda x: x[0]) \
            .take(150)

# The filter predicate runs once per row, so look names up in a frozenset
# (constant-time) rather than scanning the 150-element list every time.
# The list itself is kept unchanged because later cells reuse it.
hotel_names_top150_lookup = frozenset(hotel_names_top150)

# Only the rows that belong to one of those 150 hotels.
hotels_top150 = hotels_rdd.filter(lambda x: x['Hotel Name'] in hotel_names_top150_lookup)

In [309]:
def mapByCheckinDate(row):
    """Turn a row into a ``(check-in date, 1)`` pair for counting rows per date.

    ``row`` must support lookup by the ``'Checkin Date'`` key.
    """
    return (row['Checkin Date'], 1)

# The 40 check-in dates with the most rows among the top-150 hotels,
# most frequent first.
checkin_top40_keys = hotels_top150 \
            .map(mapByCheckinDate) \
            .reduceByKey(reduceByCount) \
            .sortBy(lambda x: x[1], ascending = False) \
            .map(lambda x: x[0]) \
            .take(40)

# The filter predicate runs once per row, so look dates up in a frozenset
# (constant-time) rather than scanning the list every time. The list itself
# is kept unchanged because later cells reuse it.
checkin_top40_lookup = frozenset(checkin_top40_keys)

# Rows of the top-150 hotels restricted to those 40 check-in dates.
top_hotels_in_date = hotels_top150.filter(lambda x: x['Checkin Date'] in checkin_top40_lookup)

In [357]:
def mapHotelDatesDiscountCode(row):
    """Key a row by (hotel name, check-in date, discount code).

    Returns ``(key, price)`` where ``price`` is the row's ``'Discount Price'``
    converted to ``float``.
    """
    hotel = row['Hotel Name']
    checkin = row['Checkin Date']
    code = row['Discount Code']
    price = float(row['Discount Price'])
    return ((hotel, checkin, code), price)

def reduceByHotelAndCheckinDate(val1,val2):
    """Merge two prices for the same key, treating -1 as "no price".

    If either value is the -1 placeholder the other one is returned;
    otherwise the smaller of the two prices wins.
    """
    missing = -1
    if val1 == missing:
        return val2
    if val2 == missing:
        return val1
    return min(val1, val2)

def createAllKeys(hotels,dates,codes):
    """Build an RDD holding every (hotel, date, code) combination.

    Each argument is distributed with the module-level ``sc``; the result is
    the cartesian product of the three collections, flattened into 3-tuples.
    """
    hotel_rdd = sc.parallelize(hotels)
    date_rdd = sc.parallelize(dates)
    code_rdd = sc.parallelize(codes)

    # Pair dates with codes first, then attach every hotel to each pair.
    date_code_pairs = date_rdd.cartesian(code_rdd)
    nested = hotel_rdd.cartesian(date_code_pairs)

    # Flatten (hotel, (date, code)) into (hotel, date, code).
    return nested.map(lambda item: (item[0], item[1][0], item[1][1]))

# Placeholder entry (price -1) for every hotel / check-in date / discount code
# combination, so combinations absent from the data still appear in the result.
hotels_date_price_missing_data = createAllKeys(hotel_names_top150,checkin_top40_keys,['1','2','3','4']).map(lambda x: (x, -1))

# Prices found in the data, keyed by (hotel, check-in date, discount code).
hotels_date_price = \
        top_hotels_in_date \
                .map(mapHotelDatesDiscountCode)

# Merge placeholders with real prices: per key the lowest real price is kept,
# and -1 survives only where no real price exists.
hotels_date_price_all_values = sc.union([hotels_date_price_missing_data, hotels_date_price]).reduceByKey(reduceByHotelAndCheckinDate)

# Collect each hotel's prices into one list. groupByKey gives no guarantee
# about the order of values inside a group, so the (date, code) part of the key
# travels with each price and the values are sorted by it before the key is
# dropped. Position i of every hotel's list then refers to the same
# (date, code) combination. The order is lexicographic on the string forms of
# date and code -- not chronological, but identical for all hotels.
hotels_date_price_all_values = hotels_date_price_all_values \
        .map(lambda x: (x[0][0], ((str(x[0][1]), str(x[0][2])), x[1]))) \
        .groupByKey() \
        .mapValues(lambda entries: [price for _, price in sorted(entries, key=lambda entry: entry[0])])

In [367]:
def normalise(prices):
    """Rescale a sequence of prices to the 0-100 range.

    Entries equal to -1 are "missing price" placeholders: they are ignored
    when computing the range and copied to the output unchanged.

    Args:
        prices: iterable of numeric prices, possibly containing -1 markers.

    Returns:
        A new list of the same length. The lowest real price maps to 0.0 and
        the highest to 100.0. When every real price is identical there is no
        range to scale over, so each real price maps to 0.0. When there are
        no real prices at all the placeholders are returned as they are.
    """
    prices = list(prices)  # allow any iterable; it is traversed more than once
    real_prices = [price for price in prices if price != -1]

    # Nothing to scale: empty input or placeholders only.
    if not real_prices:
        return prices

    # Take the range over real prices only, so the -1 marker never affects it.
    min_price = min(real_prices)
    diff = max(real_prices) - min_price

    # All real prices equal: avoid dividing by zero.
    if diff == 0:
        return [price if price == -1 else 0.0 for price in prices]

    return [
        price if price == -1 else float(price - min_price) * 100 / diff
        for price in prices
    ]

# Apply normalise to each hotel's price list, keeping the hotel name as the key.
normalised_hotels_prices = hotels_date_price_all_values.mapValues(normalise)

-1