In [1]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
43,application_1600892099892_0044,pyspark3,idle,Link,Link,âœ”


SparkSession available as 'spark'.


## 1 Read Prediction

In [3]:
# This is the prediction from the model with data ranged from 10/01/2019 - 01/01/2020.
prediction = sqlContext.read.parquet('s3://ege-ds-workshops-corp/yixli/prediction/prediction')

### 1.1 Calculate normalized rate revenue

In [None]:
def calculateNormalizedRateRevenue(prediction):
    # sum probabilities by hotel
    prediction = prediction.withColumn('sum_prob',F.sum('prob').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

    # normalized rate-picking up probabilities
    prediction = prediction.withColumn('normalized_prob',F.col('prob')/F.col('sum_prob'))

    # normalized rate revenue
    prediction = prediction.withColumn('normalized_rate_revenue',F.col('normalized_prob')*F.col('src_supply_revenue_usd'))
    return prediction

### 1.2 Sum rate level revenue to hotel level revenue

In [8]:
def  calculateNormalizedHotelRevenue(prediction):
    # get duration of stay
    prediction = prediction.\
                         withColumn('duration', F.datediff(F.col("check_out_date"),F.col( "check_in_date")).cast(IntegerType()))
    # get hotel revenue df by sum up rate revenue for each hotel
    revenue_prediction = prediction.withColumn('normalized_hotel_revenue',
                                       F.sum(F.col('normalized_rate_revenue')*F.col('duration')).\
                                       over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid'))).\
                            select("message_id","hotel_id","check_in_date","check_out_date","tuid",'normalized_hotel_revenue','hotel_index','score_1','bk_hotel_index','message_date').\
                            dropDuplicates()
    return revenue_prediction

## 2 Bands

In [None]:
def getBands(n_band,revenue_prediction):
    # exclude those hotels whose scores are NULL
    revenue_prediction = revenue_prediction.filter(F.col('score_1').isNotNull())
    # create band_df with upper and lower bounds of each band
    band_df = revenue_prediction.\
    groupby("message_id", "tuid").\
    agg(F.count("hotel_id").alias("n"),F.max("score_1").alias("ub"),F.min("score_1").alias("lb")).\
    filter(F.col("n")>=1).\
    filter(F.col("n")<=30).\
    withColumn("n_band", F.lit(n_band)).\
    withColumn("sz_band", F.expr("(ub - lb)/n_band")).\
    select("message_id", "tuid", "sz_band", "ub", "lb")
    # join hotel revenue df to get band for each hotel
    revenue_prediction2 = revenue_prediction.\
    join(band_df, ["message_id", "tuid"]).\
    withColumn("band", F.when(F.col("sz_band") == 0, 1).otherwise(F.expr("int((score_1-lb)/sz_band)")))
    return revenue_prediction2

## 3 New revenue with normalized probabilities

In [None]:
def getNewHotelIndex(revenue_prediction2):
    # add month column to df
    revenue_prediction2 = revenue_prediction2.withColumn('month',F.month('message_date'))
    # Sort by 'normalized_hotel_revenue' and obtain the new hotel index
    revenue_prediction2 = revenue_prediction2.\
    withColumn("band_index_normalized", row_number().over(Window.partitionBy("message_id", "tuid","band").\
                                                          orderBy(F.desc('normalized_hotel_revenue')))).\
    withColumn("new_hotel_index2", row_number().over(Window.partitionBy("message_id", "tuid").\
                                                    orderBy(F.asc("band"),F.asc("band_index_normalized"))))
    return revenue_prediction2

In [21]:
# total supply revenue
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index2')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|         2.8679587695917085E7|
+-----------------------------+

In [26]:
# supply revenue by month
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index2')).\
select('normalized_hotel_revenue','month').\
groupBy('month').agg(F.sum('normalized_hotel_revenue'),F.count('month')).show()

+-----+-----------------------------+------------+
|month|sum(normalized_hotel_revenue)|count(month)|
+-----+-----------------------------+------------+
|   12|           7795188.4543378055|      130326|
|    1|            10532.18705316058|          99|
|   10|         1.1140306050966235E7|      183587|
|   11|            9733561.003559899|      167541|
+-----+-----------------------------+------------+

In [39]:
# save hotel revenue df
dir = 's3://ege-ds-workshops-corp/yixli/prediction/'
revenue_prediction2.repartition(1).write.mode('overwrite').parquet(dir+'revenue_estimation3')