In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

## 1 Read Prediction

This is the prediction from the model with data ranged from 10/01/2019 - 01/01/2020.

In [3]:
prediction = sqlContext.read.parquet('s3://ege-ds-workshops-corp/yixli/prediction/prediction')

In [4]:
prediction = prediction.\
                     withColumn('duration', F.datediff(F.col("check_out_date"),F.col( "check_in_date")).cast(IntegerType()))

### top seven hotel revenue

In [5]:
top_seven_prediction = prediction.withColumn('rate_rn',row_number().over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid').orderBy(desc('prob')))).\
filter(F.col('rate_rn')<=7)

In [6]:
top_seven_prediction.count()

57578260

In [7]:
top_seven_prediction = top_seven_prediction.withColumn('trun_rate_n',F.count('rate_index').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [8]:
top_seven_prediction = top_seven_prediction.withColumn('trun_hotel_revenue',F.sum(F.col('rate_revenue')*F.col('duration')).over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [9]:
top_seven_prediction = top_seven_prediction.withColumn('avg_trun_hotel_revenue',F.col('trun_hotel_revenue')/F.col('trun_rate_n'))

### normalized hotel revenue

In [10]:
prediction = prediction.withColumn('sum_prob',F.sum('prob').over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

In [11]:
prediction = prediction.withColumn('normalized_prob',F.col('prob')/F.col('sum_prob'))

In [12]:
prediction = prediction.withColumn('normalized_rate_revenue',F.col('normalized_prob')*F.col('src_supply_revenue_usd'))

In [13]:
prediction = prediction.withColumn('normalized_hotel_revenue',F.sum(F.col('normalized_rate_revenue')*F.col('duration')).over(Window.partitionBy("message_id","hotel_id","check_in_date","check_out_date",'tuid')))

### Join two estimation together

In [14]:
revenue_prediction = prediction.select("message_id","hotel_id","check_in_date","check_out_date","tuid",'normalized_hotel_revenue','hotel_index','score_1','bk_hotel_index','message_date').\
join(top_seven_prediction .\
select("message_id","hotel_id","check_in_date","check_out_date","tuid",'avg_trun_hotel_revenue','hotel_index','score_1','bk_hotel_index'),
    ["message_id","hotel_id","check_in_date","check_out_date","tuid",'hotel_index','score_1','bk_hotel_index'],how='left').dropDuplicates()

In [15]:
revenue_prediction.count()

10670838

In [16]:
# num of bookings
revenue_prediction.filter(F.col('bk_hotel_index')==F.col('hotel_index')).count()

537428

About 7.9% scores are NULL. After excluding those samples, about 8.5% of bookings are missed.

In [17]:
revenue_prediction = revenue_prediction.filter(F.col('score_1').isNotNull())

In [18]:
revenue_prediction.count()

9826186

In [19]:
# num of bookings after excluding NULL scores
revenue_prediction.filter(F.col('bk_hotel_index')==F.col('hotel_index')).count()

491393

## 2 Bands

In [20]:
band_df = revenue_prediction.\
    groupby("message_id", "tuid").\
    agg(F.count("hotel_id").alias("n"),F.max("score_1").alias("ub"),F.min("score_1").alias("lb")).\
    filter(F.col("n")>=1).\
    filter(F.col("n")<=30).\
    withColumn("n_band", F.lit(5)).\
    withColumn("sz_band", F.expr("(ub - lb)/n_band")).\
    select("message_id", "tuid", "sz_band", "ub", "lb")

In [21]:
revenue_prediction2 = revenue_prediction.\
    join(band_df, ["message_id", "tuid"]).\
    withColumn("band", F.when(F.col("sz_band") == 0, 1).otherwise(F.expr("int((score_1-lb)/sz_band)")))

In [22]:
revenue_prediction2.groupby("band").agg(F.count("*").alias("n")).show(10)

+----+-------+
|band|      n|
+----+-------+
|   1|1611039|
|   3|1782026|
|   5| 491026|
|   4|1711397|
|   2|1504973|
|   0|2527710|
+----+-------+

In [23]:
revenue_prediction2.count()

9628171

After banding, additional 1.5% of bookings are missed.

In [24]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).count()

483813

In [25]:
revenue_prediction2 = revenue_prediction2.withColumn('month',F.month('message_date'))

## 3 New revenue with top 7

In [27]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|         2.4983206671227984E7|
+-----------------------------+

In [28]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('normalized_hotel_revenue','month').\
groupBy('month').agg(F.sum('normalized_hotel_revenue')).show()

+-----+-----------------------------+
|month|sum(normalized_hotel_revenue)|
+-----+-----------------------------+
|   12|            6668063.733493605|
|    1|           10982.181742027944|
|   10|            9914859.241060503|
|   11|             8389301.51493186|
+-----+-----------------------------+

In [29]:
revenue_prediction2 = revenue_prediction2.\
 withColumn("band_index_top7", row_number().over(Window.partitionBy("message_id", "tuid","band").orderBy(F.desc('avg_trun_hotel_revenue')))).\
withColumn("new_hotel_index1", row_number().over(Window.partitionBy("message_id", "tuid").\
                                                orderBy(F.asc("band"),F.asc("band_index_top7"))))

In [30]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index1')).\
select('avg_trun_hotel_revenue','normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|         2.8399222036889426E7|
+-----------------------------+

In [31]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index1')).\
select('normalized_hotel_revenue','month').\
groupBy('month').agg(F.sum('normalized_hotel_revenue')).show()

+-----+-----------------------------+
|month|sum(normalized_hotel_revenue)|
+-----+-----------------------------+
|   12|            7713810.819885342|
|    1|           10700.185977735482|
|   10|         1.1027879571481427E7|
|   11|            9644683.758919638|
+-----+-----------------------------+

In [32]:
revenue_prediction2.filter(F.col('hotel_index')!=F.col('new_hotel_index1')).count()

9371347

## 4 New revenue with normalized probabilities

In [33]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|         2.4983206671227977E7|
+-----------------------------+

In [34]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('hotel_index')).\
select('normalized_hotel_revenue','month').\
groupBy('month').agg(F.sum('normalized_hotel_revenue')).show()

+-----+-----------------------------+
|month|sum(normalized_hotel_revenue)|
+-----+-----------------------------+
|   12|            6668063.733493605|
|    1|           10982.181742027948|
|   10|            9914859.241060501|
|   11|            8389301.514931852|
+-----+-----------------------------+

In [35]:
revenue_prediction2 = revenue_prediction2.\
 withColumn("band_index_normalized", row_number().over(Window.partitionBy("message_id", "tuid","band").orderBy(F.desc('normalized_hotel_revenue')))).\
withColumn("new_hotel_index2", row_number().over(Window.partitionBy("message_id", "tuid").\
                                                orderBy(F.asc("band"),F.asc("band_index_normalized"))))

In [36]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index2')).\
select('normalized_hotel_revenue').\
agg(F.sum('normalized_hotel_revenue')).show()

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|          2.867964139604674E7|
+-----------------------------+

In [37]:
revenue_prediction2.filter(F.col('bk_hotel_index')==F.col('new_hotel_index2')).\
select('normalized_hotel_revenue','month').\
groupBy('month').agg(F.sum('normalized_hotel_revenue')).show()

+-----+-----------------------------+
|month|sum(normalized_hotel_revenue)|
+-----+-----------------------------+
|   12|            7796310.409051943|
|    1|            10532.18705316058|
|   10|         1.1139744879113207E7|
|   11|            9732178.442727523|
+-----+-----------------------------+

In [38]:
revenue_prediction2.filter(F.col('hotel_index')!=F.col('new_hotel_index2')).count()

9371152

In [39]:
dir = 's3://ege-ds-workshops-corp/yixli/prediction/'
revenue_prediction2.repartition(1).write.mode('overwrite').parquet(dir+'revenue_estimation2')