In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType,FloatType,DoubleType

## Read Prediction

In [61]:
# Prediction exports, keyed by the month label used when they were produced.
# Change PREDICTION_MONTH to analyse a different export.
PREDICTION_FILES = {
    "Oct": "s3://ege-ds-workshops-corp/yixli/data_understanding/09-22-2020_prediction.csv",
    "Nov": "s3://ege-ds-workshops-corp/yixli/data_understanding/09-23-2020_prediction.csv",
    "Dec": "s3://ege-ds-workshops-corp/yixli/data_understanding/09-25-2020_prediction.csv",
}
PREDICTION_MONTH = "Oct"
file_loc = PREDICTION_FILES[PREDICTION_MONTH]

# Obtain the session explicitly instead of relying on a `sqlContext` variable
# that only exists when the kernel pre-defines it. getOrCreate() returns the
# already-running session when there is one.
spark = SparkSession.builder.getOrCreate()

prediction_raw = (
    spark.read.format('csv')
    .options(header='True', inferSchema='True', delimiter=',')
    .load(file_loc)
)

# Keep only rows with a positive hotel_id and a non-null src_rate_amount_usd.
prediction = (
    prediction_raw
    .filter(F.col("hotel_id") > 0)
    .filter(F.col('src_rate_amount_usd').isNotNull())
)
print(prediction.count())

4281060

In [62]:
# `duration` = days between check_in_date and check_out_date, stored as an integer.
_stay_days = F.datediff(F.col("check_out_date"), F.col("check_in_date"))
prediction = prediction.withColumn('duration', _stay_days.cast(IntegerType()))

### Top-seven hotel revenue (average over each hotel's seven highest-`prob` rates)

In [63]:
# Number the rates of each (message, hotel, stay dates, tuid) group by
# descending `prob` and keep the first seven of each group.
# NOTE(review): rows with equal `prob` are numbered in an unspecified order.
_hotel_keys = ["message_id", "hotel_id", "check_in_date", "check_out_date", 'tuid']
_by_prob_desc = Window.partitionBy(*_hotel_keys).orderBy(desc('prob'))

top_seven_prediction = (
    prediction
    .withColumn('rate_rn', row_number().over(_by_prob_desc))
    .filter(F.col('rate_rn') <= 7)
)

In [64]:
# Row count after truncating every group to at most seven rates.
top_seven_prediction.count()

2566729

In [65]:
# `trun_rate_n`: number of rows with a non-null rate_index in each
# (message, hotel, stay dates, tuid) group of the truncated frame.
_per_hotel = Window.partitionBy("message_id", "hotel_id", "check_in_date", "check_out_date", 'tuid')
top_seven_prediction = top_seven_prediction.withColumn(
    'trun_rate_n',
    F.count('rate_index').over(_per_hotel),
)

In [66]:
# `trun_hotel_revenue`: sum of rate_revenue * duration over the retained rates
# of each (message, hotel, stay dates, tuid) group.
# NOTE(review): `rate_revenue` is not created in this notebook; presumably it
# comes from the input CSV — confirm.
_per_hotel = Window.partitionBy("message_id", "hotel_id", "check_in_date", "check_out_date", 'tuid')
_rate_stay_revenue = F.col('rate_revenue') * F.col('duration')
top_seven_prediction = top_seven_prediction.withColumn(
    'trun_hotel_revenue',
    F.sum(_rate_stay_revenue).over(_per_hotel),
)

In [67]:
# Average revenue per retained rate: group total divided by group rate count.
_group_total = F.col('trun_hotel_revenue')
_group_size = F.col('trun_rate_n')
top_seven_prediction = top_seven_prediction.withColumn(
    'avg_trun_hotel_revenue',
    _group_total / _group_size,
)

### Normalized hotel revenue (rate probabilities rescaled to sum to 1 per hotel)

In [68]:
# `sum_prob`: total `prob` over all rates of the same
# (message, hotel, stay dates, tuid) group.
_per_hotel = Window.partitionBy("message_id", "hotel_id", "check_in_date", "check_out_date", 'tuid')
prediction = prediction.withColumn(
    'sum_prob',
    F.sum('prob').over(_per_hotel),
)

In [69]:
# Rescale each rate's `prob` by its group's total.
_share_of_group = F.col('prob') / F.col('sum_prob')
prediction = prediction.withColumn('normalized_prob', _share_of_group)

In [70]:
# Weight each rate's src_supply_revenue_usd by its normalized probability.
_weighted_revenue = F.col('normalized_prob') * F.col('src_supply_revenue_usd')
prediction = prediction.withColumn('normalized_rate_revenue', _weighted_revenue)

In [71]:
# `normalized_hotel_revenue`: sum of normalized_rate_revenue * duration over
# all rates of the same (message, hotel, stay dates, tuid) group.
_per_hotel = Window.partitionBy("message_id", "hotel_id", "check_in_date", "check_out_date", 'tuid')
_weighted_stay_revenue = F.col('normalized_rate_revenue') * F.col('duration')
prediction = prediction.withColumn(
    'normalized_hotel_revenue',
    F.sum(_weighted_stay_revenue).over(_per_hotel),
)

In [72]:
# Number of distinct hotel-level rows once the per-rate columns are dropped.
_hotel_level_cols = [
    "message_id", "hotel_id", "check_in_date", "check_out_date", "tuid",
    'normalized_hotel_revenue', 'hotel_index', 'score_1', 'bk_hotel_index',
]
prediction.select(_hotel_level_cols).distinct().count()

449231

In [73]:
# Build one row per hotel-level key carrying both revenue estimates.
#
# Both inputs still hold one row per rate. Joining them as-is pairs every rate
# row of a hotel with each of its (up to seven) top-seven rows and only then
# collapses the duplicates. De-duplicating each side first yields the same set
# of rows with far less data going through the join.
_join_keys = ["message_id", "hotel_id", "check_in_date", "check_out_date", "tuid",
              'hotel_index', 'score_1', 'bk_hotel_index']

_normalized_per_hotel = (
    prediction
    .select(_join_keys + ['normalized_hotel_revenue'])
    .dropDuplicates()
)
_top_seven_per_hotel = (
    top_seven_prediction
    .select(_join_keys + ['avg_trun_hotel_revenue'])
    .dropDuplicates()
)

# Left join: a row with no match on the top-seven side keeps a null
# avg_trun_hotel_revenue. Both inputs are already distinct, so the joined
# result needs no further de-duplication.
revenue_prediction = _normalized_per_hotel.join(
    _top_seven_per_hotel,
    _join_keys,
    how='left',
)

In [74]:
# Sanity check: should match the distinct hotel-level row count computed from `prediction`.
revenue_prediction.count()

449231

In [75]:
# Rows whose hotel_index equals bk_hotel_index.
# NOTE(review): presumably bk_hotel_index is the position of the booked hotel — confirm.
_is_booked_position = F.col('bk_hotel_index') == F.col('hotel_index')
revenue_prediction.where(_is_booked_position).count()

122393

In [76]:
# Drop rows without a score_1; the banding below is computed from score_1.
_has_score = F.col('score_1').isNotNull()
revenue_prediction = revenue_prediction.where(_has_score)

### Bands

In [77]:
# Per (message_id, tuid): score range [lb, ub] and the width of one band when
# that range is split into five equal parts. Groups with more than 30 hotels
# are dropped here, and therefore also from the inner join that uses band_df.
_score_range = (
    revenue_prediction
    .groupby("message_id", "tuid")
    .agg(
        F.count("hotel_id").alias("n"),
        F.max("score_1").alias("ub"),
        F.min("score_1").alias("lb"),
    )
)

band_df = (
    _score_range
    .filter(F.col("n") >= 1)
    .filter(F.col("n") <= 30)
    .withColumn("n_band", F.lit(5))
    .withColumn("sz_band", F.expr("(ub - lb)/n_band"))
    .select("message_id", "tuid", "sz_band", "ub", "lb")
)

In [78]:
# Attach each search's band geometry and assign every hotel to a score band.
# Inner join: hotels of searches absent from band_df are dropped.
#
# NOTE(review): band = int((score_1 - lb) / sz_band), so a row with
# score_1 == ub evaluates to 5 and six distinct band values (0..5) can occur
# even though the range was split into five parts. When all scores in a
# search are equal (sz_band == 0) the band is fixed to 1. Confirm both are
# intended.
_band_from_score = F.expr("int((score_1-lb)/sz_band)")
_band = F.when(F.col("sz_band") == 0, 1).otherwise(_band_from_score)

revenue_prediction2 = (
    revenue_prediction
    .join(band_df, ["message_id", "tuid"])
    .withColumn("band", _band)
)

In [79]:
# Number of rows falling into each band value.
_band_sizes = revenue_prediction2.groupby("band").agg(F.count("*").alias("n"))
_band_sizes.show(10)

+----+------+
|band|     n|
+----+------+
|   1| 55426|
|   3| 35900|
|   5| 85728|
|   4| 35224|
|   2| 36073|
|   0|150952|
+----+------+

In [80]:
# Rows remaining after the score_1 filter and the inner join with band_df.
revenue_prediction2.count()

399303

In [81]:
# Rows of the banded frame whose hotel_index equals bk_hotel_index.
_is_booked_position = F.col('bk_hotel_index') == F.col('hotel_index')
revenue_prediction2.where(_is_booked_position).count()

95207

## New revenue with top 7

In [82]:
# Baseline: total of both revenue estimates over rows where the original
# hotel_index equals bk_hotel_index.
_revenue_cols = ['avg_trun_hotel_revenue', 'normalized_hotel_revenue']
_at_original_position = F.col('bk_hotel_index') == F.col('hotel_index')
(
    revenue_prediction2
    .where(_at_original_position)
    .select(_revenue_cols)
    .agg(*[F.sum(name) for name in _revenue_cols])
    .show()
)

+---------------------------+-----------------------------+
|sum(avg_trun_hotel_revenue)|sum(normalized_hotel_revenue)|
+---------------------------+-----------------------------+
|            4711294.6662152|            5464323.978237547|
+---------------------------+-----------------------------+

In [83]:
# Re-rank the hotels of each (message_id, tuid):
#   1. band_index_top7  - position inside the band, by descending avg_trun_hotel_revenue
#   2. new_hotel_index1 - position in the whole search, by ascending band and
#                         then by that in-band position
# NOTE(review): rows tied on avg_trun_hotel_revenue within a band are numbered
# in an unspecified order.
_within_band = (
    Window
    .partitionBy("message_id", "tuid", "band")
    .orderBy(F.desc('avg_trun_hotel_revenue'))
)
_within_search = (
    Window
    .partitionBy("message_id", "tuid")
    .orderBy(F.asc("band"), F.asc("band_index_top7"))
)

revenue_prediction2 = (
    revenue_prediction2
    .withColumn("band_index_top7", row_number().over(_within_band))
    .withColumn("new_hotel_index1", row_number().over(_within_search))
)

In [84]:
# Same totals as the baseline, but taken over rows whose re-ranked position
# (new_hotel_index1) equals bk_hotel_index.
_revenue_cols = ['avg_trun_hotel_revenue', 'normalized_hotel_revenue']
_at_new_position = F.col('bk_hotel_index') == F.col('new_hotel_index1')
(
    revenue_prediction2
    .where(_at_new_position)
    .select(_revenue_cols)
    .agg(*[F.sum(name) for name in _revenue_cols])
    .show()
)

+---------------------------+-----------------------------+
|sum(avg_trun_hotel_revenue)|sum(normalized_hotel_revenue)|
+---------------------------+-----------------------------+
|          5064760.280179547|             6025628.22742913|
+---------------------------+-----------------------------+

In [85]:
# Rows whose position differs between hotel_index and new_hotel_index1.
_position_changed = F.col('hotel_index') != F.col('new_hotel_index1')
revenue_prediction2.where(_position_changed).count()

340303

## Normalized revenue

In [86]:
# Baseline: total normalized_hotel_revenue over rows where the original
# hotel_index equals bk_hotel_index.
_at_original_position = F.col('bk_hotel_index') == F.col('hotel_index')
(
    revenue_prediction2
    .where(_at_original_position)
    .select('normalized_hotel_revenue')
    .agg(F.sum('normalized_hotel_revenue'))
    .show()
)

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|            5464323.978237551|
+-----------------------------+

In [87]:
# Re-rank the hotels of each (message_id, tuid) using the normalized estimate:
#   1. band_index_normalized - position inside the band, by descending
#                              normalized_hotel_revenue
#   2. new_hotel_index2      - position in the whole search, by ascending band
#                              and then by that in-band position
# NOTE(review): rows tied on normalized_hotel_revenue within a band are
# numbered in an unspecified order.
_within_band = (
    Window
    .partitionBy("message_id", "tuid", "band")
    .orderBy(F.desc('normalized_hotel_revenue'))
)
_within_search = (
    Window
    .partitionBy("message_id", "tuid")
    .orderBy(F.asc("band"), F.asc("band_index_normalized"))
)

revenue_prediction2 = (
    revenue_prediction2
    .withColumn("band_index_normalized", row_number().over(_within_band))
    .withColumn("new_hotel_index2", row_number().over(_within_search))
)

In [88]:
# Total normalized_hotel_revenue over rows whose re-ranked position
# (new_hotel_index2) equals bk_hotel_index.
_at_new_position = F.col('bk_hotel_index') == F.col('new_hotel_index2')
(
    revenue_prediction2
    .where(_at_new_position)
    .select('normalized_hotel_revenue')
    .agg(F.sum('normalized_hotel_revenue'))
    .show()
)

+-----------------------------+
|sum(normalized_hotel_revenue)|
+-----------------------------+
|            6039877.320793726|
+-----------------------------+

In [89]:
# Rows whose position differs between hotel_index and new_hotel_index2.
_position_changed = F.col('hotel_index') != F.col('new_hotel_index2')
revenue_prediction2.where(_position_changed).count()

340350

In [90]:
# Persist the re-ranked frame as CSV under a prefix stamped with today's date
# (MM-DD-YYYY); an existing output at the same path is overwritten.
# NOTE(review): `dir` shadows Python's built-in dir(); the name is kept in
# case other cells read it.
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
output_path = dir + datestamp + '_revenue_prediction.csv'

# Use the built-in 'csv' source, as the read cell does, rather than the legacy
# 'com.databricks.spark.csv' alias for it.
(
    revenue_prediction2
    .repartition(1)  # collapse to a single partition before writing
    .write
    .format('csv')
    .mode('overwrite')
    .save(output_path, header='true')
)