In [2]:
import datetime
import os
from datetime import date

import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import desc
from pyspark.sql.functions import row_number
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

## 1 Booking revenue for hotels below the Peterman line (PL)

## 1.1 read search data

In [3]:
# Load the rate-level dataset (the one used for the hotel revenue estimation) from S3.
rate_all_usd_df = (
    sqlContext.read
    .parquet('s3://ege-ds-workshops-corp/yixli/data_preparation/rate_all_usd_2019')
)

In [4]:
# Hotel-level view of the rate data: keep only rows that carry a score_1, project the
# search/hotel identifier columns, and collapse the repeated rows.
last_hotels_below_pl_df = (
    rate_all_usd_df
    .where(F.col('score_1').isNotNull())
    .select(
        "message_id", "hotel_id", "check_in_date", "check_out_date", "tuid",
        "hotel_index", "bk_hotel_index", 'hotel_result_index',
    )
    .dropDuplicates()
)

In [5]:
# Number of rows in the deduplicated below-PL hotel set.
last_hotels_below_pl_df.count()

9826186

## 1.2 bookings below PL

In [6]:
# Collect bookings from Redshift.
#
# The query reads public.HotelFact joined to the daily exchange rates (converted to USD),
# the traveler and group account dimensions, and public.hotel_supply (matched on issue
# year/quarter, rate type and travel product). It keeps CustomerSystemID 1/2, group account
# internal types 1/3 and BookingTypeID 1/3, for issue dates from 2019-10-01 to 2020-01-01.
# total_supply_revenue = total_commission + total_markup + total_gds (total_gds is the
# constant 4.3 per booking).
# NOTE(review): both date bounds are inclusive, so 2020-01-01 is admitted as well —
# confirm whether a strict `<` upper bound was intended.
sqlBookings = """SELECT 
      a.hotelfactid,
      a.hotelid as hotel_id,
     trunc(a.issuedate) as issue_date,
     trunc(a.traveldatestart) as check_in_date,
     trunc(a.traveldateend) as check_out_date,
     hsp.quarter as quarter,
      (case when a.customersystemid=1 then a.TUIDTraveler else g.TUIDInternal end) as tuid,      
      a.travelproductid AS travel_product_id,
      a.GroupAccountID AS group_account_id,
      hsp.rate_type AS hotel_rate_type,
      (case  
        when a.hotelrateTypeSupplyid IN (1,2,3,4,5,10,11,14,16,17,18,20,25,26,28,29,30,32) then 'GDS'
        when a.hotelrateTypeSupplyid IN (12,13,15,19,21,22,23,27,31,33,34) then 'NEG'
        when a.hotelrateTypeSupplyid = 6 then 'ESRA'
        when a.hotelrateTypeSupplyid = 7 then 'ESRM'
        when a.hotelrateTypeSupplyid = 8 then 'EPRA'
        when a.hotelrateTypeSupplyid = 9 then 'EPRM'
        when a.hotelrateTypeSupplyid IN (24,35) then 'HOTMIP'
        else 'UNK'
        end) as bk_rate_type,
      CASE WHEN a.OnlineBool = 1 THEN 'online' ELSE 'offline' END AS booking_type,
      a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS gross_booking_value_usd,
      a.bookingamtcommissionest::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)
        + a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)*NVL(hsp.commission,0.00) AS total_commission,
      a.bookingamtmargin::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS total_markup,
      4.3 AS total_gds,
      NVL(total_commission,0.00) + NVL(total_markup,0.00) + NVL(total_gds,0.00) AS total_supply_revenue
FROM public.HotelFact a
  JOIN public.ExchangeRateDailySubset ex
    ON a.IssueDateTimeID = ex.TimeID AND a.CurrencyCode = ex.FromCurrencyCode AND ex.ToCurrencyCode = 'USD'
  JOIN TravelerAccountDim g 
    ON a.TUIDTraveler = g.TUID AND a.customersystemid=g.customersystemid 
  JOIN public.GroupAccountdim d 
    ON a.GroupAccountID = d.GroupAccountID 
  JOIN public.hotel_supply hsp 
    ON hsp.year = EXTRACT (year FROM a.IssueDate)
    AND hsp.quarter = EXTRACT (quarter FROM a.IssueDate)
    AND hsp.rate_type = CASE
         WHEN a.HotelRateTypeSupplyID IN (7,9,24) THEN 'Expedia Collect'
         WHEN a.HotelRateTypeSupplyID IN (6,8,35) THEN 'Hotel Collect'
         WHEN a.HotelRateTypeSupplyID IN (12,13,15,19,21,22,23,27,31,33,34) THEN 'Negotiated'
         ELSE 'Published & GDS' END
    AND hsp.travel_product_id = a.TravelProductID
WHERE a.CustomerSystemID IN (1,2)
AND   (d.groupaccountinternaltypeid = 1 OR d.groupaccountinternaltypeid = 3)
AND a.BookingTypeID in (1,3) 
and a.issuedate>=to_date('20191001','YYYYMMDD') 
and a.issuedate<=to_date('20200101','YYYYMMDD')""".replace('\n',' ')

# Credentials are read from the environment instead of being written into the notebook:
#   REDSHIFT_USER      optional, defaults to the reporting user used so far
#   REDSHIFT_PASSWORD  required; a missing value raises KeyError before any connection attempt
# The password that used to be embedded here has been exposed in the notebook's history and
# should be rotated.
connection_string = (
    "jdbc:redshift://egencia-reporting.czjkedodj6lc.us-west-2.redshift.amazonaws.com:5439/egedatamart"
    "?user={}&password={}"
).format(
    os.environ.get("REDSHIFT_USER", "ds_rpt_user"),
    os.environ["REDSHIFT_PASSWORD"],
)

# Run the query through the spark-redshift connector; tempdir is the S3 location handed to
# the connector as its temporary directory.
bookings_df = (
    sqlContext.read
    .format("com.databricks.spark.redshift")
    .option("url", connection_string)
    .option("query", sqlBookings)
    .option("tempdir", "s3a://ege-ds-workshops-corp/yixli/")
    .load()
)

# Cache before counting so the later joins reuse the loaded bookings.
bookings_df.cache()
bookings_df.count()

1999891

In [7]:
# Treat bookings whose total_supply_revenue is 1000 or more as outliers and drop them.
bookings_df = bookings_df.where(bookings_df['total_supply_revenue'] < 1000)

In [9]:
# Match bookings to the below-PL hotels on hotel, stay dates and traveler.
# Bookings whose bk_rate_type is 'NEG' are excluded before the join.
bookings_below_pl = (
    bookings_df
    .where(F.col('bk_rate_type') != 'NEG')
    .join(
        last_hotels_below_pl_df,
        on=["hotel_id", "check_in_date", "check_out_date", "tuid"],
    )
)
bookings_below_pl.count()

In [12]:
# Overall total_supply_revenue across all matched below-PL bookings.
(
    bookings_below_pl
    .select('total_supply_revenue')
    .groupBy()
    .agg(F.sum("total_supply_revenue"))
    .show()
)

+-------------------------+
|sum(total_supply_revenue)|
+-------------------------+
|     2.6355646240413312E7|
+-------------------------+

In [13]:
# Supply revenue and booking count per issue month (below PL).
bookings_below_pl = bookings_below_pl.withColumn('month', F.month('issue_date'))
(
    bookings_below_pl
    .select('month', 'total_supply_revenue')
    .groupBy('month')
    .agg(
        F.sum("total_supply_revenue"),
        F.count('month'),
    )
    .show()
)

+-----+-------------------------+------------+
|month|sum(total_supply_revenue)|count(month)|
+-----+-------------------------+------------+
|   12|        6923447.194805484|      138121|
|   10|     1.0524547245132282E7|      197153|
|   11|        8907651.800475556|      179447|
+-----+-------------------------+------------+

## 2 Booking revenue for hotels above PL 

## 2.1 Get hotels above PL 

In [None]:
# Reference only: the two Hive statements below are kept as bare string literals, so running
# this cell does not execute them.
#
# Step 1 - Collect hotels above PL from search data
# Creates eandev.yixli_hotels_ab_pl: the distinct (message_id, hotel_id, tuid, check_in_date,
# check_out_date, hotel_result_index, message_date) rows that appear in BOTH
# yixli_src_amentity_2019 and yixli_search_commission_2019_v3, where each side is restricted
# to hotel_result_index <= last_idx_peterman_line for the same message_id.
# count = 49569610
'''
use eandev; 
drop table if exists eandev.yixli_hotels_ab_pl;
create table eandev.yixli_hotels_ab_pl as  
select distinct tb1.message_id
  ,tb1.hotel_id
  ,tb1.tuid
  ,tb1.check_in_date
  ,tb1.check_out_date
  ,tb1.hotel_result_index
  ,tb1.message_date
  from
 ( select a.message_id
  ,a.hotel_id
  ,a.tuid
  ,a.check_in_date
  ,a.check_out_date
  ,a.hotel_result_index
  ,a.message_date
  from yixli_src_amentity_2019 as a
join yixli_last_idx_peterman_line_2019 as c on (a.message_id=c.message_id)
where (a.hotel_result_index<=c.last_idx_peterman_line)
order by a.message_id, a.hotel_result_index) as tb1
inner join (select b.message_id
  ,b.hotel_id
  ,b.tuid
  ,b.check_in_date
  ,b.check_out_date
  ,b.hotel_result_index
  ,b.message_date
  from yixli_search_commission_2019_v3 as b
join yixli_last_idx_peterman_line_2019 as c on (b.message_id=c.message_id)
where (b.hotel_result_index<=c.last_idx_peterman_line)
order by b.message_id, b.hotel_result_index) as tb2 on
(tb1.message_id=tb2.message_id 
    and  tb1.tuid=tb2.tuid
    and tb1.hotel_id=tb2.hotel_id
    and tb1.check_in_date=tb2.check_in_date
    and tb1.check_out_date=tb2.check_out_date
    and tb1.hotel_result_index=tb2.hotel_result_index
    and tb1.message_date=tb2.message_date)
'''

# Step 2 - filter last search data
# Creates eandev.yixli_last_src_hotels_ab_pl: for every (hotel_id, check_in_date,
# check_out_date, tuid) only the row with the most recent message_date is kept
# (row_number ordered by message_date desc, rn = 1). Because of `select b.*`, the helper
# column rn is part of the resulting table.
# count = 32604178
'''
use eandev; 
drop table if exists eandev.yixli_last_src_hotels_ab_pl;
create table eandev.yixli_last_src_hotels_ab_pl as  
select b.*
from
(select a.*
,row_number() over(partition by hotel_id,check_in_date,check_out_date,tuid order by message_date desc) as rn
from yixli_hotels_ab_pl as a) as b
where rn=1
'''

In [None]:
# transfer from hdfs to s3
# Reference only: the shell steps below are kept as a bare string literal, so running this
# cell does not execute them.
# Steps recorded: (1) export yixli_last_src_hotels_ab_pl from Hive into comma-delimited files
# under /home/yixli/temp, (2) strip the \N markers from those files with sed (presumably
# Hive's NULL placeholder - confirm), (3) concatenate the parts into last_src_hotels_ab_pl.csv,
# (4) log in with aws_key_gen and copy the CSV to S3.
# The resulting CSV has no header row.
'''
ssh Chwxedwhdc002.datawarehouse.expecn.com

hive -e "use eandev; INSERT OVERWRITE LOCAL DIRECTORY '/home/yixli/temp' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select * from yixli_last_src_hotels_ab_pl;"&

cd temp
ls | xargs -I '{}' -n1 sed -i 's/\\N//g' '{}' &
cd ..
cat temp/* >last_src_hotels_ab_pl.csv


~/.linuxbrew/bin/aws_key_gen login

aws s3 cp last_src_hotels_ab_pl.csv s3://ege-ds-workshops-corp/yixli/data_preparation/last_src_hotels_ab_pl.csv

'''

In [9]:
# Load the exported above-PL hotel rows from S3.
# The file is read without a header row and without schema inference, so the column names
# and types are declared explicitly, in file order; every column is nullable.
schema = (
    T.StructType()
    .add("message_id", T.StringType(), True)
    .add("hotel_id", T.IntegerType(), True)
    .add("tuid", T.IntegerType(), True)
    .add("check_in_date", T.DateType(), True)
    .add("check_out_date", T.DateType(), True)
    .add("hotel_result_index", T.IntegerType(), True)
    .add("message_date", T.TimestampType(), True)
    .add("rn", T.IntegerType(), True)
)

file_loc = "s3://ege-ds-workshops-corp/yixli/data_preparation/last_src_hotels_ab_pl.csv"

last_hotels_above_pl_df = (
    sqlContext.read
    .format('csv')
    .options(header='false', inferSchema='false', delimiter=',')
    .schema(schema)
    .load(file_loc)
)

print(last_hotels_above_pl_df.count())

In [None]:
# The below-PL hotels were limited to rows with a non-null score_1; prepare the score data
# so the above-PL hotels can be limited the same way.
# Reads the score dataset, projects score_1 plus the search/hotel key columns, and keeps
# only rows where score_1 is present.
dispHotelsDF = (
    sqlContext.read
    .parquet('s3://ege-ds-workshops-corp/yixli/prediction/rate_score')
    .select(
        'score_1', "message_id", "tuid", "hotel_id",
        "check_in_date", "check_out_date", "hotel_result_index",
    )
    .where(F.col('score_1').isNotNull())
)

In [11]:
# Restrict the above-PL hotels to those that have a non-null score in dispHotelsDF.
#
# dispHotelsDF contributes no columns here, it only decides which rows survive, so a
# left-semi join is used instead of an inner join: an inner join emits one output row per
# matching score row, which would duplicate a hotel whenever its key occurs more than once
# in the score data and inflate the booking counts and revenue computed from this frame.
# NOTE(review): whether the score data repeats keys is not visible from this notebook; if it
# never does, the result is identical to the previous inner join.
last_hotels_above_pl_df = last_hotels_above_pl_df.join(
    dispHotelsDF.select("message_id", "tuid", "hotel_id", "check_in_date", "check_out_date", "hotel_result_index"),
    on=["message_id", "tuid", "hotel_id", "check_in_date", "check_out_date", "hotel_result_index"],
    how='left_semi',
)

In [12]:
# Number of above-PL hotel rows left after the join with the score data.
# count = 21521027
last_hotels_above_pl_df.count()

21521027

## 2.2 Bookings above PL

In [16]:
# Drop outlier bookings (total_supply_revenue of 1000 or more) before the above-PL join.
# Same threshold as the filter applied ahead of the below-PL join.
bookings_df = bookings_df.where(bookings_df['total_supply_revenue'] < 1000)

In [18]:
# Match bookings to the above-PL hotels on hotel, stay dates and traveler.
# NOTE(review): the below-PL join drops bookings with bk_rate_type 'NEG' first; no such
# filter is applied here - confirm the difference is intended.
bookings_above_pl = bookings_df.join(
    last_hotels_above_pl_df,
    on=["hotel_id", "check_in_date", "check_out_date", "tuid"],
)

In [24]:
# Supply revenue and booking count per issue month (above PL).
bookings_above_pl = bookings_above_pl.withColumn('month', F.month('issue_date'))
(
    bookings_above_pl
    .select('month', 'total_supply_revenue', 'message_id')
    .groupBy('month')
    .agg(
        F.sum("total_supply_revenue"),
        F.count('message_id'),
    )
    .show()
)

+-----+-------------------------+-----------------+
|month|sum(total_supply_revenue)|count(message_id)|
+-----+-------------------------+-----------------+
|   12|        5969389.312911607|           204070|
|   10|        8102562.197199021|           273320|
|   11|         7037667.46557873|           249944|
+-----+-------------------------+-----------------+