In [2]:
import datetime
import os
from datetime import date

import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import desc
from pyspark.sql.functions import row_number
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
def getJoinDf(bookings_df, search_df, max_supply_revenue=1000, excluded_rate_type='NEG',
              join_keys=("hotel_id", "check_in_date", "check_out_date", "tuid")):
    """Filter bookings and join them to search rows on the shared key columns.

    Parameters
    ----------
    bookings_df : DataFrame
        Bookings; must expose ``total_supply_revenue``, ``bk_rate_type`` and
        every column listed in ``join_keys``.
    search_df : DataFrame
        Search rows; must expose every column listed in ``join_keys``.
    max_supply_revenue : number, optional
        Exclusive upper bound on ``total_supply_revenue``; rows at or above it
        are treated as outliers and dropped (default 1000).
    excluded_rate_type : str, optional
        Value of ``bk_rate_type`` to drop before joining (default ``'NEG'``).
    join_keys : sequence of str, optional
        Column names used as the equi-join key.

    Returns
    -------
    DataFrame
        The filtered bookings joined to ``search_df`` with the default join
        type of ``DataFrame.join``.

    NOTE(review): if ``search_df`` holds several rows for one combination of
    ``join_keys``, each matching booking is repeated once per search row, which
    inflates any sum or count taken over the result -- confirm that the caller
    de-duplicates on the join keys when totals are needed.
    """
    # filter outliers
    capped_df = bookings_df.filter(F.col('total_supply_revenue') < max_supply_revenue)
    # drop the excluded rate type, then attach the search columns
    eligible_df = capped_df.filter(F.col('bk_rate_type') != excluded_rate_type)
    return eligible_df.join(search_df, list(join_keys))

In [5]:
def getSupplyRevenue(bookings_df,search_df):
    """Return the summed ``total_supply_revenue`` of bookings matched to searches.

    The bookings are filtered and joined to ``search_df`` by ``getJoinDf``;
    the ``total_supply_revenue`` column of the result is then summed and the
    single aggregate value is collected to the driver.

    Parameters
    ----------
    bookings_df : DataFrame
        Bookings, passed straight to ``getJoinDf``.
    search_df : DataFrame
        Search rows, passed straight to ``getJoinDf``.

    Returns
    -------
    number
        The summed revenue, or ``0.0`` when the aggregate comes back as
        ``None`` (no rows with a value survived the filters and the join).
    """
    joined_df = getJoinDf(bookings_df, search_df)
    # agg() on an ungrouped frame yields exactly one row with one column
    total = joined_df.agg(F.sum("total_supply_revenue")).collect()[0][0]
    # the sum is None when nothing was aggregated; report that as zero revenue
    return 0.0 if total is None else total

## Booking revenue for hotels below PL 

## 1.1 Read search data

In [6]:
# Load the rate-level DataFrame that the hotel revenue estimation was based on.
rate_all_usd_df = sqlContext.read.parquet(
    's3://ege-ds-workshops-corp/yixli/data_preparation/rate_all_usd_2019'
)

In [14]:
# Keep only rows that carry a score_1 value, project them onto the search and
# hotel identifier columns, and remove exact duplicate rows.
last_hotels_below_pl_df = (
    rate_all_usd_df
    .where(F.col('score_1').isNotNull())
    .select(
        "message_id", "hotel_id", "check_in_date", "check_out_date", "tuid",
        "hotel_index", "bk_hotel_index", 'hotel_result_index',
    )
    .dropDuplicates()
)

In [15]:
# Number of de-duplicated search rows kept; count() is a Spark action, so this runs the job.
last_hotels_below_pl_df.count()

9826186

## 1.2 Bookings below PL

In [16]:
# Redshift query that collects hotel bookings.
#
# * Source: public.HotelFact, joined to the daily exchange rates (to USD), the
#   traveler account and group account dimensions, and public.hotel_supply
#   (matched on issue year, issue quarter, rate type bucket and travel product).
# * Amount columns are multiplied by the exchange rate (1 when it is NULL).
# * total_supply_revenue = total_commission + total_markup + total_gds, where
#   total_gds is the constant 4.3 per row
#   (NOTE(review): presumably a flat GDS fee in USD -- confirm).
# * Issue dates are restricted to [2019-10-01, 2020-01-01): the upper bound is
#   exclusive so that bookings issued on 2020-01-01 do not enter the Q4-2019 set.
#
# Newlines are collapsed to spaces at the end, so the statement is sent as a
# single line: do NOT add `--` comments inside the SQL, they would comment out
# everything that follows.
sqlBookings = """SELECT
      a.hotelfactid,
      a.hotelid as hotel_id,
     trunc(a.issuedate) as issue_date,
     trunc(a.traveldatestart) as check_in_date,
     trunc(a.traveldateend) as check_out_date,
     hsp.quarter as quarter,
      (case when a.customersystemid=1 then a.TUIDTraveler else g.TUIDInternal end) as tuid,
      a.travelproductid AS travel_product_id,
      a.GroupAccountID AS group_account_id,
      hsp.rate_type AS hotel_rate_type,
      (case  
        when a.hotelrateTypeSupplyid IN (1,2,3,4,5,10,11,14,16,17,18,20,25,26,28,29,30,32) then 'GDS'
        when a.hotelrateTypeSupplyid IN (12,13,15,19,21,22,23,27,31,33,34) then 'NEG'
        when a.hotelrateTypeSupplyid = 6 then 'ESRA'
        when a.hotelrateTypeSupplyid = 7 then 'ESRM'
        when a.hotelrateTypeSupplyid = 8 then 'EPRA'
        when a.hotelrateTypeSupplyid = 9 then 'EPRM'
        when a.hotelrateTypeSupplyid IN (24,35) then 'HOTMIP'
        else 'UNK'
        end) as bk_rate_type,
      CASE WHEN a.OnlineBool = 1 THEN 'online' ELSE 'offline' END AS booking_type,
      a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS gross_booking_value_usd,
      a.bookingamtcommissionest::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)
        + a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)*NVL(hsp.commission,0.00) AS total_commission,
      a.bookingamtmargin::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS total_markup,
      4.3 AS total_gds,
      NVL(total_commission,0.00) + NVL(total_markup,0.00) + NVL(total_gds,0.00) AS total_supply_revenue
FROM public.HotelFact a
  JOIN public.ExchangeRateDailySubset ex
    ON a.IssueDateTimeID = ex.TimeID AND a.CurrencyCode = ex.FromCurrencyCode AND ex.ToCurrencyCode = 'USD'
  JOIN TravelerAccountDim g
    ON a.TUIDTraveler = g.TUID AND a.customersystemid=g.customersystemid
  JOIN public.GroupAccountdim d
    ON a.GroupAccountID = d.GroupAccountID
  JOIN public.hotel_supply hsp
    ON hsp.year = EXTRACT (year FROM a.IssueDate)
    AND hsp.quarter = EXTRACT (quarter FROM a.IssueDate)
    AND hsp.rate_type = CASE
         WHEN a.HotelRateTypeSupplyID IN (7,9,24) THEN 'Expedia Collect'
         WHEN a.HotelRateTypeSupplyID IN (6,8,35) THEN 'Hotel Collect'
         WHEN a.HotelRateTypeSupplyID IN (12,13,15,19,21,22,23,27,31,33,34) THEN 'Negotiated'
         ELSE 'Published & GDS' END
    AND hsp.travel_product_id = a.TravelProductID
WHERE a.CustomerSystemID IN (1,2)
AND   (d.groupaccountinternaltypeid = 1 OR d.groupaccountinternaltypeid = 3)
AND a.BookingTypeID in (1,3)
and a.issuedate>=to_date('20191001','YYYYMMDD')
and a.issuedate<to_date('20200101','YYYYMMDD')""".replace('\n',' ')

# Redshift credentials come from the environment so that no secret is stored in
# the notebook. Export REDSHIFT_USER and REDSHIFT_PASSWORD before starting the
# kernel; a missing variable raises KeyError on this line.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in the notebook history and should be rotated.
connection_string = (
    "jdbc:redshift://egencia-reporting.czjkedodj6lc.us-west-2.redshift.amazonaws.com:5439/egedatamart"
    "?user={user}&password={password}"
).format(user=os.environ["REDSHIFT_USER"], password=os.environ["REDSHIFT_PASSWORD"])

# Execute the bookings query on Redshift through the spark-redshift data source;
# the S3 location below is handed to the connector as its temp directory.
bookings_df = (
    sqlContext.read
    .format("com.databricks.spark.redshift")
    .option("url", connection_string)
    .option("query", sqlBookings)
    .option("tempdir", "s3a://ege-ds-workshops-corp/yixli/")
    .load()
)

# Mark the frame for caching, then count it: the count both materialises the
# cache and displays the number of booking rows.
bookings_df.cache()
bookings_df.count()

1999891

In [17]:
# Bookings (after getJoinDf's revenue and rate-type filters) joined to the search rows in last_hotels_below_pl_df.
bookings_below_pl = getJoinDf(bookings_df,last_hotels_below_pl_df)

In [18]:
# Number of rows produced by the bookings/search join; count() is a Spark action, so this executes the join.
bookings_below_pl.count()

514721

In [19]:
# Grand total of total_supply_revenue over all joined booking rows.
(
    bookings_below_pl
    .select('total_supply_revenue')
    .agg(F.sum("total_supply_revenue"))
    .show()
)

+-------------------------+
|sum(total_supply_revenue)|
+-------------------------+
|      2.635564624041331E7|
+-------------------------+

In [13]:
# Supply revenue and number of joined booking rows per calendar month of issue_date.
# NOTE(review): this cell's recorded execution count (13) is lower than that of
# the cells above it; re-run the notebook top to bottom to refresh the output.
bookings_below_pl = bookings_below_pl.withColumn('month', F.month('issue_date'))
(
    bookings_below_pl
    .select('month', 'total_supply_revenue')
    .groupBy('month')
    .agg(F.sum("total_supply_revenue"), F.count('month'))
    .show()
)

+-----+-------------------------+------------+
|month|sum(total_supply_revenue)|count(month)|
+-----+-------------------------+------------+
|   12|        6923447.194805484|      138121|
|   10|     1.0524547245132282E7|      197153|
|   11|        8907651.800475556|      179447|
+-----+-------------------------+------------+