In [3]:
import datetime
import os

import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import desc
from pyspark.sql.functions import row_number
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

## Obtain last_idx_peterman_line

In [None]:
'''
use eandev; 
drop table if exists eandev.yixli_last_idx_peterman_line;
create table eandev.yixli_last_idx_peterman_line as    
select  
  a.message_id,
  IF(b.message_id is null, 0, b.last_idx_peterman_line) as last_idx_peterman_line,
  IF(c.message_id is null, 0, c.last_hotel_result_index) as last_hotel_result_index
from eandev.HotelMessages as a 
left join (
  select  
   message_id
  ,MAX(v1.hotel_result_index) as last_idx_peterman_line
  from eandev.HotelMessages 
  lateral view explode(hotels) hotel_tbl as v1
  where load_date between '20200101' and '20200107'
  and ((v1.property_level_neg = true) or (v1.chain_level_neg = true) or (v1.preferred = true))
  and message_id is not null 
  and message_id <> ''
  and ((original_message_id is null) or (original_message_id == '')) 
  and type = 'HSR'  
  group by message_id
) as b ON a.message_id = b.message_id
left join (
  select  
   message_id
  ,MAX(v1.hotel_result_index) as last_hotel_result_index
  from eandev.HotelMessages 
  lateral view explode(hotels) hotel_tbl as v1
  where load_date between '20200101' and '20200107'
  and message_id is not null 
  and message_id <> ''
  and ((original_message_id is null) or (original_message_id == '')) 
  and type = 'HSR'  
  group by message_id
) as c ON a.message_id = c.message_id
where a.load_date between '20200101' and '20200107'
and a.message_id is not null 
and a.message_id <> ''
and ((original_message_id is null) or (original_message_id == '')) 
and a.type = 'HSR'
'''

In [4]:
# Explicit schema for the headerless peterman-line CSV: three nullable
# columns, in file order.
peterman_schema = (
    T.StructType()
    .add("message_id", T.StringType(), True)                 # 1
    .add("last_idx_peterman_line", T.IntegerType(), True)    # 2
    .add("last_hotel_result_index", T.IntegerType(), True)   # 3
)

In [5]:
# CSV export of the peterman-line table.
# The extract covers load_date between '20200101' and '20200107'.
file_loc = "s3://ege-ds-workshops-corp/yixli/last_idx_peterman_line.csv"

# The file has no header row and is comma-delimited; column names and types
# come from peterman_schema instead of schema inference.
peterman_df = (
    sqlContext.read
    .format('csv')
    .option('header', 'false')
    .option('inferSchema', 'false')
    .option('delimiter', ',')
    .schema(peterman_schema)
    .load(file_loc)
)

# Row count of the loaded table.
print(peterman_df.count())

634458

In [7]:
# Preview the first 50 rows of the loaded peterman-line table.
peterman_df.show(50)

+--------------------+----------------------+-----------------------+
|          message_id|last_idx_peterman_line|last_hotel_result_index|
+--------------------+----------------------+-----------------------+
|00c46dc3-473b-425...|                     5|                     29|
|01843d83-6216-497...|                     3|                      3|
|0255f5ed-9752-4c1...|                     0|                     29|
|02bd63a6-0c92-4bd...|                     0|                      0|
|03cb5780-8119-4d0...|                    29|                     29|
|044a2221-4c35-45b...|                     0|                     24|
|04b61bf1-f355-485...|                     0|                     29|
|058cd6c0-52ac-411...|                    10|                     29|
|05c3e198-22b1-4b4...|                     0|                      9|
|05ee11cd-f44b-4e3...|                     0|                     30|
|074138fd-bc90-4c8...|                    17|                     29|
|08c85ca6-ee6b-436..

## Search data

In [None]:
'''
use eandev;

drop table if exists eandev.yixli_search_commission;
create table eandev.yixli_search_commission as   
select 
message_id
,message_timestamp as message_date
,check_in_date
,check_out_date
,tuid
,v1.hotel_id
,v1.hotel_result_index
,v2.rate_index
,v2.price.amount as rate_amount
,v2.price.currency as rate_currency
,(case when v2.rate_type = 'GDS_CHAIN_NEGOTIATED' then 'NEG'                             
  when v2.rate_type = 'GDS_CONSORTIA' then 'GDS'
  when v2.rate_type = 'GDS_NEGOTIATED' then 'NEG'
  when v2.rate_type = 'GDS_PUBLISHED' then 'GDS'                             
  when v2.rate_type = 'EPR_MERCHANT' then 'EPRM'
  when v2.rate_type = 'ESR_DIRECT_AGENCY' then 'ESRA'
  when v2.rate_type = 'EPR_DIRECT_AGENCY' then 'EPRA'
  when v2.rate_type = 'ESR_MERCHANT' then 'ESRM'
  else v2.rate_type end) as rate_type
,v3.commission_base
,v3.base_category
,v3.commission_tax
,v3.tax_category
,v3.currency
from eandev.HotelMessages
lateral view explode(travelers) travl_tbl as tuid
lateral view explode(hotels) hotel_tbl as v1
lateral view explode(v1.prices) rate_tbl as v2
lateral view explode(v2.commission_list) cms_tbl as v3 
where load_date between '20200101' and '20200107'
and message_id is not null 
and message_id <> ''
and ((original_message_id is null) or (original_message_id == ''))
and v2.selected = True
and type = 'HSR'
'''

In [None]:
'''
ssh Chwxedwhdc002.datawarehouse.expecn.com

hive -e "use eandev; INSERT OVERWRITE LOCAL DIRECTORY '/home/yixli/temp' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select * from yixli_search_commission;"&

cd temp
ls | xargs -I '{}' -n1 sed -i 's/\\N//g' '{}' &
cd ..
cat temp/* >search_commission.csv


~/aws_key_gen login

aws s3 cp search_commission.csv s3://ege-ds-workshops-corp/yixli/search_commission.csv


'''

In [6]:
# Explicit schema for the headerless search/commission CSV: sixteen nullable
# columns, in file order. The last column is the commission currency
# (selected as v3.currency in the Hive extract).
search_schema = (
    T.StructType()
    .add("message_id", T.StringType(), True)            # 1
    .add("message_date", T.TimestampType(), True)       # 2
    .add("check_in_date", T.DateType(), True)           # 3
    .add("check_out_date", T.DateType(), True)          # 4
    .add("tuid", T.IntegerType(), True)                 # 5
    .add("hotel_id", T.IntegerType(), True)             # 6
    .add("hotel_result_index", T.IntegerType(), True)   # 7
    .add("rate_index", T.IntegerType(), True)           # 8
    .add("rate_amount", T.FloatType(), True)            # 9
    .add("rate_currency", T.StringType(), True)         # 10
    .add("rate_type", T.StringType(), True)             # 11
    .add("commission_base", T.FloatType(), True)        # 12
    .add("base_category", T.StringType(), True)         # 13
    .add("commission_tax", T.FloatType(), True)         # 14
    .add("tax_category", T.StringType(), True)          # 15
    .add("commission_currency", T.StringType(), True)   # 16
)


In [7]:
# CSV export of the search/commission table.
# The extract covers load_date between '20200101' and '20200107'.
file_loc = "s3://ege-ds-workshops-corp/yixli/search_commission.csv"
print("Collecting search the data...")

# Headerless, comma-delimited file parsed with the explicit search_schema.
search_df = (
    sqlContext.read
    .format('csv')
    .option('header', 'false')
    .option('inferSchema', 'false')
    .option('delimiter', ',')
    .schema(search_schema)
    .load(file_loc)
)

# Keep rows with a positive hotel_id and a non-null rate, then number the rows
# of each (message_id, tuid) group by ascending hotel_result_index.
# NOTE(review): the Hive extract explodes hotels, prices and commission_list,
# so the input can hold several rows per hotel. hotel_index is therefore a row
# counter within the group, not a per-hotel rank, and rows tied on
# hotel_result_index are numbered in arbitrary order. Confirm this is intended.
search_df = (
    search_df
    .filter((F.col("hotel_id") > 0) & F.col("rate_amount").isNotNull())
    .withColumn(
        "hotel_index",
        F.row_number().over(
            Window
            .partitionBy("message_id", "tuid")
            .orderBy(F.col("hotel_result_index").asc())
        ),
    )
)
print(search_df.count())

Collecting search the data...
183500658

In [13]:
# Preview 50 rows of the filtered search data, including the derived hotel_index column.
search_df.show(50)

+--------------------+--------------------+-------------+--------------+--------+--------+------------------+----------+-----------+-------------+---------+---------------+-------------+--------------+------------+-------------------+-----------+
|          message_id|        message_date|check_in_date|check_out_date|    tuid|hotel_id|hotel_result_index|rate_index|rate_amount|rate_currency|rate_type|commission_base|base_category|commission_tax|tax_category|commission_currency|hotel_index|
+--------------------+--------------------+-------------+--------------+--------+--------+------------------+----------+-----------+-------------+---------+---------------+-------------+--------------+------------+-------------------+-----------+
|0046e02b-7222-45b...|2020-01-06 23:27:...|   2020-01-12|    2020-01-14|20567003|  564732|                 0|         2|     230.41|          SGD|     ESRM|         149.99|         BASE|         50.02|       TAXES|                CNY|          1|
|0046e02b-72

## Filter hotels below peterman_line

In [8]:
# Expose both DataFrames to Spark SQL under the names used by the join query.
# createOrReplaceTempView replaces registerTempTable, which is deprecated in
# Spark 2.x; re-running the cell simply replaces the existing views.
search_df.createOrReplaceTempView("search_df")
peterman_df.createOrReplaceTempView("peterman_df")

In [9]:
# Keep only the search rows whose hotel_result_index is strictly greater than
# the message's last_idx_peterman_line, i.e. hotels listed below the last
# negotiated/preferred hotel of that search (see the Hive extract note).
#
# No ORDER BY here: dropDuplicates() does not guarantee row order, so a global
# sort before it would be wasted work on a very large result.
#
# NOTE(review): the Hive extract stores 0 as last_idx_peterman_line when a
# message has no negotiated/preferred hotel, so for those messages the hotel at
# index 0 is excluded by the strict ">" comparison. Confirm this is intended.
search_below_peterman_df = sqlContext.sql("""
    select b.*
    from search_df as b
    join peterman_df as c on (b.message_id = c.message_id)
    where b.hotel_result_index > c.last_idx_peterman_line
""").dropDuplicates()

In [10]:
# Row count after keeping only hotels below the peterman line.
print(search_below_peterman_df.count())

94670179

In [17]:
# Preview 50 rows of the below-peterman-line search data.
search_below_peterman_df.show(50)

+--------------------+--------------------+-------------+--------------+--------+--------+------------------+----------+-----------+-------------+---------+---------------+-------------+--------------+------------+-------------------+-----------+
|          message_id|        message_date|check_in_date|check_out_date|    tuid|hotel_id|hotel_result_index|rate_index|rate_amount|rate_currency|rate_type|commission_base|base_category|commission_tax|tax_category|commission_currency|hotel_index|
+--------------------+--------------------+-------------+--------------+--------+--------+------------------+----------+-----------+-------------+---------+---------------+-------------+--------------+------------+-------------------+-----------+
|00000fc7-f7f1-449...|2020-01-07 09:57:...|   2020-01-26|    2020-02-01|20921655|  904882|                27|         0|     199.98|          USD|     EPRM|          24.24|         BASE|          8.21|       TAXES|                USD|        709|
|00000fc7-f7

In [25]:
 # Count the rows whose rate currency equals their commission currency.
 print(search_below_peterman_df.filter(F.col("rate_currency")==F.col("commission_currency")).count())

58860003

## Exchange currency to USD

In [11]:
# Daily exchange rates into USD between 2020-01-01 and 2020-04-01, one row per
# (rate timestamp, source currency).
sqlCurrency = """select DISTINCT
                 a.exchangeratedate as timestamp,
                 trunc(a.exchangeratedate) as exch_rate_date,
                 a.fromcurrencycode,
                 a.exchangerate
                 from public.exchangeratedailyfull a
                 where trunc(a.exchangeratedate)>=to_date('20200101','YYYYMMDD') 
                 and trunc(a.exchangeratedate)<=to_date('20200401','YYYYMMDD') 
                 and tocurrencycode = 'USD' """

# Credentials must never be committed with the notebook: they are read from the
# REDSHIFT_USER / REDSHIFT_PASSWORD environment variables, and a missing
# variable raises KeyError before any connection is attempted.
# The values are inserted into the JDBC URL unchanged, as before.
connection_string = (
    "jdbc:redshift://egencia-reporting.czjkedodj6lc.us-west-2.redshift.amazonaws.com:5439/egedatamart"
    "?user=" + os.environ["REDSHIFT_USER"]
    + "&password=" + os.environ["REDSHIFT_PASSWORD"]
)

# Run the query through the spark-redshift connector (staging via the S3
# tempdir), de-duplicate, and sort by date then source currency for display.
currency_df = sqlContext.read.\
    format("com.databricks.spark.redshift").\
    option("url", connection_string).\
    option("query", sqlCurrency).\
    option("tempdir", "s3a://ege-ds-workshops-corp/yixli/").\
    load().\
    dropDuplicates().\
    orderBy(["exch_rate_date","fromcurrencycode"],ascending=True)

currency_df.show(50)

+-------------------+--------------+----------------+-------------------+
|          timestamp|exch_rate_date|fromcurrencycode|       exchangerate|
+-------------------+--------------+----------------+-------------------+
|2020-01-01 00:00:00|    2020-01-01|             AED|  0.272257010618023|
|2020-01-01 00:00:00|    2020-01-01|             AFN| 0.0129614435937417|
|2020-01-01 00:00:00|    2020-01-01|             ALL|0.00916170407695831|
|2020-01-01 00:00:00|    2020-01-01|             AMD|0.00208768267223382|
|2020-01-01 00:00:00|    2020-01-01|             ANG|  0.566059096569682|
|2020-01-01 00:00:00|    2020-01-01|             AOA|0.00207380248275633|
|2020-01-01 00:00:00|    2020-01-01|             ARS| 0.0167021029617839|
|2020-01-01 00:00:00|    2020-01-01|             AUD|  0.701803635342831|
|2020-01-01 00:00:00|    2020-01-01|             AWG|  0.558659217877095|
|2020-01-01 00:00:00|    2020-01-01|             AZN|  0.587958607714017|
|2020-01-01 00:00:00|    2020-01-01|  

In [12]:
# Expose the filtered search data and the exchange rates to Spark SQL for the
# USD conversion query. createOrReplaceTempView replaces registerTempTable,
# which is deprecated in Spark 2.x.
search_below_peterman_df.createOrReplaceTempView("search_below_peterman_df")
currency_df.createOrReplaceTempView("currency_df")

In [13]:
# Convert the search amounts to USD using the exchange rate of the message's
# calendar date: ex1 is matched on the rate currency, ex2 on the commission
# currency. Both joins are LEFT joins, so every search row is kept.
# NOTE(review): COALESCE(..., 1) means a row with no matching exchange rate
# keeps its original amount as if it were already USD - confirm that this
# fallback is acceptable for non-USD currencies.
# dropDuplicates() removes rows that are identical across all columns.
search_below_peterman_usd_df = sqlContext.sql("select\
                             a.*,\
                             a.rate_amount*COALESCE(ex1.exchangerate,1) AS src_rate_amount_usd,\
                             a.commission_base*COALESCE(ex2.exchangerate,1) AS src_commission_base_usd,\
                             a.commission_tax*COALESCE(ex2.exchangerate,1) AS src_commission_tax_usd\
                             from search_below_peterman_df a\
                             left join currency_df ex1 on to_date(a.message_date)=ex1.exch_rate_date and a.rate_currency=ex1.FromCurrencyCode\
                             left join currency_df ex2 on to_date(a.message_date)=ex2.exch_rate_date and a.commission_currency=ex2.FromCurrencyCode").\
                             dropDuplicates()

In [14]:
# Drop rows that have no USD commission base.
has_commission_base = F.col("src_commission_base_usd").isNotNull()
search_below_peterman_usd_df = search_below_peterman_usd_df.where(has_commission_base)

In [15]:
# Row count after removing rows without a USD commission base.
search_below_peterman_usd_df.count()

94669936

In [18]:
# Spot-check the conversion: original rate and currency next to the derived USD columns.
search_below_peterman_usd_df.select("rate_amount","rate_currency","src_rate_amount_usd","src_commission_base_usd","src_commission_tax_usd").show(50)

+-----------+-------------+-------------------+-----------------------+----------------------+
|rate_amount|rate_currency|src_rate_amount_usd|src_commission_base_usd|src_commission_tax_usd|
+-----------+-------------+-------------------+-----------------------+----------------------+
|    1463.99|          SEK| 156.29063319857548|     35.359930097970924|    2.1212542664467566|
|     1508.0|          SEK| 160.98899339176492|      25.87353616491249|    3.1044827026193893|
|    1635.84|          SEK| 174.63675692800481|      39.51062827380042|      2.37106458610164|
|    1635.84|          SEK| 174.63675692800481|      39.51062827380042|      2.37106458610164|
|     1685.0|          SEK| 179.88491635618297|     28.909693266108935|    3.4695903748225203|
|     1685.0|          SEK| 179.88491635618297|     28.909693266108935|    3.4695903748225203|
|    1872.71|          SEK| 199.92419862470794|      45.23171551936102|     2.713753464390683|
|     1929.0|          SEK| 205.93353332408128|   

## Booking data

In [None]:
'''
wget https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.20.1043/RedshiftJDBC42-no-awssdk-1.2.20.1043.jar
wget https://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/3.2.0/spark-avro_2.11-3.2.0.jar
wget https://repo1.maven.org/maven2/com/databricks/spark-redshift_2.11/2.0.1/spark-redshift_2.11-2.0.1.jar
wget https://repo1.maven.org/maven2/com/databricks/spark-csv_2.11/1.5.0/spark-csv_2.11-1.5.0.jar
wget https://repo1.maven.org/maven2/com/eclipsesource/minimal-json/minimal-json/0.9.5/minimal-json-0.9.5.jar
sudo cp *.jar /usr/lib/spark/jars
'''

In [18]:
# Hotel bookings issued between 2020-01-01 and 2020-04-01 with amounts converted
# to USD and the supply revenue components (commission, markup, GDS) per booking.
# Newlines are replaced by spaces before the query is handed to the connector.
# NOTE(review): ExchangeRateDailySubset is INNER-joined, so the
# COALESCE(ex.exchangerate, 1) fallback only applies when the stored rate itself
# is NULL; bookings without a rate row are dropped. Confirm this is intended.
sqlBookings = """SELECT 
      a.hotelfactid,
      a.hotelid as hotel_id,
     trunc(a.issuedate) as issue_date,
     trunc(a.traveldatestart) as check_in_date,
     trunc(a.traveldateend) as check_out_date,
      (case when a.customersystemid=1 then a.TUIDTraveler else g.TUIDInternal end) as tuid,      
      a.travelproductid AS travel_product_id,
      a.GroupAccountID AS group_account_id,
      hsp.rate_type AS hotel_rate_type,
      (case  
        when a.hotelrateTypeSupplyid IN (1,2,3,4,5,10,11,14,16,17,18,20,25,26,28,29,30,32) then 'GDS'
        when a.hotelrateTypeSupplyid IN (12,13,15,19,21,22,23,27,31,33,34) then 'NEG'
        when a.hotelrateTypeSupplyid = 6 then 'ESRA'
        when a.hotelrateTypeSupplyid = 7 then 'ESRM'
        when a.hotelrateTypeSupplyid = 8 then 'EPRA'
        when a.hotelrateTypeSupplyid = 9 then 'EPRM'
        when a.hotelrateTypeSupplyid IN (24,35) then 'HOTMIP'
        else 'UNK'
        end) as bk_rate_type,
      CASE WHEN a.OnlineBool = 1 THEN 'online' ELSE 'offline' END AS booking_type,
      a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS gross_booking_value_usd,
      a.bookingamtcommissionest::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)
        + a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)*NVL(hsp.commission,0.00) AS total_commission,
      a.bookingamtmargin::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS total_markup,
      NVL(hsp.gds,0.00) AS total_gds,
      NVL(total_commission,0.00) + NVL(total_markup,0.00) + NVL(total_gds,0.00) AS total_supply_revenue
FROM public.HotelFact a
  JOIN public.ExchangeRateDailySubset ex
    ON a.IssueDateTimeID = ex.TimeID AND a.CurrencyCode = ex.FromCurrencyCode AND ex.ToCurrencyCode = 'USD'
  JOIN TravelerAccountDim g 
    ON a.TUIDTraveler = g.TUID AND a.customersystemid=g.customersystemid 
  JOIN public.GroupAccountdim d 
    ON a.GroupAccountID = d.GroupAccountID 
  JOIN public.hotel_supply hsp 
    ON hsp.year = EXTRACT (year FROM a.IssueDate)
    AND hsp.quarter = EXTRACT (quarter FROM a.IssueDate)
    AND hsp.rate_type = CASE
         WHEN a.HotelRateTypeSupplyID IN (7,9,24) THEN 'Expedia Collect'
         WHEN a.HotelRateTypeSupplyID IN (6,8,35) THEN 'Hotel Collect'
         WHEN a.HotelRateTypeSupplyID IN (12,13,15,19,21,22,23,27,31,33,34) THEN 'Negotiated'
         ELSE 'Published & GDS' END
    AND hsp.travel_product_id = a.TravelProductID
WHERE a.CustomerSystemID IN (1,2)
AND   (d.groupaccountinternaltypeid = 1 OR d.groupaccountinternaltypeid = 3)
AND a.BookingTypeID in (1,3) 
and a.issuedate>=to_date('20200101','YYYYMMDD') 
and a.issuedate<=to_date('20200401','YYYYMMDD')""".replace('\n',' ')

# Credentials must never be committed with the notebook: they are read from the
# REDSHIFT_USER / REDSHIFT_PASSWORD environment variables, and a missing
# variable raises KeyError before any connection is attempted.
# The values are inserted into the JDBC URL unchanged, as before.
connection_string = (
    "jdbc:redshift://egencia-reporting.czjkedodj6lc.us-west-2.redshift.amazonaws.com:5439/egedatamart"
    "?user=" + os.environ["REDSHIFT_USER"]
    + "&password=" + os.environ["REDSHIFT_PASSWORD"]
)

# Run the query through the spark-redshift connector (staging via the S3 tempdir).
bookings_df = sqlContext.read.\
    format("com.databricks.spark.redshift").\
    option("url", connection_string).\
    option("query", sqlBookings).\
    option("tempdir", "s3a://ege-ds-workshops-corp/yixli/").\
    load()

# Mark the result for caching and materialise it with count(), so later cells
# reuse it instead of querying Redshift again.
bookings_df.cache()
bookings_df.count()

An error occurred while calling o177.load.
: java.sql.SQLException: [Amazon](500310) Invalid operation: column a.rate_amount does not exist;
	at com.amazon.redshift.client.messages.inbound.ErrorResponse.toErrorException(Unknown Source)
	at com.amazon.redshift.client.PGMessagingContext.handleErrorResponse(Unknown Source)
	at com.amazon.redshift.client.PGMessagingContext.handleMessage(Unknown Source)
	at com.amazon.jdbc.communications.InboundMessagesPipeline.getNextMessageOfClass(Unknown Source)
	at com.amazon.redshift.client.PGMessagingContext.doMoveToNextClass(Unknown Source)
	at com.amazon.redshift.client.PGMessagingContext.getParameterDescription(Unknown Source)
	at com.amazon.redshift.client.PGClient.prepareStatement(Unknown Source)
	at com.amazon.redshift.dataengine.PGQueryExecutor.<init>(Unknown Source)
	at com.amazon.redshift.dataengine.PGDataEngine.prepare(Unknown Source)
	at com.amazon.jdbc.common.SPreparedStatement.<init>(Unknown Source)
	at com.amazon.jdbc.jdbc41.S41PreparedS

In [4]:
# Preview 50 booking rows.
bookings_df.show(50)

+-----------+--------+----------+-------------+--------------+--------+-----------------+----------------+---------------+------------+------------+-----------------------+------------------+------------------+---------+--------------------+
|hotelfactid|hotel_id|issue_date|check_in_date|check_out_date|    tuid|travel_product_id|group_account_id|hotel_rate_type|bk_rate_type|booking_type|gross_booking_value_usd|  total_commission|      total_markup|total_gds|total_supply_revenue|
+-----------+--------+----------+-------------+--------------+--------+-----------------+----------------+---------------+------------+------------+-----------------------+------------------+------------------+---------+--------------------+
|   89579450|   10971|2020-01-01|   2020-01-05|    2020-01-09|19466814|            60086|           34972|Expedia Collect|        ESRM|      online|      486.9729434742641|               0.0| 87.63766659007344|    0E-18|   87.63766659007344|
|   89577955| 4027043|2020-01-01

In [28]:
# S3 prefix under which this notebook writes its CSV exports.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the session.
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'

In [29]:
# Export the bookings as a CSV with a header under the S3 prefix held in `dir`.
# The output name is prefixed with today's date (MM-DD-YYYY) and any existing
# output at that path is overwritten. repartition(1) moves all rows into a
# single partition before writing.
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
(
    bookings_df
    .repartition(1)
    .write
    .format('com.databricks.spark.csv')
    .mode('overwrite')
    .save(dir + datestamp + '_bookings_df.csv', header='true')
)

In [32]:
# Average supply revenue, commission and markup per booking, by booked rate type.
(
    bookings_df
    .groupBy('bk_rate_type')
    .agg(
        F.avg("total_supply_revenue"),
        F.avg("total_commission"),
        F.avg("total_markup"),
    )
    .show(20)
)

+------------+-------------------------+---------------------+-----------------+
|bk_rate_type|avg(total_supply_revenue)|avg(total_commission)|avg(total_markup)|
+------------+-------------------------+---------------------+-----------------+
|        EPRA|        45.81221638076596|    45.81221638076596|              0.0|
|         NEG|                      0.0|                  0.0|              0.0|
|        EPRM|        53.72553461177965|                  0.0|53.72553461177965|
|        ESRM|        54.15272217260957|                  0.0| 54.1601880177948|
|      HOTMIP|       58.180847463342154|  0.15645780459399597|58.02438965874817|
|        ESRA|        42.52432011210949|    42.52432011210949|              0.0|
|         GDS|       29.038212791398234|   29.831784144380794|              0.0|
+------------+-------------------------+---------------------+-----------------+

## Matching last message in search chain with booking record

In [17]:
# A booking is matched to search rows for the same hotel, stay dates and traveler.
match_keys = ["hotel_id", "check_in_date", "check_out_date", "tuid"]

# Within each key, the search row with the most recent message_date ranks first.
# NOTE(review): rows sharing the same message_date are ranked in arbitrary
# order, and because the window is keyed on match_keys only, several bookings
# with the same key collapse into one row. Confirm both are acceptable.
latest_search_first = (
    Window
    .partitionBy(*match_keys)
    .orderBy(F.col("message_date").desc())
)

# Rate types flagged as "local" by the is_local indicator.
local_rate_types = ["ESRM", "ESRA", "EPRM", "EPRA", "HOTMIP"]

booking_search_df = (
    bookings_df
    .join(search_below_peterman_usd_df, match_keys)
    .withColumn("rn", F.row_number().over(latest_search_first))
    .filter(F.col("rn") == 1)
    .withColumn(
        "is_local",
        F.when(F.col("bk_rate_type").isin(*local_rate_types), 1).otherwise(0),
    )
    .withColumnRenamed("hotel_index", "bk_hotel_index")
)
booking_search_df.count()

48090

In [20]:
# Show the column names and types of the joined booking/search table.
booking_search_df.printSchema()

root
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: date (nullable = true)
 |-- check_out_date: date (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- hotelfactid: integer (nullable = true)
 |-- issue_date: date (nullable = true)
 |-- travel_product_id: integer (nullable = true)
 |-- group_account_id: integer (nullable = true)
 |-- hotel_rate_type: string (nullable = true)
 |-- bk_rate_type: string (nullable = true)
 |-- booking_type: string (nullable = true)
 |-- gross_booking_value_usd: double (nullable = true)
 |-- total_commission: double (nullable = true)
 |-- total_markup: double (nullable = true)
 |-- total_gds: decimal(20,18) (nullable = true)
 |-- total_supply_revenue: double (nullable = true)
 |-- message_id: string (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- rate_amount: float (nullable = true)
 |-- rate_currency: string (nullable = 

## Adjust the revenue by the length of stay

In [21]:
# Length of stay in days: check-out date minus check-in date, stored as an integer.
stay_days = F.datediff(F.col("check_out_date"), F.col("check_in_date"))
booking_search_df = booking_search_df.withColumn('duration', stay_days.cast(IntegerType()))

In [22]:
# Spot-check the derived duration against the stay dates.
booking_search_df.select("check_out_date","check_in_date",'duration').show(50)

+--------------+-------------+--------+
|check_out_date|check_in_date|duration|
+--------------+-------------+--------+
|    2020-01-09|   2020-01-08|       1|
|    2020-01-18|   2020-01-14|       4|
|    2020-01-22|   2020-01-21|       1|
|    2020-01-03|   2020-01-02|       1|
|    2020-01-24|   2020-01-22|       2|
|    2020-01-17|   2020-01-13|       4|
|    2020-01-30|   2020-01-29|       1|
|    2020-03-05|   2020-03-04|       1|
|    2020-01-21|   2020-01-19|       2|
|    2020-01-09|   2020-01-07|       2|
|    2020-01-17|   2020-01-13|       4|
|    2020-02-08|   2020-02-04|       4|
|    2020-01-25|   2020-01-20|       5|
|    2020-04-30|   2020-04-29|       1|
|    2020-01-10|   2020-01-09|       1|
|    2020-01-14|   2020-01-13|       1|
|    2020-02-29|   2020-02-25|       4|
|    2020-01-23|   2020-01-22|       1|
|    2020-01-09|   2020-01-08|       1|
|    2020-01-18|   2020-01-17|       1|
|    2020-01-17|   2020-01-15|       2|
|    2020-01-17|   2020-01-16|       1|


In [43]:
# Put search-side and booking-side figures on comparable footing:
#   * search commission base is scaled up by the length of stay;
#   * booking totals are divided by the length of stay.
# NOTE(review): src_total_commission_usd adds the commission tax once, without
# multiplying it by duration - confirm the tax is a per-stay amount.
nights = F.col("duration")
booking_search_df = (
    booking_search_df
    .withColumn('src_total_commission_base_usd',
                F.col("src_commission_base_usd") * nights)
    .withColumn('src_total_commission_usd',
                F.col("src_commission_base_usd") * nights + F.col('src_commission_tax_usd'))
    .withColumn('avg_book_rate_amount_usd',
                F.col('gross_booking_value_usd') / nights)
    .withColumn('bk_avg_commission_usd',
                F.col('total_commission') / nights)
    .withColumn('bk_avg_markup_usd',
                F.col('total_markup') / nights)
    .withColumn('bk_avg_revenue_usd',
                F.col('total_supply_revenue') / nights)
)

## Match rates: is booked/not booked

In [25]:
# Absolute gap between the booked per-night rate and the searched rate, both in USD.
rate_gap = F.abs(F.col('avg_book_rate_amount_usd') - F.col('src_rate_amount_usd'))

# A search row counts as "booked" (1) when the gap is below 1.0 USD, else 0.
booking_search_df = (
    booking_search_df
    .withColumn('rate_diff', rate_gap)
    .withColumn('is_booked', F.when(rate_gap < 1.0, 1).otherwise(0))
)

In [26]:
# Number of matched rows flagged as booked.
booking_search_df.filter(F.col('is_booked')==1).count()

7471

In [27]:
# Absolute gap between the booking's total supply revenue and the total
# commission estimated from the search data.
booking_search_df = booking_search_df.withColumn(
    'revenue_diff',
    F.abs(F.col('total_supply_revenue') - F.col('src_total_commission_usd')),
)

In [48]:
# S3 prefix under which this notebook writes its CSV exports.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the session.
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'

In [49]:
# Export the matched booking/search table as a CSV with a header under the S3
# prefix held in `dir`. The output name is prefixed with today's date
# (MM-DD-YYYY) and any existing output at that path is overwritten.
# repartition(1) moves all rows into a single partition before writing.
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
(
    booking_search_df
    .repartition(1)
    .write
    .format('com.databricks.spark.csv')
    .mode('overwrite')
    .save(dir + datestamp + '_booking_search_df.csv', header='true')
)

## Estimated revenue from booking data

In [44]:
# Booking side: average per-night revenue, commission and markup, by booked rate type.
(
    booking_search_df
    .groupBy('bk_rate_type')
    .agg(
        F.avg("bk_avg_revenue_usd"),
        F.avg("bk_avg_commission_usd"),
        F.avg("bk_avg_markup_usd"),
    )
    .show(20)
)

+------------+-----------------------+--------------------------+----------------------+
|bk_rate_type|avg(bk_avg_revenue_usd)|avg(bk_avg_commission_usd)|avg(bk_avg_markup_usd)|
+------------+-----------------------+--------------------------+----------------------+
|        EPRA|     22.182425043361306|        22.182425043361306|                   0.0|
|         NEG|                    0.0|                       0.0|                   0.0|
|        EPRM|      27.66371178499827|                       0.0|     27.66371178499827|
|        ESRM|     25.383485106418156|                       0.0|    25.383485106418156|
|      HOTMIP|     29.886133275140406|        0.0731780576447558|     29.81295521749565|
|        ESRA|      19.96869810542563|         19.96869810542563|                   0.0|
|         GDS|     14.276933739329458|        14.276933739329458|                   0.0|
+------------+-----------------------+--------------------------+----------------------+

## Estimated revenue from search data (all)

In [46]:
# Search side, all matched rows: average USD commission base, and average of
# base plus tax, by booked rate type.
(
    booking_search_df
    .groupBy('bk_rate_type')
    .agg(
        F.avg("src_commission_base_usd"),
        F.avg(F.col("src_commission_base_usd")+F.col("src_commission_tax_usd")),
    )
    .show(20)
)

+------------+----------------------------+-------------------------------------------------------+
|bk_rate_type|avg(src_commission_base_usd)|avg((src_commission_base_usd + src_commission_tax_usd))|
+------------+----------------------------+-------------------------------------------------------+
|        EPRA|           22.54381125534363|                                      26.78441068546667|
|         NEG|          20.396313044972903|                                     24.936022826639817|
|        EPRM|          24.114858376449934|                                     30.299500597828573|
|        ESRM|          21.178081247400634|                                      27.55562114398488|
|      HOTMIP|          26.863897625390745|                                      33.20995771013311|
|        ESRA|           20.44288793814704|                                     25.292741646955722|
|         GDS|           23.72700553091482|                                      32.08309702820891|


## Estimated revenue from search data (is booked)

In [47]:
# Search side, rows flagged as booked only: average USD commission base, and
# average of base plus tax, by booked rate type.
(
    booking_search_df
    .filter(F.col('is_booked') == 1)
    .groupBy('bk_rate_type')
    .agg(
        F.avg("src_commission_base_usd"),
        F.avg(F.col("src_commission_base_usd")+F.col("src_commission_tax_usd")),
    )
    .show(20)
)

+------------+----------------------------+-------------------------------------------------------+
|bk_rate_type|avg(src_commission_base_usd)|avg((src_commission_base_usd + src_commission_tax_usd))|
+------------+----------------------------+-------------------------------------------------------+
|        EPRA|          18.353862493771864|                                      20.46561375108867|
|         NEG|           16.53327200560976|                                     22.589362696174312|
|        EPRM|          19.092465863779392|                                     23.454821298847094|
|        ESRM|           16.67621684259227|                                     21.255125859550542|
|      HOTMIP|          20.276661407143468|                                     24.948094062474308|
|        ESRA|          16.222678519695798|                                     18.163415595705125|
|         GDS|          20.356888765783356|                                      25.92079778032349|


## Probability of booking by rate types

In [38]:
# Per booked rate type: number of matched rows and the share flagged as booked
# (the mean of the 0/1 is_booked indicator).
(
    booking_search_df
    .groupBy('bk_rate_type')
    .agg(
        F.count(F.col('is_booked')),
        F.avg(F.col('is_booked')),
    )
    .show(20)
)

+------------+----------------+--------------------+
|bk_rate_type|count(is_booked)|      avg(is_booked)|
+------------+----------------+--------------------+
|        EPRA|            1824|  0.2362938596491228|
|         NEG|             107|0.009345794392523364|
|        EPRM|            6426|  0.1568627450980392|
|        ESRM|           21778| 0.17784002204059143|
|      HOTMIP|            3592|  0.1723273942093541|
|        ESRA|            5606| 0.20674277559757404|
|         GDS|            8757| 0.03802672147995889|
+------------+----------------+--------------------+

## Probability of booking by rate index

In [39]:
# Per rate_index: number of matched rows and the share flagged as booked
# (the mean of the 0/1 is_booked indicator).
(
    booking_search_df
    .groupBy('rate_index')
    .agg(
        F.count(F.col('is_booked')),
        F.avg(F.col('is_booked')),
    )
    .show(20)
)

+----------+----------------+--------------------+
|rate_index|count(is_booked)|      avg(is_booked)|
+----------+----------------+--------------------+
|        12|             164| 0.06707317073170732|
|         1|            7632| 0.21016771488469602|
|        13|             130|0.046153846153846156|
|         6|            3243| 0.08078939253777366|
|         3|            4549|  0.1424488898659046|
|         5|            3810| 0.10498687664041995|
|        15|               9|                 0.0|
|         9|            1062|0.062146892655367235|
|         4|            4151|  0.1069621777884847|
|         8|            2702| 0.06328645447816432|
|         7|            3060| 0.08398692810457517|
|        10|             754|  0.0649867374005305|
|        11|             639| 0.03912363067292645|
|        14|             111| 0.05405405405405406|
|         2|            6175| 0.13748987854251013|
|         0|            9899|  0.2719466612789171|
+----------+----------------+--