In [2]:
import datetime
import pandas as pd
import numpy as np

from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.session import SparkSession


from pyspark.sql import DataFrameStatFunctions as statFunc

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import IntegerType

In [3]:
%local
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.5f}'.format

In [3]:
sqlBookings = """SELECT 
      a.hotelfactid,
      a.hotelid as hotel_id,
     trunc(a.issuedate) as issue_date,
     trunc(a.traveldatestart) as check_in_date,
     trunc(a.traveldateend) as check_out_date,
     hsp.quarter as quarter,
      (case when a.customersystemid=1 then a.TUIDTraveler else g.TUIDInternal end) as tuid,      
      a.travelproductid AS travel_product_id,
      a.GroupAccountID AS group_account_id,
      hsp.rate_type AS hotel_rate_type,
      (case  
        when a.hotelrateTypeSupplyid IN (1,2,3,4,5,10,11,14,16,17,18,20,25,26,28,29,30,32) then 'GDS'
        when a.hotelrateTypeSupplyid IN (12,13,15,19,21,22,23,27,31,33,34) then 'NEG'
        when a.hotelrateTypeSupplyid = 6 then 'ESRA'
        when a.hotelrateTypeSupplyid = 7 then 'ESRM'
        when a.hotelrateTypeSupplyid = 8 then 'EPRA'
        when a.hotelrateTypeSupplyid = 9 then 'EPRM'
        when a.hotelrateTypeSupplyid IN (24,35) then 'HOTMIP'
        else 'UNK'
        end) as bk_rate_type,
      CASE WHEN a.OnlineBool = 1 THEN 'online' ELSE 'offline' END AS booking_type,
      a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS gross_booking_value_usd,
      a.bookingamtcommissionest::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)
        + a.bookingamtgross::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision)*NVL(hsp.commission,0.00) AS total_commission,
      a.bookingamtmargin::DOUBLE precision*COALESCE(ex.exchangerate,1::DOUBLE precision) AS total_markup,
      NVL(hsp.gds,0.00) AS total_gds,
      NVL(total_commission,0.00) + NVL(total_markup,0.00) + NVL(total_gds,0.00) AS total_supply_revenue
FROM public.HotelFact a
  JOIN public.ExchangeRateDailySubset ex
    ON a.IssueDateTimeID = ex.TimeID AND a.CurrencyCode = ex.FromCurrencyCode AND ex.ToCurrencyCode = 'USD'
  JOIN TravelerAccountDim g 
    ON a.TUIDTraveler = g.TUID AND a.customersystemid=g.customersystemid 
  JOIN public.GroupAccountdim d 
    ON a.GroupAccountID = d.GroupAccountID 
  JOIN public.hotel_supply hsp 
    ON hsp.year = EXTRACT (year FROM a.IssueDate)
    AND hsp.quarter = EXTRACT (quarter FROM a.IssueDate)
    AND hsp.rate_type = CASE
         WHEN a.HotelRateTypeSupplyID IN (7,9,24) THEN 'Expedia Collect'
         WHEN a.HotelRateTypeSupplyID IN (6,8,35) THEN 'Hotel Collect'
         WHEN a.HotelRateTypeSupplyID IN (12,13,15,19,21,22,23,27,31,33,34) THEN 'Negotiated'
         ELSE 'Published & GDS' END
    AND hsp.travel_product_id = a.TravelProductID
WHERE a.CustomerSystemID IN (1,2)
AND   (d.groupaccountinternaltypeid = 1 OR d.groupaccountinternaltypeid = 3)
AND a.BookingTypeID in (1,3) 
and a.issuedate>=to_date('20191001','YYYYMMDD') 
and a.issuedate<=to_date('20200101','YYYYMMDD')""".replace('\n',' ')

connection_string = "jdbc:redshift://egencia-reporting.czjkedodj6lc.us-west-2.redshift.amazonaws.com:5439/egedatamart?user=ds_rpt_user&password=Jan2019!";

bookings_df = sqlContext.read.\
    format("com.databricks.spark.redshift").\
    option("url", connection_string).\
    option("query", sqlBookings).\
    option("tempdir", "s3a://ege-ds-workshops-corp/yixli/").\
    load()

bookings_df.cache()
bookings_df.count()

1999888

In [4]:
bookings_df = bookings_df.withColumn('rate_type',F.when(F.col('bk_rate_type')=='HOTMIP','ESRM').otherwise(F.col('bk_rate_type')))

In [5]:
bookings_df = bookings_df.\
                     withColumn('duration', F.datediff(F.col("check_out_date"),F.col( "check_in_date")).cast(IntegerType()))

In [6]:
bookings_df = bookings_df.\
                     withColumn('avg_book_rate_amount_usd',F.col('gross_booking_value_usd')/F.col( "duration")).\
                     withColumn('bk_avg_commission_usd',F.col('total_commission')/F.col( "duration")).\
                     withColumn('bk_avg_markup_usd',F.col('total_markup')/F.col( "duration")).\
                     withColumn('bk_avg_revenue_usd',F.col('total_supply_revenue')/F.col( "duration"))

In [7]:
bookings_df = bookings_df.filter(F.col('total_supply_revenue')>0)

In [8]:
bk_hotel_id = bookings_df.select('hotel_id','tuid').dropDuplicates()

In [9]:
bk_hotel_id.count()

1178715

## Read search data

In [10]:
search_schema = T.StructType([
    T.StructField("message_id", T.StringType(), True),                  #1
    T.StructField("hotel_id", T.IntegerType(), True),                   #6
     T.StructField("check_in_date", T.DateType(), True),                 #3
    T.StructField("check_out_date", T.DateType(), True),                #4
    T.StructField("tuid", T.IntegerType(), True),                       #5
   
    T.StructField("rate_index", T.IntegerType(), True),                 #8 
       T.StructField("message_date", T.TimestampType(), True),             #2
    
      T.StructField("hotel_result_index", T.IntegerType(), True),         #7           #8
   T.StructField("city", T.StringType(), True),                  #1
     T.StructField("star_rating", T.StringType(), True),                  #1
    
    T.StructField("filter_want_in_policy_rates_only", T.BooleanType(), True),         #7
    T.StructField("filter_eligible_for_loyalty", T.BooleanType(), True),                 #8
    T.StructField("filter_free_breakfast", T.BooleanType(), True),              #12
    T.StructField("filter_free_wifi", T.BooleanType(), True),              #12
    T.StructField("filter_free_parking", T.BooleanType(), True),              #14
    T.StructField("rate_type", T.StringType(), True),                   #11
    T.StructField("eligible_for_loyalty", T.BooleanType(), True),         #7
    T.StructField("free_breakfast", T.BooleanType(), True),              #12
    T.StructField("free_wifi", T.BooleanType(), True),              #12
    T.StructField("free_parking", T.BooleanType(), True) ,             #14
     T.StructField("refundable", T.BooleanType(), True),
    T.StructField("src_rate_amount_usd", T.FloatType(), True),              #12
    T.StructField("src_commission_base_usd", T.FloatType(), True),              #12
    T.StructField("src_supply_revenue_usd", T.FloatType(), True)
])

In [13]:
file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-15-2020_search_revenue_below_peterman_usd_df.csv"
print("Collecting search the data...")
search_revenue_below_peterman_usd_df = sqlContext.read.format('csv').\
            options(header='True', inferSchema='false', delimiter=',').\
            schema(search_schema).load(file_loc)

# load_date between '20191001' and '20200101'
print(search_revenue_below_peterman_usd_df.count())

Collecting search the data...
6528971

In [14]:
search_revenue_below_peterman_usd_df.show(50)

+--------------------+--------+-------------+--------------+--------+----------+--------------------+------------------+--------------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+---------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+
|          message_id|hotel_id|check_in_date|check_out_date|    tuid|rate_index|        message_date|hotel_result_index|                city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|rate_type|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|
+--------------------+--------+-------------+--------------+--------+----------+--------------------+------------------+--------------------+-----------+-----

In [15]:
# Create hotel level df
hotel_df = search_revenue_below_peterman_usd_df.\
select('message_id', 'tuid', 'hotel_id','check_in_date', 'check_out_date','hotel_result_index',"message_date").\
dropDuplicates()
hotel_df.count()

952605

In [16]:
hotel_df = hotel_df.withColumn('hotel_index',row_number().over(Window.partitionBy("message_id", "tuid").\
                                                        orderBy(F.asc("hotel_result_index"))))

In [17]:
hotel_df.orderBy(desc('message_id'),desc('tuid')).select('message_id', 'tuid', 'hotel_id',"hotel_result_index",'hotel_index').show(100)

+--------------------+--------+--------+------------------+-----------+
|          message_id|    tuid|hotel_id|hotel_result_index|hotel_index|
+--------------------+--------+--------+------------------+-----------+
|ffffdb92-6a52-49e...| 6508678|  688097|                 1|          1|
|ffffd68d-70e1-4dd...|18479478|   25155|                 0|          1|
|ffffca07-5fa2-4db...|19803429| 2279569|                13|          1|
|ffffc874-e189-4d1...|15736267|  447796|                 2|          1|
|ffffb7e2-49e8-413...|18668119| 1695656|                 3|          1|
|ffffb56a-d31e-49b...| 9070436|  696551|                 1|          1|
|ffffa551-4d4e-4d4...| 8641771|12709779|                 0|          1|
|ffffa2ab-9a40-410...|16098599| 4512886|                 7|          1|
|ffff9ef1-a522-463...| 4161855|   15067|                 0|          1|
|ffff98d9-fd0c-412...| 6795110|   28025|                 2|          1|
|ffff9803-3555-40a...|20468420| 3269648|                 0|     

In [18]:
# Find the last search
last_search_id = bookings_df.\
                join(hotel_df,["hotel_id","check_in_date", "check_out_date","tuid"]).\
                withColumn("rn", row_number().over(Window.partitionBy("hotel_id","check_in_date","check_out_date","tuid").orderBy(desc("message_date")))).\
                filter(F.col("rn") == 1).\
               select('message_id','tuid')
last_search_id.count()

384685

In [19]:
last_search_hotel_df = hotel_df.join(last_search_id,['message_id','tuid'])
last_search_hotel_df.count()

399962

In [39]:
# add hotel information to search data (last search, bk_hotel_index)
last_search_df = last_search_hotel_df.\
join(search_revenue_below_peterman_usd_df,['message_id', 'tuid', 'hotel_id', 'check_in_date', 'check_out_date', 'hotel_result_index', 'message_date'])

In [40]:
last_search_df.count()

2942308

In [41]:
last_search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: date (nullable = true)
 |-- check_out_date: date (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- refundable:

In [None]:
# add score to last search df

In [25]:
from fast_hotel_sort.utils.SearchDataUtils import SearchDataUtils
from datetime import datetime, timedelta, date

In [26]:
cols = ["message_id"
        ,"timestamp"
        ,"search_date"
        ,"check_in_date"
        ,"check_out_date"
        ,"tuid"]
cols2 = ["message_id"
        ,"timestamp"
        ,"hotel_id"
        ,"star_rating"
        ,"city"
        ,"property_level_neg"
        ,"chain_level_neg"
        ,"preferred"
        ,"score_0"
        ,"score_1"
        ,"hotel_result_index"
        ,"country_code"]
q = SearchDataUtils()
sd = date(2019,10,1)
sd2 = date(2020,1,1)
dispHotelsDF = q.displayed_hotels(sd, sd2,cols, cols2)

In [27]:
dispHotelsDF.select('score_1').show()

+--------------------+
|             score_1|
+--------------------+
| 0.13827994800131627|
| 0.24201505737198709|
|  0.1334948763591221|
| 0.12422731867978883|
| 0.06927887203780084|
| 0.07036597174325393|
| 0.04331228691037983|
| 0.11179248165184742|
| 0.11641849342311002|
| 0.04704464239263926|
| 0.12599100502052407|
| 0.10110583016890139|
| 0.10907288473339308|
| 0.04331228691037983|
|0.047260752266933076|
| 0.08574266835957374|
|  0.1482633350620393|
| 0.10458626200560986|
| 0.04708303256754679|
| 0.12503782390364598|
+--------------------+
only showing top 20 rows

In [28]:
dispHotelsDF.count()

149291990

In [42]:
last_search_df = last_search_df.join(dispHotelsDF.select("message_id","tuid","hotel_id","check_in_date", "check_out_date",'score_1'),
                                    ["message_id","tuid","hotel_id","check_in_date", "check_out_date"])

In [43]:
last_search_df.count()

1898470

In [44]:
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'

datestamp = datetime.now().strftime('%m-%d-%Y')
last_search_df.repartition(1).write.format('com.databricks.spark.csv').mode('overwrite')\
.save(dir+datestamp+'_last_search_df.csv',header = 'true')

## Read last search df(booked hotels)

In [10]:
file_loc = "s3://ege-ds-workshops-corp/yixli/data_understanding/09-15-2020_last_search_df.csv"
print("Collecting search the data...")
last_search_df1 = sqlContext.read.format('csv').\
            options(header='True', inferSchema='true', delimiter=',').\
           load(file_loc)

# load_date between '20191001' and '20200101'
print(last_search_df1.count())

Collecting search the data...
1898481

In [11]:
last_search_df1.count()

1898481

In [12]:
last_search_df1.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: timestamp (nullable = true)
 |-- check_out_date: timestamp (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- r

In [13]:
last_search_df1.show(50)

+--------------------+--------+--------+-------------------+-------------------+------------------+--------------------+-----------+----------+------------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+---------+--------------------+--------------+---------+------------+----------+-------------------+-----------------------+----------------------+------------------+
|          message_id|    tuid|hotel_id|      check_in_date|     check_out_date|hotel_result_index|        message_date|hotel_index|rate_index|              city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|rate_type|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|src_rate_amount_usd|src_commission_base_usd|src_supply_revenue_usd|           score_1|
+--------------------+--------+--------+-------------------+------------------

## Get labels

In [14]:
bookings_df.columns

['hotelfactid', 'hotel_id', 'issue_date', 'check_in_date', 'check_out_date', 'quarter', 'tuid', 'travel_product_id', 'group_account_id', 'hotel_rate_type', 'bk_rate_type', 'booking_type', 'gross_booking_value_usd', 'total_commission', 'total_markup', 'total_gds', 'total_supply_revenue', 'rate_type', 'duration', 'avg_book_rate_amount_usd', 'bk_avg_commission_usd', 'bk_avg_markup_usd', 'bk_avg_revenue_usd']

In [15]:
bookings_df = bookings_df.withColumn('rate_type',F.when(F.col('bk_rate_type')=='HOTMIP','ESRM').otherwise(F.col('bk_rate_type')))

In [16]:
book_rate_df =last_search_df1.join(bookings_df,["hotel_id","check_in_date", "check_out_date","tuid",'rate_type']).\
select("message_id","hotel_id","check_in_date", "check_out_date","tuid",'rate_type','rate_index','avg_book_rate_amount_usd','src_rate_amount_usd')
book_rate_df.count()

930339

In [17]:
book_rate_df = book_rate_df.\
                         withColumn('rate_diff',F.abs(F.col('avg_book_rate_amount_usd')-F.col('src_rate_amount_usd')))

book_rate_df = book_rate_df.filter((F.col('rate_diff')<1.0))

In [18]:
book_rate_df = book_rate_df.withColumn('label',F.lit(1))

## Match rates: is booked/not booked

In [19]:
booking_search_df = last_search_df1.join(book_rate_df.select("message_id","hotel_id","check_in_date", "check_out_date","tuid",'rate_type','rate_index','label'),
                                        ["message_id","hotel_id","check_in_date", "check_out_date","tuid",'rate_type','rate_index'],how='left')

In [20]:
booking_search_df = booking_search_df.fillna({'label':0})

In [21]:
booking_search_df.filter(F.col('label')==1).count()

208654

In [22]:
booking_search_df.filter(F.col('label')==0).count()

1696384

In [23]:
booking_search_df.printSchema()

root
 |-- message_id: string (nullable = true)
 |-- hotel_id: integer (nullable = true)
 |-- check_in_date: timestamp (nullable = true)
 |-- check_out_date: timestamp (nullable = true)
 |-- tuid: integer (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- rate_index: integer (nullable = true)
 |-- hotel_result_index: integer (nullable = true)
 |-- message_date: timestamp (nullable = true)
 |-- hotel_index: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- filter_want_in_policy_rates_only: boolean (nullable = true)
 |-- filter_eligible_for_loyalty: boolean (nullable = true)
 |-- filter_free_breakfast: boolean (nullable = true)
 |-- filter_free_wifi: boolean (nullable = true)
 |-- filter_free_parking: boolean (nullable = true)
 |-- eligible_for_loyalty: boolean (nullable = true)
 |-- free_breakfast: boolean (nullable = true)
 |-- free_wifi: boolean (nullable = true)
 |-- free_parking: boolean (nullable = true)
 |-- r

In [36]:
booking_search_df.show(50)

+--------+-------------+--------------+--------+---------+--------------------+------------------+----------+------------+-----------+--------------------+-----------+--------------------------------+---------------------------+---------------------+----------------+-------------------+--------------------+--------------+---------+------------+----------+--------------+-------------------+-----------------------+----------------------+------------+-----------------+----------------+---------------------+-----------------------+------------------------+------------------+------------------+--------------------+---------+
|hotel_id|check_in_date|check_out_date|    tuid|rate_type|          message_id|hotel_result_index|rate_index|message_date|      score|                city|star_rating|filter_want_in_policy_rates_only|filter_eligible_for_loyalty|filter_free_breakfast|filter_free_wifi|filter_free_parking|eligible_for_loyalty|free_breakfast|free_wifi|free_parking|refundable|bk_hotel_index|

In [None]:
dir = 's3://ege-ds-workshops-corp/yixli/data_understanding/'
datestamp = datetime.datetime.now().strftime('%m-%d-%Y')
booking_search_df.repartition(1).write.format('com.databricks.spark.csv').mode('overwrite')\
.save(dir+datestamp+'_booking_search_df_label.csv',header = 'true')