In [1]:
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from IPython.display import display, Markdown
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType
from pyspark.sql.functions import when, count, col, countDistinct, \
                                    desc, asc, round, date_format, \
                                    concat_ws, expr, month, \
                                    first, lit, max, min, stddev, avg

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


 # Import Data

In [2]:
# import main dataset from csv with option inferSchema = true
data_df = \
    spark.read.option("inferSchema", "true")\
        .option("header", "true")\
        .csv('Data/Spark Lab/Individual Assignment/hotel_bookings.csv')
data_df.cache()

# manually create schema for helper table
dow_schema = StructType(\
    [StructField("date",DateType(),True),\
     StructField("day_of_week",StringType(),True)])

# import helper table with manually defined schema
days_of_week = \
    spark.read.schema(dow_schema)\
        .option("header", "true")\
        .option("sep", ";")\
        .csv('Data/Spark Lab/Individual Assignment/day_of_week3.csv')


Row(date=datetime.date(2013, 6, 1), day_of_week='Saturday')

In [3]:
# assigning the schema and column names to variables
columns = data_df.schema.names
total_bookings = data_df.count()

display(Markdown('printing the schema of the dataset'))
data_df.printSchema()

display(Markdown(f'the dataset consists of **{total_bookings}** rows'))


printing the schema of the dataset

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullab

the dataset consists of **119390** rows

In [4]:
# perform typecasts where needed
# change column names
# drop columns that are not needed

data_df = \
    data_df.withColumn("is_canceled",col("is_canceled").cast("boolean"))\
        .withColumn("is_repeated_guest",col("is_repeated_guest").cast("boolean"))\
        .withColumn("adr",col("adr").cast("double"))\
        .withColumnRenamed("adr", "average_daily_rate")\
        .drop('required_car_parking_spaces')\
        .drop('previous_cancellations')\
        .drop('previous_bookings_not_canceled')\
        .drop('assigned_room_type')\
        .drop('booking_changes')\
        .drop('deposit_type')\
        .drop('company')\
        .drop('reservation_status_date')\


# update columns
columns = data_df.schema.names

# TODO lit()



get a random sample from the dataset with spark

[Row(hotel='Resort Hotel', is_canceled=False, lead_time=68, arrival_date_year=2015, arrival_date_month='July', arrival_date_week_number=27, arrival_date_day_of_month=1, stays_in_weekend_nights=0, stays_in_week_nights=4, adults=2, children='0', babies=0, meal='BB', country='IRL', market_segment='Online TA', distribution_channel='TA/TO', is_repeated_guest=False, reserved_room_type='D', agent='240', days_in_waiting_list=0, customer_type='Transient', average_daily_rate=97.0, total_of_special_requests=3, reservation_status='Check-Out'), Row(hotel='Resort Hotel', is_canceled=False, lead_time=90, arrival_date_year=2015, arrival_date_month='July', arrival_date_week_number=27, arrival_date_day_of_month=2, stays_in_weekend_nights=2, stays_in_week_nights=5, adults=2, children='0', babies=0, meal='HB', country='GBR', market_segment='Offline TA/TO', distribution_channel='TA/TO', is_repeated_guest=False, reserved_room_type='A', agent='243', days_in_waiting_list=0, customer_type='Contract', average_d

get a random sample from the dataset with pandas

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,market_segment,distribution_channel,is_repeated_guest,reserved_room_type,agent,days_in_waiting_list,customer_type,average_daily_rate,total_of_special_requests,reservation_status
78593,City Hotel,False,5,2016,July,31,25,1,0,1,...,Corporate,Corporate,True,A,,0,Transient,65.0,1,Check-Out
37556,Resort Hotel,False,217,2017,June,24,15,2,5,2,...,Offline TA/TO,TA/TO,False,D,243.0,0,Contract,96.4,4,Check-Out


In [6]:
display(Markdown('printing null values per column'))
# thank you Raúl for this line of code!
# it took a while to understand it, it is genius!
data_df.select([count(when(col(c).isNull(), c)).alias(c) for c in columns[:10]]).show()             
data_df.select([count(when(col(c).isNull(), c)).alias(c) for c in columns[10:19]]).show()
data_df.select([count(when(col(c).isNull(), c)).alias(c) for c in columns[19:]]).show()
# printing this in three rows, because otherwise the format would break

display(Markdown('get a random sample from the dataset with spark'))
print(data_df.sample(False, 0.1).take(2))

display(Markdown('get a random sample from the dataset with pandas'))
pandas_sample_df = data_df.toPandas()
pandas_sample_df.sample(n=2)


printing null values per column

+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+
|hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|
+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+
|    0|          0|        0|                0|                 0|                       0|                        0|                      0|                   0|     0|
+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+

+--------+------+----+-------+--------------+--------------------+-----------------+------------------+-----+
|children|babies|meal|country|market_se

 In this data set, consisting of 119.390 rows we can see a mix of integer-, double- and string-type data. The initial check for NULL values (with .isNull()) suggested that there are no missing values. However, running the code again with == 'NULL' we can see that the columns "company", "agent", and "country" have some missing values.

In [7]:
display(Markdown('printing null values per column again, this time with col(c) == "NULL"'))
data_df.select([count(when(col(c).isNull() | (col(c) == "NULL"), c)).alias(c) for c in columns[:10]]).show()             
data_df.select([count(when(col(c).isNull() | (col(c) == "NULL"), c)).alias(c) for c in columns[10:19]]).show()
data_df.select([count(when(col(c).isNull() | (col(c) == "NULL"), c)).alias(c) for c in columns[19:]]).show()


printing null values per column

+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+
|hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|
+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+
|    0|          0|        0|                0|                 0|                       0|                        0|                      0|                   0|     0|
+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+

+--------+------+----+-------+--------------+--------------------+-----------------+------------------+-----+
|children|babies|meal|country|market_se

 Next, lets try to define some groups of columns. For this reason, we will first check [the datasource](https://www.kaggle.com/jessemostipak/hotel-booking-demand) and get ourselves familiar with the contents of each column.
 With this information at hand, we can further categorize the data, helping us to better understand what the data can tell us.

 # Entries, Metrics & Dimensions

In [None]:
display(Markdown('get a random sample from the dataset with spark'))
print(data_df.sample(False, 0.1).take(2))

display(Markdown('get a random sample from the dataset with pandas'))
pandas_sample_df = data_df.toPandas()
pandas_sample_df.sample(n=2)

In [8]:
# define group variable "booking"
booking = ['hotel', 'is_canceled', 'market_segment', 'agent', 
            'days_in_waiting_list', 'reservation_status', 
             'distribution_channel', 'average_daily_rate']

# define group variable "time"
time = ['lead_time', 'arrival_date_year', 'arrival_date_month', 
        'arrival_date_week_number', 'arrival_date_day_of_month', 
        'stays_in_weekend_nights', 'stays_in_week_nights']

# define group variable "guests"
guest = ['adults', 'children', 'babies', 'country', 'is_repeated_guest',  
        'meal', 'reserved_room_type'] 


 With these groups, we can now create some basic insights about the data.
 We will query for dinsinct values, counts and summary statistics of numerical and categorical columns.

In [9]:
# Since we are going to use different metrics depending on the datatype, functions to get these appropriate columnnames can be handy.
# The basic idea here was to spend more time with one column-group, think about how to modularize functions so they can be used in multiple ways, to create a recipe that can be used regardless of the data at hand. This recipe can then be applied to the other column-groups.

def get_categoricals(data_df):
    """This function takes as input a spark dataframe and returns a list of its StringType columnames"""
    categoricals = [column.name for column in data_df.schema.fields if isinstance(column.dataType, StringType)]
    return categoricals


def get_numericals(data_df):
    """This function takes as input a spark dataframe and returns a list of its IntegerType and DoubleType columnames"""
    numericals =  [column.name for column in data_df.schema.fields if isinstance(column.dataType, (IntegerType, DoubleType))]
    return numericals

def get_min_max(data_df):
    "This function takes as input a spark dataframe-column and prints its highest and lowest value as markdown table"
    for category in get_categoricals(data_df):
        first = data_df.groupBy(category).count().sort(desc('count')).first()
        last = data_df.groupBy(category).count().sort(asc('count')).first()
        display(Markdown("""
| %s | %s |
|----|----|
| %s | %s |
""" % (f"least_{category}", f"most_{category}",
    "%s (%d occurrences)" % (first[category], first["count"]), 
    "%s (%d occurrences)" % (last[category], last["count"]))))


 # Basic profiling of booking-related data


In [10]:
display(Markdown('\n print the most occuring entries of the whole booking group in descending order'))
data_df.groupBy(booking).count().sort(desc('count')).show(10)

display(Markdown('\n print the highest and lowest counts of categorical columns belonging to the booking-related group'))
get_min_max(data_df[booking])

display(Markdown('\n show number of unique categorical-values per column'))
data_df.select([countDistinct(c).alias(c) for c in get_categoricals(data_df[booking])]).show()

display(Markdown('\n print summary statistics of booking-numricals'))
data_df.select(get_numericals(data_df[booking])).summary().show()



 print the most occuring entries of the whole booking group in descending order

+------------+-----------+--------------+-----+--------------------+------------------+--------------------+------------------+-----+
|       hotel|is_canceled|market_segment|agent|days_in_waiting_list|reservation_status|distribution_channel|average_daily_rate|count|
+------------+-----------+--------------+-----+--------------------+------------------+--------------------+------------------+-----+
|  City Hotel|       true|        Groups|    1|                   0|          Canceled|               TA/TO|              62.0| 2984|
|  City Hotel|      false|     Corporate| NULL|                   0|         Check-Out|           Corporate|              65.0|  574|
|  City Hotel|      false|        Groups|    1|                   0|         Check-Out|               TA/TO|              62.0|  563|
|  City Hotel|       true|        Groups|    1|                   0|          Canceled|               TA/TO|              62.8|  495|
|  City Hotel|      false| Offline TA/TO|    6|               


 print the highest and lowest counts of categorical columns belonging to the booking-related group


| least_hotel | most_hotel |
|----|----|
| City Hotel (79330 occurrences) | Resort Hotel (40060 occurrences) |



| least_market_segment | most_market_segment |
|----|----|
| Online TA (56477 occurrences) | Undefined (2 occurrences) |



| least_agent | most_agent |
|----|----|
| 9 (31961 occurrences) | 451 (1 occurrences) |



| least_reservation_status | most_reservation_status |
|----|----|
| Check-Out (75166 occurrences) | No-Show (1207 occurrences) |



| least_distribution_channel | most_distribution_channel |
|----|----|
| TA/TO (97870 occurrences) | Undefined (5 occurrences) |



 show number of unique categorical-values per column

+-----+--------------+-----+------------------+--------------------+
|hotel|market_segment|agent|reservation_status|distribution_channel|
+-----+--------------+-----+------------------+--------------------+
|    2|             8|  334|                 3|                   5|
+-----+--------------+-----+------------------+--------------------+




 print summary statistics of booking-numricals

+-------+--------------------+------------------+
|summary|days_in_waiting_list|average_daily_rate|
+-------+--------------------+------------------+
|  count|              119390|            119390|
|   mean|   2.321149174972778|101.83112153446218|
| stddev|  17.594720878776243| 50.53579028554872|
|    min|                   0|             -6.38|
|    25%|                   0|             69.29|
|    50%|                   0|              94.5|
|    75%|                   0|             126.0|
|    max|                 391|            5400.0|
+-------+--------------------+------------------+



 # Basic profiling of timing-related data

In [11]:
display(Markdown('\n print the most occuring entries of the whole timing group in descending order'))
data_df.groupBy(time).count().sort(desc('count')).show(10)

display(Markdown('\n print the highest and lowest counts of categorical columns belonging to the timing-related group'))
get_min_max(data_df[time])

display(Markdown('\n show number of unique categorical-values per column'))
data_df.select([countDistinct(c).alias(c) for c in get_categoricals(data_df[time])]).show()

display(Markdown('\n print summary statistics of time-numricals'))
data_df.select(get_numericals(data_df[time])).summary().show()



 print the most occuring entries of the whole timing group in descending order

+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+-----+
|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|count|
+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+-----+
|      277|             2016|          November|                      46|                        7|                      1|                   2|  180|
|       68|             2016|          February|                       8|                       17|                      0|                   2|  150|
|      102|             2015|           October|                      42|                       16|                      0|                   2|  146|
|       74|             2015|         September|                      38|                     


 print the highest and lowest counts of categorical columns belonging to the timing-related group


| least_arrival_date_month | most_arrival_date_month |
|----|----|
| August (13877 occurrences) | January (5929 occurrences) |



 show number of unique categorical-values per column

+------------------+
|arrival_date_month|
+------------------+
|                12|
+------------------+




 print summary statistics of time-numricals

+-------+------------------+------------------+------------------------+-------------------------+-----------------------+--------------------+
|summary|         lead_time| arrival_date_year|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|
+-------+------------------+------------------+------------------------+-------------------------+-----------------------+--------------------+
|  count|            119390|            119390|                  119390|                   119390|                 119390|              119390|
|   mean|104.01141636652986| 2016.156554150264|       27.16517296255968|       15.798241058715135|     0.9275986263506156|   2.500301532791691|
| stddev| 106.8630970479881|0.7074759445220408|      13.605138355497665|        8.780829470578343|     0.9986134945978791|  1.9082856150479042|
|    min|                 0|              2015|                       1|                        1|                      0|              

 # Basic profiling of guest-related data

In [12]:
display(Markdown('\n print the most occuring entries of the whole guest group in descending order'))
data_df.groupBy(guest).count().sort(desc('count')).limit(10).show()

display(Markdown('\n print the highest and lowest counts of categorical columns belonging to the guest-related group'))
get_min_max(data_df[guest])

display(Markdown('\n show number of unique categorical-values per column'))
data_df.select([countDistinct(c).alias(c) for c in get_categoricals(data_df[guest])]).show()

display(Markdown('\n print summary statistics of guest-numricals'))
data_df.select(get_numericals(data_df[guest])).summary().show()



 print the most occuring entries of the whole guest group in descending order

+------+--------+------+-------+-----------------+----+------------------+-----+
|adults|children|babies|country|is_repeated_guest|meal|reserved_room_type|count|
+------+--------+------+-------+-----------------+----+------------------+-----+
|     2|       0|     0|    PRT|            false|  BB|                 A|20529|
|     1|       0|     0|    PRT|            false|  BB|                 A| 8279|
|     2|       0|     0|    FRA|            false|  BB|                 A| 4003|
|     2|       0|     0|    GBR|            false|  BB|                 A| 3698|
|     2|       0|     0|    PRT|            false|  HB|                 A| 3555|
|     2|       0|     0|    ESP|            false|  BB|                 A| 2618|
|     2|       0|     0|    PRT|            false|  BB|                 D| 2340|
|     2|       0|     0|    DEU|            false|  BB|                 A| 2278|
|     2|       0|     0|    GBR|            false|  BB|                 D| 1849|
|     1|       0|     0|    


 print the highest and lowest counts of categorical columns belonging to the guest-related group


| least_children | most_children |
|----|----|
| 0 (110796 occurrences) | 10 (1 occurrences) |



| least_country | most_country |
|----|----|
| PRT (48590 occurrences) | BHS (1 occurrences) |



| least_meal | most_meal |
|----|----|
| BB (92310 occurrences) | FB (798 occurrences) |



| least_reserved_room_type | most_reserved_room_type |
|----|----|
| A (85994 occurrences) | L (6 occurrences) |



 show number of unique categorical-values per column

+--------+-------+----+------------------+
|children|country|meal|reserved_room_type|
+--------+-------+----+------------------+
|       6|    178|   5|                10|
+--------+-------+----+------------------+




 print summary statistics of guest-numricals

+-------+------------------+--------------------+
|summary|            adults|              babies|
+-------+------------------+--------------------+
|  count|            119390|              119390|
|   mean|1.8564033838679956|0.007948739425412514|
| stddev|0.5792609988327531|  0.0974361913012642|
|    min|                 0|                   0|
|    25%|                 2|                   0|
|    50%|                 2|                   0|
|    75%|                 2|                   0|
|    max|                55|                  10|
+-------+------------------+--------------------+



 # Business Question 1: what does the customer-spending mix look like?

 customer_spending is going to be categorized by the colum "average-daily-rate" as follows:

 - "6 something went wrong"               -> adr_pp = (-infinity,0)
 - "5 very low"            -> adr_pp = (0, 28)
 - "4 low"                -> adr_pp = (28,45)
 - "3 average"                -> adr_pp = (45,62)
 - "2 high"                   -> adr_pp = (62,85)
 - "1 very high"                       -> adr_pp = (85,+infinity)



In [13]:
# first lets calculate the avere-daily-rate per guest, (currently per booking).
# for this we need to create a total guests column, that takes into consideration, that children do not count as a "full guest" (also called PAX)
# then we devide the average_daily_rate per PAX and categorize accordingly
# guest = guest + ["customer_spending"]

bq1_df = \
    data_df.withColumn("PAX", (col("adults") + 0.5 * col("children") + 0.2 * col("babies")))\
        .withColumn("adr_pp", (col("average_daily_rate") / col("PAX")))\
        .withColumn("customer_spending", 
            when(col("adr_pp")<=0,
                "6 something went wrong")
            .when((col("adr_pp")>0) & (col("adr_pp")<=28),
                "5 very low")
            .when((col("adr_pp")>28) & (col("adr_pp")<=45),
                "4 low")
            .when((col("adr_pp")>45) & (col("adr_pp")<=62),
                "3 average")
            .when((col("adr_pp")>62) & (col("adr_pp")<=85),
                "2 high")
            .otherwise(
                "1 very high"))

display(Markdown('Print the customer mix according to the new categorization'))
bq1_df.groupBy("customer_spending").count().sort(desc('count')).show(10)


Print the customer mix according to the new categorization

+--------------------+-----+
|   customer_spending|count|
+--------------------+-----+
|               4 low|37655|
|           3 average|32325|
|              2 high|23224|
|         1 very high|15347|
|          5 very low| 9028|
|6 something went ...| 1811|
+--------------------+-----+



 # Business Question 2: during which month do we get the highest ratio of 3 average bookings

In [14]:
# we can reuse the df from business question 1 - however there are a few things to do:
# 1 cast dateTypes on the according columns, to create a date column
# 2 Calculate the booking date by subtracting the lead time from the 
# Group the data so it answers the Business Question

# first create a proper month column
bq2_df = \
    bq1_df.\
        withColumn("month",
            when(col("arrival_date_month") == "January", 1)
            .when(col("arrival_date_month") == "February", 2)
            .when(col("arrival_date_month") == "March", 3)
            .when(col("arrival_date_month") == "April", 4)
            .when(col("arrival_date_month") == "May", 5)
            .when(col("arrival_date_month") == "June", 6)
            .when(col("arrival_date_month") == "July", 7)
            .when(col("arrival_date_month") == "August", 8)
            .when(col("arrival_date_month") == "September", 9)
            .when(col("arrival_date_month") == "October", 10)
            .when(col("arrival_date_month") == "November", 11)
            .when(col("arrival_date_month") == "December", 12))

# next we can create a Date-colum "booking_date".
# finally, we can calculate the booking date, by subtracting the lead_time from the booking_date 
    # expr() is needed in order to pass the column to date_add    

In [15]:
bq2_df = \
    bq2_df.withColumn("arrival_date", 
            date_format(concat_ws('-', bq2_df.arrival_date_year, bq2_df.month , bq2_df.arrival_date_day_of_month), 'yyyy-MM-dd'))\


In [16]:
bq2_df = \
    bq2_df.withColumn("booking_date", 
        expr("date_add(to_date(arrival_date,'yyyy-MM-dd'),-cast(lead_time as int))"))                


In [17]:
# join dataframe with day_of_week dataframe
bq2_df = \
    bq2_df.join(days_of_week, 
        bq2_df["arrival_date"] == days_of_week["date"], 
        how = 'left')


In [18]:
# add dummy variables for customer_spending
bq2_df = \
    bq2_df.withColumn("very_high", 
            when(col("customer_spending") == "1 very high", 1).otherwise(0))\
        .withColumn("high", 
            when(col("customer_spending") == "2 high", 1).otherwise(0))\
        .withColumn("average", 
            when(col("customer_spending") == "3 average", 1).otherwise(0))\
        .withColumn("low", 
            when(col("customer_spending") == "4 low", 1).otherwise(0))\
        .withColumn("very_low", 
            when(col("customer_spending") == "5 very low", 1).otherwise(0))\
        .withColumn("something_went_wrong", 
            when(col("customer_spending") == "6 something went wrong", 1).otherwise(0))\



In [19]:
# with the dataframe created above, we can finally go about answering the business question
# calculate ratio of "1 very high" per month
ba2_df = \
    bq2_df.groupBy(month("booking_date"))\
    .sum("very_high", "high", "average", "low", "very_low", "something_went_wrong")

display(Markdown("printing the ratio of customer-spending 'very high'"))
ba2_df = \
    ba2_df.withColumn(
            "ratio", round((
                col("sum(very_high)") / 
                (col("sum(very_high)") + col("sum(high)") + col("sum(average)") + col("sum(low)") + col("sum(very_low)") + col("sum(something_went_wrong)")))
                ,2))\
        .sort(desc("ratio")).show()


printing the ratio of customer-spending 'very high'

+-------------------+--------------+---------+------------+--------+-------------+-------------------------+-----+
|month(booking_date)|sum(very_high)|sum(high)|sum(average)|sum(low)|sum(very_low)|sum(something_went_wrong)|ratio|
+-------------------+--------------+---------+------------+--------+-------------+-------------------------+-----+
|                  6|          1562|     2039|        1327|     849|          182|                      104| 0.26|
|                  5|          1641|     2147|        2318|    1318|          262|                      167| 0.21|
|                  4|          1775|     2107|        2565|    1725|          347|                      103| 0.21|
|                  8|          1243|     1893|        2087|    2142|          476|                      129| 0.16|
|                  7|          1590|     1968|        2095|    3640|          360|                      165| 0.16|
|                  9|          1181|     1523|        1823|    2492|         101

 # Business Question 3: what are the customer_spending ratios per weekday, for each of the two hotels

In [20]:
display(Markdown("printing the customer-spending mix per country"))
bq3_df = \
    bq2_df.groupBy("hotel", "day_of_week")\
        .pivot("customer_spending")\
        .agg(count("customer_spending"))\
        .orderBy(
            col("1 very high").desc(),
            col("2 high").desc(), 
            col("3 average").desc(), 
            col("4 low").desc(), 
            col("5 very low").desc())\
        .show()



printing the customer-spending mix per country

+------------+-----------+-----------+------+---------+-----+----------+----------------------+
|       hotel|day_of_week|1 very high|2 high|3 average|4 low|5 very low|6 something went wrong|
+------------+-----------+-----------+------+---------+-----+----------+----------------------+
|  City Hotel|     Monday|       1926|  2646|     3664| 3305|       157|                   125|
|  City Hotel|    Tuesday|       1917|  2209|     2496| 2217|       109|                   179|
|  City Hotel|     Friday|       1608|  3103|     4858| 3990|       234|                   162|
|  City Hotel|  Wednesday|       1543|  2486|     3511| 3373|       103|                   213|
|  City Hotel|   Thursday|       1539|  2742|     3901| 4522|       138|                   167|
|  City Hotel|     Sunday|       1096|  2021|     3180| 2724|        95|                    78|
|  City Hotel|   Saturday|        960|  2447|     3641| 3642|       156|                   147|
|Resort Hotel|     Monday|        821|  

In [21]:

display(Markdown("**customer-spending mix per country"))
bq2_df.groupBy("country")\
    .agg(round(avg("very_high"),2).alias("average_high"),
        round(min("adr_pp"),2).alias("adr_pp_min"),
        round(max("adr_pp"),2).alias("adr_pp_max"),
        round(stddev("very_high"),2).alias("stddev_high"))\
    .orderBy(
        col("average_high").desc())\
    .where((col("stddev_high") != 0) & (col("stddev_high").isNotNull()) & (col("stddev_high") != "NaN"))\
    .show()

**customer-spending mix per country

+-------+------------+----------+----------+-----------+
|country|average_high|adr_pp_min|adr_pp_max|stddev_high|
+-------+------------+----------+----------+-----------+
|    SRB|        0.64|       0.0|     115.0|       0.48|
|    LKA|        0.57|     16.65|     119.0|       0.53|
|    GAB|         0.5|      44.8|     172.0|       0.58|
|    GEO|         0.5|     42.75|     129.5|       0.51|
|    UZB|         0.5|      30.0|     138.5|       0.58|
|    CIV|         0.5|      22.5|     144.5|       0.55|
|    MYT|         0.5|      49.5|      85.5|       0.71|
|    COM|         0.5|     28.64|     86.33|       0.71|
|    KAZ|        0.47|     46.65|     173.0|       0.51|
|    BIH|        0.46|      25.0|     130.0|       0.52|
|    AND|        0.43|      37.5|     141.6|       0.53|
|    IRQ|        0.43|     43.85|     139.0|       0.51|
|    BHR|         0.4|     40.38|     140.0|       0.55|
|    TZA|         0.4|       0.0|    124.25|       0.55|
|    GIB|        0.39|      28.

In [22]:
# TODO persist this dataframe in cache for SQL Query
"""select *, count()
from table
groupby day_of_week"""

'select *, count()\nfrom table\ngroupby day_of_week'

In [23]:
bq2_df.groupBy("day_of_week").count().show()



+-----------+-----+
|day_of_week|count|
+-----------+-----+
|  Wednesday|16139|
|    Tuesday|13999|
|     Friday|19631|
|   Thursday|19254|
|   Saturday|18055|
|     Monday|18171|
|     Sunday|14141|
+-----------+-----+

