In [5]:
import pandas as pd
import numpy as np
import os

# Load the training and test datasets.
# The assignment directory is named once so it only has to be changed in one place.
DATA_DIR = 'dmt-2025-2nd-assignment'
train_data = pd.read_csv(os.path.join(DATA_DIR, 'training_set_VU_DM.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_set_VU_DM.csv'))

In [6]:
# The assignment mentions nearly 5 million observations; confirm the frame sizes.
for frame in (train_data, test_data):
    print(frame.shape)  # (4958347, 54) for training, then (4959183, 50) for test

# The training data has four columns that the test data lacks.
train_only_cols = ['position', 'click_bool', 'booking_bool', 'gross_bookings_usd']
has_gross_value = train_data['gross_bookings_usd'].notna()

# First ten rows that carry a gross booking value ...
print(train_data.loc[has_gross_value, train_only_cols].head(10))

# ... and the first ten rows where that value is missing.
print(train_data.loc[~has_gross_value, train_only_cols].head(10))

(4958347, 54)
(4959183, 50)
     position  click_bool  booking_bool  gross_bookings_usd
12         13           1             1              114.29
63          1           1             1              162.38
68         16           1             1               96.41
194         4           1             1              222.58
211        22           1             1               47.10
243        26           1             1              286.32
274         1           1             1             1927.64
312        13           1             1             1831.30
346         4           1             1              226.76
387         1           1             1               60.77
   position  click_bool  booking_bool  gross_bookings_usd
0        27           0             0                 NaN
1        26           0             0                 NaN
2        21           0             0                 NaN
3        34           0             0                 NaN
4         4           

In [8]:
# Use the gross booking value in the training data to work out, per country, whether
# price_usd is displayed per night or for the whole stay.
# .copy() makes `test` an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning (previously `test` was a slice of a slice of train_data).
test = train_data.loc[
    train_data['gross_bookings_usd'].notna(),  # only keep non-NaN observations
    ['prop_country_id', 'prop_log_historical_price', 'price_usd', 'srch_length_of_stay', 'gross_bookings_usd']
].copy()

# columns shown whenever a sample of rows is inspected below
inspect_cols = ['prop_country_id', 'prop_historical_price', 'price_usd', 'srch_length_of_stay', 'gross_bookings_usd']

# adjusting the historical price
test['prop_historical_price'] = test['prop_log_historical_price'].replace(0, np.nan) # no meaningful comparison can be done
test['prop_historical_price'] = np.exp(test['prop_historical_price']).round(2) # converting to USD

# checking out some observations
print(test.head())

# The gross_bookings_usd tends to be higher than the price_usd, due to taxes and other fees. For gross_bookings_usd observations that are significantly higher, that has to mean the price_usd is per night

# any cases where gross booking price is lower than price_usd?
print(test[test['gross_bookings_usd'] < test['price_usd']][inspect_cols].head(50))

# any NaN values for either price_usd or gross_bookings_usd?
print(test[test['gross_bookings_usd'].isna() | test['price_usd'].isna()][inspect_cols].head(50)) # empty dataframe

# any cases where gross booking price or the USD price is 0?
print(test[test['gross_bookings_usd'] == 0][inspect_cols].head(50)) # not empty

# They are all cases where the price_usd is listed but the gross_bookings_usd is 0. For these I will assign them their price_usd value
# (the right-hand Series is aligned on the index, so each row receives its own price_usd)
test.loc[test['gross_bookings_usd'] == 0, 'gross_bookings_usd'] = test['price_usd']

# for cases where price_usd is equal to 0 (there are only 2), I assign the gross_bookings_usd price
test.loc[test['price_usd'] == 0, 'price_usd'] = test['gross_bookings_usd']

# In general the observations are slightly lower, I wonder if this is due to discounts or currency conversions.
# Ratio of the gross booking value per night to the displayed price
test['gross_bookings_usd_ratio'] = (test['gross_bookings_usd'] / test['srch_length_of_stay']) / test['price_usd']

# ratio summary statistics
print(test['gross_bookings_usd_ratio'].describe())

# all observations with only one night are assigned a class value of 'one_night'
test['class'] = pd.Series(np.nan, index=test.index, dtype='object')
test.loc[test['srch_length_of_stay'] == 1, 'class'] = 'one_night'

# For cases where the USD price is higher than the gross price, and the number of nights specified is larger than 1, we assume price_usd is for the whole stay
test.loc[
    (test['price_usd'] > test['gross_bookings_usd']) &
    (test['srch_length_of_stay'] > 1),
    'class'
] = 'per_stay'

# Looking at cases with low ratios
print(test[test['gross_bookings_usd_ratio'] < 0.35][inspect_cols + ['class']].head(50))

# cases with high ratios
print(test[test['gross_bookings_usd_ratio'] > 1.5][inspect_cols + ['class']].head(50))

# cases with a ratio higher than 1.3 and where the number of nights specified is larger than 1 are assumed to have a class value of per_night
# (1.3 is the upper edge of the band handled by the next rule; a ratio of exactly 1.3 matches neither rule and falls through to the final per_stay rule)
test.loc[
    (test['gross_bookings_usd_ratio'] > 1.3) &
    (test['srch_length_of_stay'] > 1),
    'class'
] = 'per_night'

# looking at observations with a ratio between 0.7 and 1.3
print(test[(test['gross_bookings_usd_ratio'] > 0.7) & (test['gross_bookings_usd_ratio'] < 1.3)][inspect_cols + ['class']].head(50))

# these observations will be made per night (only rows that have no class yet)
test.loc[
    test['class'].isna() &
    (test['gross_bookings_usd_ratio'] > 0.7) &
    (test['gross_bookings_usd_ratio'] < 1.3),
    'class'
] = 'per_night'

# for observations where class is still NaN, srch_length_of_stay > 1, and the price is lower than (or equal to) the gross price, we assume the price is per stay
test.loc[
    test['class'].isna() &
    (test['srch_length_of_stay'] > 1) &
    (test['price_usd'] <= test['gross_bookings_usd']),
    'class'
] = 'per_stay'

# which observations are still NaN?
print(test[test['class'].isna()][['class', 'gross_bookings_usd_ratio', 'price_usd', 'srch_length_of_stay', 'gross_bookings_usd']].head(50)) # empty!

# Now I will aggregate the data by country ID and compute the percentages of class. First, all observations that are one_night are dropped.
filtered = test[test['class'] != 'one_night'].copy()
counts = filtered.groupby(['prop_country_id', 'class']).size().reset_index(name='count')
totals = counts.groupby('prop_country_id')['count'].transform('sum')
counts['percentage'] = counts['count'] / totals * 100
counts['percentage'] = counts['percentage'].round(2)
# within each country the most frequent class comes first
counts = counts.sort_values(by=['prop_country_id', 'percentage'], ascending=[True, False])
print(counts)

# are all the countries accounted for?
print(counts['prop_country_id'].nunique()) # 156
print(train_data['prop_country_id'].nunique()) # 172

# 16 countries are missing. They will be dealt with later, but for now I keep one row per country with its class value. Only the first class (the likeliest one, thanks to the sort above) will be used
top_class_per_country = counts.drop_duplicates(subset='prop_country_id', keep='first')

     prop_country_id  prop_log_historical_price  price_usd  srch_length_of_stay  gross_bookings_usd  prop_historical_price
12               219                       4.44     100.89                    1              114.29                  84.77
63               100                       0.00     145.00                    1              162.38                    NaN
68               219                       4.62      85.00                    1               96.41                 101.49
194              216                       4.54      47.94                    4              222.58                  93.69
211              219                       4.44      42.00                    1               47.10                  84.77
       prop_country_id  prop_historical_price  price_usd  srch_length_of_stay  gross_bookings_usd
1503               219                    NaN     180.00                    1              171.64
1753               219                 273.14     299.00          

In [12]:
# List each country's majority class, weakest majority first.
# The lowest share in the printed output is 75%, which is already high, so the class values look trustworthy.
# The price-ratio method against the historical price is still used afterwards as a cross-check.
print(top_class_per_country.sort_values(by='percentage'))

     prop_country_id      class  count  percentage
147              143  per_night      3       75.00
196              195  per_night      4       80.00
166              159  per_night     11       84.62
20                21  per_night      6       85.71
122              113  per_night     13       86.67
..               ...        ...    ...         ...
120              111  per_night      1      100.00
117              106  per_night     25      100.00
106               97  per_night     13      100.00
138              131  per_night      3      100.00
126              119  per_night     22      100.00

[156 rows x 4 columns]


In [31]:
# Number of countries whose most frequent class is something other than per_night.
is_not_per_night = top_class_per_country['class'] != 'per_night'
print(top_class_per_country.loc[is_not_per_night, 'prop_country_id'].nunique())  # 1 only

1


In [32]:
# prop_log_historical_price holds the logged historical price over the last trading period; a 0 means the
# property was not sold then. Those zeros become NaN (no meaningful comparison can be done) and the rest is
# converted back to USD.
log_historical = train_data['prop_log_historical_price'].replace(0, np.nan)
train_data['prop_historical_price'] = np.exp(log_historical).round(2)

# Displayed price divided by the number of nights searched for. If this is far below the historical price,
# the displayed price is per night; if it is about the same, the displayed price covers the whole stay.
train_data['price_usd_per_night_test'] = train_data['price_usd'].div(train_data['srch_length_of_stay'])

# display adjustments: show all columns, widen the output, and do not truncate long cell contents
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

# A displayed sales promotion lowers the current price but not the historical one. A 25% discount is assumed,
# and only rows with promotion_flag == 1 are scaled back up.
# NOTE(review): 1.33 approximates the exact inverse of a 25% cut, 1 / 0.75 (about 1.3333).
promo_markup = 1.33
train_data['price_usd_without_promo'] = (
    train_data['price_usd']
    .where(train_data['promotion_flag'] != 1, train_data['price_usd'] * promo_markup)
    .round(2)
)

# The historical price is taken to be per night already (its description does not say otherwise), so the
# ratio of the promo-adjusted price to the historical price shows how far the two are apart.
train_data['price_ratio'] = train_data['price_usd_without_promo'].div(train_data['prop_historical_price'])

ratio_view_cols = ['prop_historical_price', 'price_usd_without_promo', 'srch_length_of_stay', 'prop_country_id', 'price_ratio', 'promotion_flag']

# looking at the ratio
print(train_data[ratio_view_cols].head(50))

# displaying quantile statistics of the ratio
print(train_data['price_ratio'].describe())

# looking at the smallest values
print(train_data.loc[train_data['price_ratio'] < 0.35, ratio_view_cols].head(50))

    prop_historical_price  price_usd_without_promo  srch_length_of_stay  prop_country_id  price_ratio  promotion_flag
0                  141.17                   104.77                    1              219     0.742155               0
1                  152.93                   170.74                    1              219     1.116459               0
2                  137.00                   179.80                    1              219     1.312409               0
3                   80.64                   602.77                    1              219     7.474826               0
4                  138.38                   143.58                    1              219     1.037578               0
5                  181.27                   195.32                    1              219     1.077509               0
6                  122.73                   129.35                    1              219     1.053940               0
7                   62.80                    85.37      

In [33]:
# Where the promo-adjusted price is far below the historical price (ratio < 0.35), the displayed price is
# assumed to be per night. Check that assumption against the gross-booking country classifier.
low_ratio_country_ids = set(train_data.loc[train_data['price_ratio'] < 0.35, 'prop_country_id'])
comp = (
    pd.DataFrame(low_ratio_country_ids)
    .rename(columns={0: 'prop_country_id'})
    .merge(top_class_per_country[['prop_country_id', 'class']], on='prop_country_id', how='left')
)

# any cases where the class is not per_night?
print(comp[comp['class'] != 'per_night']) # 5

# The five countries printed have NaN for the class: they are among the 16 countries missing from the classifier.
# Every other low-ratio country is per_night, so the assumption holds and price_usd needs no change for them.

    prop_country_id class
11               19   NaN
25               38   NaN
52               79   NaN
57               89   NaN
69              108   NaN


In [35]:
# Checking out the other end of the spectrum: countries with at least one row whose ratio exceeds 2,
# joined with their class from the gross-booking classifier.
very_high_ratio_ids = set(train_data.loc[train_data['price_ratio'] > 2, 'prop_country_id'])
comp = (
    pd.DataFrame(very_high_ratio_ids)
    .rename(columns={0: 'prop_country_id'})
    .merge(top_class_per_country[['prop_country_id', 'class']], on='prop_country_id', how='left')
)
# NOTE(review): comp is built with a cut-off of 2 and is not displayed in this cell, while the rows
# printed below use 1.5 — confirm which threshold is intended.

high_ratio_view_cols = ['prop_historical_price', 'price_usd_without_promo', 'srch_length_of_stay', 'prop_country_id', 'price_ratio', 'promotion_flag']
print(train_data.loc[train_data['price_ratio'] > 1.5, high_ratio_view_cols].head(50))

# Where the price is significantly higher than the historical price, the price is assumed to cover the whole
# stay and will have to be adjusted to be per night. This is a mismatch with the results from earlier.


      prop_historical_price  price_usd_without_promo  srch_length_of_stay  prop_country_id  price_ratio  promotion_flag
3                     80.64                   602.77                    1              219     7.474826               0
9                    172.43                   280.69                    1              219     1.627849               0
23                    84.77                   128.06                    1              219     1.510676               0
67                   165.67                   259.00                    1              219     1.563349               0
132                   61.56                   195.51                    2              158     3.175926               1
149                  175.91                   305.82                    4               31     1.738503               0
152                  206.44                   327.58                    4               31     1.586805               0
156                  152.93             

In [37]:
# Verifying the high ratio countries: share of each country's rows whose price ratio exceeds 1.5.
# Countries without any such row have no entry in the numerator, hence the fill with 0.
rows_per_country = train_data.groupby('prop_country_id').size()
high_rows_per_country = train_data[train_data['price_ratio'] > 1.5].groupby('prop_country_id').size()
high_ratio = (high_rows_per_country / rows_per_country).fillna(0)

# only keeping countries that have a high proportion of high ratios
high_ratio = high_ratio[high_ratio > 0.4]
print(high_ratio) # 2, only these countries have high ratios

# Print observations for these 2 countries
in_high_ratio_country = train_data['prop_country_id'].isin(high_ratio.index)
print(train_data.loc[in_high_ratio_country, ['prop_historical_price', 'price_usd_without_promo', 'srch_length_of_stay', 'prop_country_id', 'price_ratio', 'promotion_flag']].head(50))

# adjusting the values for the countries with a high ratio: whole-stay price -> price per night
# NOTE(review): this overwrites price_usd_without_promo in place and price_ratio is not recomputed,
# so running this cell a second time divides the same rows by the number of nights again.
train_data.loc[in_high_ratio_country, 'price_usd_without_promo'] = (
    train_data.loc[in_high_ratio_country, 'price_usd_without_promo']
    / train_data.loc[in_high_ratio_country, 'srch_length_of_stay']
)


prop_country_id
126    0.428571
197    0.447368
dtype: float64
         prop_historical_price  price_usd_without_promo  srch_length_of_stay  prop_country_id  price_ratio  promotion_flag
73183                   497.70                   630.00                    4              197     1.265823               0
73184                   497.70                   656.00                    4              197     1.318063               0
73185                   497.70                  1078.00                    4              197     2.165963               0
73186                   497.70                   797.00                    4              197     1.601366               0
73187                      NaN                  2701.00                    4              197          NaN               0
73188                   497.70                   736.00                    4              197     1.478802               0
73189                   497.70                   979.00                    4

In [38]:
# Average price per night for each country, weighted by the number of nights:
#     sum(price_usd_without_promo * srch_length_of_stay) / sum(srch_length_of_stay)
# Only the three Series involved are used. The earlier train_data.assign(...) returned a new frame
# containing every column of the ~5M-row table just to add one temporary column.
stay_nights = train_data['srch_length_of_stay']
country_key = train_data['prop_country_id']
weighted_price_total = (train_data['price_usd_without_promo'] * stay_nights).groupby(country_key).sum()
stay_nights_total = stay_nights.groupby(country_key).sum()

weighted_avg = (
    (weighted_price_total / stay_nights_total)
    .round(2)
    .rename('weighted_avg_price_usd_without_promo')
    .reset_index()  # columns: prop_country_id, weighted_avg_price_usd_without_promo
)

print(weighted_avg)

# quantile statistics
print(weighted_avg['weighted_avg_price_usd_without_promo'].describe())

# looking at the most expensive countries
print(weighted_avg[weighted_avg['weighted_avg_price_usd_without_promo'] > 1000][['prop_country_id', 'weighted_avg_price_usd_without_promo']])

     prop_country_id  weighted_avg_price_usd_without_promo
0                  1                                223.43
1                  2                                137.58
2                  4                                149.63
3                  7                                132.99
4                  9                                131.30
..               ...                                   ...
167              224                                 87.92
168              225                                387.06
169              226                                166.79
170              229                                101.32
171              230                                246.43

[172 rows x 2 columns]
count     172.000000
mean      309.350000
std       610.837121
min        32.740000
25%       137.302500
50%       183.685000
75%       260.750000
max      5426.590000
Name: weighted_avg_price_usd_without_promo, dtype: float64
     prop_country_id  weighted_avg_price_

In [39]:
# All rows for country ids 53, 39 and 34, highest promo-adjusted price first
# (ties broken by country id, also descending).
train_data.loc[
    train_data['prop_country_id'].isin([53, 39, 34]),
    ['prop_historical_price', 'price_usd_without_promo', 'srch_length_of_stay', 'prop_country_id', 'price_ratio', 'promotion_flag']
].sort_values(by=['price_usd_without_promo', 'prop_country_id'], ascending=False)

Unnamed: 0,prop_historical_price,price_usd_without_promo,srch_length_of_stay,prop_country_id,price_ratio,promotion_flag
1168566,,19726328.00,4,39,,0
3117007,160.77,9381308.71,1,53,58352.358711,0
1168574,,5444467.00,4,39,,0
1168581,,5194731.29,4,39,,1
1168576,,4884239.00,4,39,,0
...,...,...,...,...,...,...
3402025,86.49,0.01,1,39,0.000116,0
3402026,68.03,0.01,1,39,0.000147,1
3402029,74.44,0.01,1,39,0.000134,0
3402030,75.94,0.01,1,39,0.000132,0


In [40]:
# Many prices are extremely high and unrealistic, so rows priced above 1000 are adjusted:
# the historical price is used when it exists, otherwise the current price divided by the number of nights.

# Only rows selected by this mask are touched
mask = train_data['price_usd_without_promo'] > 1000
outlier_rows = train_data.loc[mask]

# historical price where available, per-night fallback where it is NaN
# NOTE(review): the mask is derived from the column being overwritten, so re-running this cell divides
# any fallback value that is still above 1000 by the number of nights again.
train_data.loc[mask, 'price_usd_without_promo'] = outlier_rows['prop_historical_price'].fillna(
    outlier_rows['price_usd_without_promo'] / outlier_rows['srch_length_of_stay']
)

In [41]:
# Average price per night for each country after the outlier adjustment, weighted by the number of nights.
# The three needed columns are selected before .assign so that the temporary weighted_price column is
# added to a small frame rather than to a new frame holding every column of train_data.
weighted_avg = (
    train_data[['prop_country_id', 'price_usd_without_promo', 'srch_length_of_stay']]
    .assign(weighted_price=lambda df: df['price_usd_without_promo'] * df['srch_length_of_stay'])
    .groupby('prop_country_id')
    .agg(
        total_weighted_price=('weighted_price', 'sum'),
        total_stay=('srch_length_of_stay', 'sum')
    )
    .assign(weighted_avg_price_usd_without_promo=lambda df: df['total_weighted_price'] / df['total_stay'])
    .round(2)
    .reset_index()[['prop_country_id', 'weighted_avg_price_usd_without_promo']]
)

print(weighted_avg)

# quantile statistics
print(weighted_avg['weighted_avg_price_usd_without_promo'].describe()) # much much better than before

     prop_country_id  weighted_avg_price_usd_without_promo
0                  1                                223.43
1                  2                                137.58
2                  4                                149.53
3                  7                                132.99
4                  9                                130.96
..               ...                                   ...
167              224                                 85.82
168              225                                150.34
169              226                                166.79
170              229                                101.32
171              230                                241.12

[172 rows x 2 columns]
count     172.000000
mean      197.838140
std       129.457518
min        32.740000
25%       132.937500
50%       167.475000
75%       225.297500
max      1468.740000
Name: weighted_avg_price_usd_without_promo, dtype: float64


In [42]:
# Do any rows have a NaN promo-adjusted price?
price_is_missing = train_data['price_usd_without_promo'].isna()
print(train_data.loc[price_is_missing, ['prop_historical_price', 'price_usd_without_promo', 'srch_length_of_stay', 'prop_country_id', 'price_ratio', 'promotion_flag']].head(50)) # empty dataframe

Empty DataFrame
Columns: [prop_historical_price, price_usd_without_promo, srch_length_of_stay, prop_country_id, price_ratio, promotion_flag]
Index: []


In [None]:
# Catboost ranker for