In [1]:
import pandas as pd
import numpy as np

# 1. Load a sample of data (Expedia train.csv is HUGE, let's peek at the first 200k rows)
# If your machine is powerful, you can try nrows=1_000_000
df_sample = pd.read_csv('./data/train.csv', nrows=1_000_000)

print(f"Loaded rows: {len(df_sample)}")
print("-" * 30)

# ==========================================
# Check 1: Identify Booking vs No-Booking Sessions (The Outside Option)
# ==========================================
# Group by search_id to see if a booking happened in that session
session_summary = df_sample.groupby('srch_id')['booking_bool'].sum()

num_sessions = len(session_summary)
num_booked_sessions = (session_summary > 0).sum()
num_no_book_sessions = (session_summary == 0).sum()

print(f"Total Sessions (srch_id): {num_sessions}")
print(f"Sessions WITH Booking (Observed Sales): {num_booked_sessions} ({num_booked_sessions/num_sessions:.2%})")
print(f"Sessions WITHOUT Booking (Outside Option): {num_no_book_sessions} ({num_no_book_sessions/num_sessions:.2%})")

# ==========================================
# Check 2: Assortment Size Stats
# ==========================================
assortment_sizes = df_sample.groupby('srch_id').size()
print("\nAssortment Size Stats (Items per Search):")
print(assortment_sizes.describe())

# ==========================================
# Check 3: Feature Quality (Missing Values)
# ==========================================
# Potential features for utility function
features_to_check = [
    'prop_starrating', 
    'prop_review_score', 
    'prop_location_score1', 
    'prop_location_score2', 
    'price_usd',
    'promotion_flag'
]

print("\nMissing Values Count:")
print(df_sample[features_to_check].isnull().sum())

# ==========================================
# Check 4: Context Features (Z)
# ==========================================
# These are features associated with the user/search, not the item
context_cols = [
    'site_id', 'visitor_location_country_id', 
    'srch_length_of_stay', 'srch_booking_window',
    'srch_adults_count', 'srch_children_count',
    'srch_room_count', 'srch_saturday_night_bool'
]
print("\nContext Features Sample (First 5 rows):")
print(df_sample[context_cols].head())

Loaded rows: 1000000
------------------------------
Total Sessions (srch_id): 40173
Sessions WITH Booking (Observed Sales): 27738 (69.05%)
Sessions WITHOUT Booking (Outside Option): 12435 (30.95%)

Assortment Size Stats (Items per Search):
count    40173.000000
mean        24.892341
std          9.099532
min          5.000000
25%         18.000000
50%         30.000000
75%         32.000000
max         37.000000
dtype: float64

Missing Values Count:
prop_starrating              0
prop_review_score         1407
prop_location_score1         0
prop_location_score2    218352
price_usd                    0
promotion_flag               0
dtype: int64

Context Features Sample (First 5 rows):
   site_id  visitor_location_country_id  srch_length_of_stay  \
0       12                          187                    1   
1       12                          187                    1   
2       12                          187                    1   
3       12                          187           