# Importing helper libraries

In [None]:
# Environment setup: upgrade pip itself, then install the notebook's dependencies.
!pip install --upgrade pip
# NOTE(review): snowflake-snowpark-python is listed twice on the next line (once with the
# [pandas] extra, once pinned to 1.9.0); every other package on that line is unpinned.
!pip install "snowflake-connector-python[pandas]" "snowflake-snowpark-python[pandas]" snowflake-snowpark-python==1.9.0 fosforio fosforml numpy pandas matplotlib scikit-learn xgboost seaborn python-dateutil tqdm holidays faker
# Re-apply the snowflake-snowpark-python 1.9.0 pin.
!pip install --upgrade --q snowflake-snowpark-python==1.9.0
# Replace the installed urllib3 with version 1.26.15.
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15

In [2]:
# NOTE(review): `fosforml==` has no version after `==`; presumably a trick to make pip
# print the available versions — confirm, and remove this line if it is not needed.
!pip install fosforml==
# List the installed package versions.
# NOTE(review): written without `!`/`%`, so this presumably relies on IPython automagic — verify.
pip freeze

aiohttp==3.9.3
aiosignal==1.3.1
alembic==1.13.1
anyio==4.2.0
archspec @ file:///croot/archspec_1697725767277/work
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
asn1crypto==1.5.1
asttokens==2.4.1
async-generator==1.10
async-timeout==4.0.3
attrs==23.2.0
Babel==2.14.0
beautifulsoup4==4.12.3
bleach==4.1.0
boltons @ file:///croot/boltons_1677628692245/work
bqplot==0.12.42
Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work
certifi==2024.7.4
certipy==0.1.3
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
clickclick==20.10.2
cloudpickle==2.2.1
comm==0.2.1
conda @ file:///croot/conda_1701719518285/work
conda-content-trust @ file:///tmp/abs_5952f1c8-355c-4855-ad2e-538535021ba5h26t22e5/croots/recipe/conda-content-trust_1658126371814/work
conda-libmamba-solver @ file:///croot/conda-libmamba-solver_1702997573971/work/src
conda-package-handling @ file:///croot/conda-package-handling_1690999929514/work
conda_package_streaming @ file:///croot/conda-package-streamin

In [1]:
from fosforio import snowflake
from fosforml import *
from fosforml.constants import MLModelFlavours
from fosforio import get_dataframe
from matplotlib import pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import warnings; warnings.simplefilter('ignore')
from joblib import dump, load
import requests
from tqdm import tqdm
import time
import calendar

from time import sleep
import configparser
from dateutil.relativedelta import relativedelta
import datetime
from dateutil.easter import easter
from scipy.optimize import minimize_scalar
from scipy.optimize import curve_fit
import holidays, itertools

%matplotlib inline

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# Reading the data

In [None]:
# Load the bookings CSV from the current working directory.
data = pd.read_csv("./bookings_transformed_latest.csv")

In [None]:
from scipy.optimize import curve_fit

# Define the demand curve function
def demand_curve(x, a, b, c, d, max_demand):
    """Evaluate the capped exponential demand model at ``x``.

    The uncapped value is ``a * exp(-b * x) + c``. Wherever ``x <= max_demand``
    that value is limited to at most ``max_demand``; elsewhere it is left as is.
    The offset ``d`` is added after the cap.

    Parameters
    ----------
    x : scalar or numpy array
        Point(s) at which the curve is evaluated (the callers pass prices).
    a, b, c, d : float
        Curve coefficients.
    max_demand : float
        Cap value; note that it is also the threshold ``x`` is compared against.

    Returns
    -------
    numpy scalar or array with the same shape as ``x``.
    """
    uncapped = c + a * np.exp(-b * x)
    capped = np.minimum(uncapped, max_demand)
    return np.where(x <= max_demand, capped, uncapped) + d

In [None]:
def revenue(price, params=None):
    """Return ``price`` multiplied by the demand that ``demand_curve`` predicts at ``price``.

    Parameters
    ----------
    price : scalar or numpy array
        Price(s) at which revenue is evaluated.
    params : sequence of five values ``(a, b, c, d, max_demand)``, optional
        Coefficients forwarded to ``demand_curve``. When omitted, the
        notebook-level names ``a_fit``, ``b_fit``, ``c_fit``, ``d_fit`` and
        ``max_demand`` are used (the original behaviour); a ``NameError`` is
        raised if they have not been defined in the kernel.

    Returns
    -------
    Same type as ``price * demand_curve(price, ...)``.
    """
    if params is None:
        # Legacy path: read the fitted coefficients from notebook globals.
        params = (a_fit, b_fit, c_fit, d_fit, max_demand)
    return price * demand_curve(price, *params)

In [None]:
from scipy.optimize import brentq

def demand_to_price(num_rooms, a, b, c, d, max_demand, price_range=(0, 200), rng=None):
    """Find the price at which ``a * exp(-b * price) + c`` equals ``num_rooms``.

    The equation is solved with ``brentq`` inside ``price_range``. If ``brentq``
    raises ``ValueError`` (for example because the function has the same sign at
    both ends of the bracket, so no root is bracketed), a price drawn uniformly
    from ``price_range`` is returned instead.

    Parameters
    ----------
    num_rooms : float
        Target demand (the caller passes the mean room limit).
    a, b, c : float
        Coefficients of the exponential term.
    d, max_demand : float
        Accepted but not used by the computation.
        NOTE(review): ``demand_curve`` also applies a cap and adds ``d``; this
        inversion ignores both — confirm that this is intended.
    price_range : tuple(float, float), optional
        ``(low, high)`` bracket searched by ``brentq`` and sampling interval of
        the fallback. Defaults to ``(0, 200)``.
    rng : object with a ``uniform(low, high)`` method, optional
        Random source for the fallback (e.g. ``np.random.default_rng(seed)``),
        allowing reproducible results. Defaults to the global ``np.random``.

    Returns
    -------
    float
        The root found by ``brentq`` or the randomly drawn fallback price.
    """
    low, high = price_range

    def root_func(x):
        return num_rooms - (a * np.exp(-b * x) + c)

    try:
        return brentq(root_func, low, high)
    except ValueError:
        # No root could be bracketed: fall back to a random price in the range.
        sampler = np.random if rng is None else rng
        return sampler.uniform(low, high)

In [None]:
def generate_holiday_dates(start_year, end_year):
    """Build a ``{date: label}`` mapping of fixed holidays for a range of years.

    For every year from ``start_year`` to ``end_year`` (both inclusive) the
    mapping contains 1 January (``'new_year'``), the date returned by
    ``easter(year)`` (``'easter'``) and 25 December (``'christmas'``).

    Returns
    -------
    dict
        Keys are ``datetime.date`` objects, values are the label strings.
    """
    labelled_dates = {}
    for yr in range(start_year, end_year + 1):
        labelled_dates.update({
            datetime.date(yr, 1, 1): 'new_year',
            easter(yr): 'easter',
            datetime.date(yr, 12, 25): 'christmas',
        })
    return labelled_dates

# NOTE: this rebinds the global name `holidays`, which the imports cell bound to the
# `holidays` package; from here on `holidays` is the dict built above.
holidays = generate_holiday_dates(2020, 2023)

In [None]:
def data_conversion(data):
    """Expand each booking into one row per stay night.

    Parameters
    ----------
    data : pandas.DataFrame
        Bookings with the columns ``hotel``, ``reserved_room_type``,
        ``expected_arrival_date``, ``adr``, ``room_limit`` and ``total_rns``
        (number of nights of the booking).

    Returns
    -------
    pandas.DataFrame
        One row per booking and night with the columns ``hotel``, ``room_type``,
        ``arrival_date`` (consecutive days starting at the expected arrival
        date), ``total_rns`` (always 1), ``adr`` (rounded to 2 decimals),
        ``room_limit``, ``dow`` (weekday name) and ``month`` (month name),
        sorted by ``arrival_date``. Empty input yields an empty frame with the
        same columns.

    Notes
    -----
    Bookings for which the expansion raises ``ValueError`` are reported with
    ``print`` and skipped.
    """
    booking_columns = ['hotel', 'reserved_room_type', 'expected_arrival_date',
                       'adr', 'room_limit', 'total_rns']

    # Collect the per-booking frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop is quadratic in the number of bookings.
    nightly_frames = []
    for booking in data[booking_columns].itertuples(index=False):
        try:
            # Create a row for each stay date
            nightly_frames.append(pd.DataFrame({
                'hotel': booking.hotel,
                'room_type': booking.reserved_room_type,
                'arrival_date': pd.date_range(start=booking.expected_arrival_date,
                                              periods=booking.total_rns),
                'total_rns': 1,
                'adr': booking.adr,
                'room_limit': booking.room_limit,
            }))
        except ValueError as e:
            print(f"Error processing booking for {booking.hotel} on {booking.expected_arrival_date} : {booking.total_rns} {e}")

    if nightly_frames:
        expanded_df = pd.concat(nightly_frames, ignore_index=True)
    else:
        # Nothing to expand: build a typed empty frame so the steps below
        # (sorting, rounding, the .dt accessor) still work.
        expanded_df = pd.DataFrame({
            'hotel': pd.Series(dtype='object'),
            'room_type': pd.Series(dtype='object'),
            'arrival_date': pd.Series(dtype='datetime64[ns]'),
            'total_rns': pd.Series(dtype='int64'),
            'adr': pd.Series(dtype='float64'),
            'room_limit': pd.Series(dtype='float64'),
        })

    expanded_df = expanded_df.sort_values('arrival_date').reset_index(drop=True)
    expanded_df['adr'] = np.round(expanded_df['adr'], 2)

    expanded_df['dow'] = expanded_df['arrival_date'].dt.strftime('%A')
    expanded_df['month'] = expanded_df['arrival_date'].dt.strftime('%B')

    return expanded_df

# total_rns = week nights + weekend nights of each booking.
data['total_rns'] = data['stays_in_week_nights'] + data['stays_in_weekend_nights']


In [None]:
# Keep bookings that were not cancelled, are not no-shows, are not in the
# 'Complementary' market segment, and were reserved as room type A, D or E.
data = data[
    (data['is_canceled'] == 0)
    & (data['reservation_status'] != 'No-Show')
    & (data['market_segment'] != 'Complementary')
    & data['reserved_room_type'].isin(['A', 'D', 'E'])
]

# Expand the remaining bookings to one row per stay night.
converted_df = data_conversion(
    data[['hotel', 'reserved_room_type', 'expected_arrival_date', 'adr', 'room_limit', 'total_rns']]
)
converted_df

In [None]:
@scoring_func
def format_data(request, call=None):
    """Fit a demand curve per segment and return revenue-optimal rates per calendar day.

    The file uploaded as ``request.files["file1"]`` is saved to
    ``./input_file.csv`` and read back as a CSV. It must contain the columns
    ``arrival_date``, ``dow``, ``month``, ``hotel``, ``room_type``, ``adr``,
    ``room_limit`` and ``total_rns``.

    For every (hotel, room_type, month, dow) segment the function:

    1. builds an expected-demand value per ``adr`` (cumulative expected room
       nights at that rate or higher),
    2. drops rates more than 2 standard deviations from the segment mean,
    3. fits ``demand_curve`` with ``curve_fit``,
    4. maximises ``price * demand_curve(price)`` for prices in [45, 200],
    5. inverts the curve at the mean room limit with ``demand_to_price``.

    Parameters
    ----------
    request : object exposing ``files["file1"]``
        Source of the uploaded CSV.
    call : optional
        Not used in this function.
        NOTE(review): presumably part of the ``scoring_func`` calling
        convention — confirm.

    Returns
    -------
    pandas.DataFrame
        One row per day from 2020-01-01 to 2023-12-31 and per hotel/room-type
        combination, left-joined with the fitted results (``optimal_rate``,
        ``expected_rn``, ``expected_rev``, ``optimal_rate_lim_inv``,
        ``room_limit``).

    Raises
    ------
    ValueError
        From the final ``astype(int)`` when a calendar row has no fitted
        segment (``room_limit`` is NaN after the left join).
    """
    # Persist the uploaded file, then load it back as a DataFrame.
    file_obj = FileStorage(request.files["file1"])
    file_obj.save('./input_file.csv')

    expanded_df = pd.read_csv('./input_file.csv')

    hotel_types = ['Resort Hotel', 'City Hotel']
    room_types = ['A', 'D', 'E']
    segment_keys = ['dow', 'month', 'hotel', 'room_type']
    result_columns = ['month', 'hotel', 'room_limit', 'room_type', 'dow', 'optimal_rate',
                      'expected_rn', 'expected_rev', 'optimal_rate_lim_inv']

    # Total stays per arrival date and segment ...
    daily_rns = (expanded_df
                 .groupby(['arrival_date'] + segment_keys)
                 .agg({'room_limit': 'mean', 'total_rns': 'sum'})
                 .reset_index())
    # ... then sum/mean/median of those daily totals per segment.
    daily_rns = (daily_rns
                 .groupby(segment_keys)
                 .agg({'room_limit': 'mean', 'total_rns': ['sum', 'mean', 'median']})
                 .reset_index())
    # Flatten the two-level columns: 'dow_', 'month_', ..., 'total_rns_median'.
    daily_rns.columns = ['_'.join(col) for col in daily_rns.columns]

    # Room nights sold at each rate within a segment.
    adr_frequency = (expanded_df
                     .groupby(['dow', 'month', 'adr', 'hotel', 'room_type'])
                     .agg({'room_limit': 'mean', 'total_rns': 'sum'})
                     .reset_index())
    merged_df = pd.merge(adr_frequency, daily_rns, how='left',
                         left_on=segment_keys,
                         right_on=['dow_', 'month_', 'hotel_', 'room_type_'],
                         suffixes=('_act', '_tot'))
    merged_df = merged_df.drop(['dow_', 'month_'], axis=1)

    merged_df['probability'] = merged_df['total_rns'] / merged_df['total_rns_sum']
    merged_df['expected_rns'] = merged_df['probability'] * merged_df['total_rns_median']
    merged_df = merged_df.sort_values(by=['dow', 'month', 'adr'], ascending=[True, True, False])
    # Accumulate from the highest rate downwards *within each segment*. Grouping
    # by dow/month only would mix the hotels and room types that the
    # probabilities above and the fits below treat separately.
    merged_df['expected_demand'] = merged_df.groupby(segment_keys)['expected_rns'].cumsum()
    merged_df['expected_rev'] = merged_df['adr'] * merged_df['expected_demand']

    months = merged_df.month.unique()
    dow = merged_df.dow.unique()

    records = []
    # `hotel_types` is the list defined above (iterating an undefined `hotels`
    # raised NameError).
    for hotel, room_type, month, day in itertools.product(hotel_types, room_types, months, dow):
        # Get data for the specific combination
        data_subset = merged_df[(merged_df['dow'] == day) &
                                (merged_df['hotel'] == hotel) &
                                (merged_df['room_type'] == room_type) &
                                (merged_df['month'] == month)].reset_index()

        if data_subset.empty:
            continue

        # Remove outliers (|z-score| of the rate above 2).
        adr_mean = data_subset['adr'].mean()
        adr_std = data_subset['adr'].std()
        data_subset['z_scores'] = np.abs((data_subset['adr'] - adr_mean) / adr_std)
        data_subset = data_subset[data_subset['z_scores'] <= 2]

        if data_subset.empty:
            # Happens when the z-scores are NaN (a single rate, or all rates
            # equal); without this guard the `.values[0]` below raises IndexError.
            print(f"No usable rate points for {hotel}, {room_type}, {month}, {day} after outlier removal; skipping")
            continue

        # Fit demand curve
        x_data = data_subset['adr'].values
        y_data = data_subset['expected_demand'].values

        try:
            initial_guess = [1, 0.01, 1, 1, data_subset['total_rns_median'].values[0]]
            bounds = ([0, 0, 0, 0, 0], [np.inf, np.inf, np.inf, np.inf, np.inf])
            maxfev = 10000  # Increase the number of maximum function evaluations
            params, _ = curve_fit(demand_curve, x_data, y_data, bounds=bounds, p0=initial_guess, maxfev=maxfev)
        except RuntimeError as e:
            print(f"Error fitting demand curve for {hotel}, {room_type}, {month}, {day}: {e}")
            continue

        a_fit, b_fit, c_fit, d_fit, max_demand = params

        # Optimize revenue: minimise the negative of price * predicted demand.
        objective = lambda price: -(price * demand_curve(price, a_fit, b_fit, c_fit, d_fit, max_demand))
        optimize = minimize_scalar(objective, bounds=(45, 200), method='bounded')
        optimal_price = optimize.x
        max_revenue = -optimize.fun
        expected_rns = demand_curve(optimal_price, a_fit, b_fit, c_fit, d_fit, max_demand)

        mean_room_limit = data_subset['room_limit'].mean()
        optimal_rate_lim_inv = demand_to_price(mean_room_limit, a_fit, b_fit, c_fit, d_fit, max_demand)

        records.append({'hotel': hotel,
                        'room_type': room_type,
                        'room_limit': mean_room_limit,
                        'month': month,
                        'dow': day,
                        'optimal_rate': optimal_price,
                        'expected_rev': max_revenue,
                        'expected_rn': expected_rns,
                        'optimal_rate_lim_inv': optimal_rate_lim_inv})

    # Build the results frame once instead of concatenating row by row.
    results = pd.DataFrame(records, columns=result_columns)

    results['optimal_rate'] = results['optimal_rate'].round()
    results['optimal_rate_lim_inv'] = results['optimal_rate_lim_inv'].round()

    results['expected_rn'] = results['expected_rn'].round().astype(int)
    results['expected_rev'] = results['expected_rev'].round()

    combinations_df = pd.DataFrame(list(itertools.product(hotel_types, room_types)),
                                   columns=['hotel', 'room_type'])

    # Every day of 2020-2023, cross-joined with every hotel/room-type combination.
    calendar_df = pd.DataFrame({'arrival_date': pd.date_range('2020-01-01', '2023-12-31', freq='D')})
    calendar_df['dow'] = calendar_df['arrival_date'].dt.day_name()
    calendar_df['month'] = calendar_df['arrival_date'].dt.month_name()
    new_data = (calendar_df.assign(key=1)
                .merge(combinations_df.assign(key=1), on='key')
                .drop('key', axis=1))

    final_data = pd.merge(new_data, results, how='left', on=['dow', 'hotel', 'room_type', 'month'])
    print(final_data.isna().sum())
    # Raises if any calendar row has no fitted segment (NaN room_limit).
    final_data['room_limit'] = final_data['room_limit'].astype(int)
    return final_data


# req = requests.Request()
# req.files = {'file1': 'sample.csv'}
# score(clf, req)

In [None]:
# Write the expanded per-night dataset to the working directory (row index omitted).
converted_df.to_csv('./dates_dataset.csv', index=False)

In [None]:
# NOTE(review): format_data reads `request.files["file1"]`, but a DataFrame is passed
# here — confirm that the `scoring_func` wrapper accepts this, otherwise pass a
# request-like object carrying the CSV file.
format_data(converted_df)

In [None]:
# Register the scoring function as the model "predict_optimal_price".
# The last pin in init_script uses `==`: a single `=` (as in `scikit-learn=1.3.2`)
# is not a valid pip version specifier and makes pip reject the requirement.
register_model(None, 
               format_data, 
               name="predict_optimal_price", 
               description="predict_optimal_price",
               flavour=MLModelFlavours.sklearn,
               init_script="\\n pip install fosforml \\n pip install fosforio[snowflake] \\n pip install seaborn \\n pip install snowflake-connector-python[pandas] \\n pip install joblib==1.3.2 scikit-learn==1.3.2",
)