In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import datetime
import logging
logger = logging.getLogger(__name__)
from kedro.framework.context import load_context
context = load_context('/Users/neil/Desktop/Hui_Yuan/Projects/pricing_airbnb-master', env='base')
catalog = context.catalog
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 300)
from functools import partial
from scipy.optimize import minimize_scalar
from prc.nodes.ds import embedding_model

  "Skipping re-loading from configuration path: {}".format(item)
  from collections import MutableMapping
  "Skipping re-loading from configuration path: {}".format(item)
  from google.protobuf.pyext import _message
  "Skipping re-loading from configuration path: {}".format(item)


In [2]:
# step 0 : clean data
def load_data():
    df = catalog.load("fea_output")
    df = df.set_index(["listing_id", "date"])
    return df

In [3]:
df = load_data()
df.head()

2020-06-16 13:41:28,952 - kedro.io.data_catalog - INFO - Loading data from `fea_output` (CSVDataSet)...


Unnamed: 0_level_0,Unnamed: 1_level_0,available,price,minimum_nights,neighbourhood_group,room_type,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,lag,month
listing_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8521,2019-05-23,f,5.298317,1.098612,,Entire home/apt,29,0.24,3,5.624018,0.0,5
8521,2019-05-24,f,5.298317,1.098612,,Entire home/apt,29,0.24,3,5.624018,0.693147,5
8521,2019-05-25,f,5.298317,1.098612,,Entire home/apt,29,0.24,3,5.624018,1.098612,5
8521,2019-05-26,f,5.298317,1.098612,,Entire home/apt,29,0.24,3,5.624018,1.386294,5
8521,2019-05-27,t,5.298317,1.098612,,Entire home/apt,29,0.24,3,5.624018,1.609438,5


In [6]:
# step 1: process data
def process_raw_data(df,
                     continuous_col = ["price", "minimum_nights", "lag", "number_of_reviews","calculated_host_listings_count"],
                     embedding_cols = ["neighbourhood_group", "room_type", "month"]):
    """
    
    :param df:
    :param continuous_col: ["price", "minimum_nights", "lag",...], the first element must be price.
    :param embedding_cols: the categorical data column
    :return:
    """
    df_select = df[continuous_col + embedding_cols + ["available"]]
    if df_select['neighbourhood_group'].isnull().sum()==len(df_select):
        df['neighbourhood_group']=np.ones((len(df['neighbourhood_group']))).astype(int)
    df_select = df_select.dropna(subset=embedding_cols + continuous_col, how="any")
    df_true = df_select[df_select["available"] == "t"]
    df_false = df_select[df_select["available"] != "t"]
    if len(df_true) < len(df_false):
        df_sampled = pd.concat([df_false.sample(n=len(df_true)), df_true], axis=0)
    else:
        df_sampled = pd.concat([df_false, df_true.sample(n=len(df_false))], axis=0)

    embed_lookup, x_embed_raw = embedding_model.build_embedding_dic(df_sampled, embedding_cols)
    return df_sampled, continuous_col, embedding_cols, embed_lookup, x_embed_raw

In [7]:
df_sampled, continuous_col, embedding_cols, embed_lookup, x_embed_raw = process_raw_data(df)
df_sampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,price,minimum_nights,lag,number_of_reviews,calculated_host_listings_count,neighbourhood_group,room_type,month,available
listing_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15391083,2019-06-10,4.174387,0.0,2.944439,162,2,1,Private room,6,f
19402921,2019-06-04,4.624973,1.609438,2.564949,0,1,1,Entire home/apt,6,f
30662073,2019-06-02,3.931826,0.693147,2.397895,20,3,1,Private room,6,f
14449525,2019-05-30,4.941642,0.693147,2.079442,133,1,1,Entire home/apt,5,f
3475883,2019-07-07,4.174387,1.098612,3.828641,33,4,1,Private room,7,f


In [8]:
# step 2: train_model
def train_model(df, continuous_col, embed_lookup, x_embed_raw, epochs):
    num_dimensions = len(continuous_col)
    num_samples = len(df)
    model = embedding_model.build_model(num_dimensions, num_samples, embed_lookup, lr=0.001)
    x_val = df[continuous_col].values
    X = [x_val, x_embed_raw[0], x_embed_raw[1], x_embed_raw[2]]
    y = (df["available"] == "t").astype(int)
    model.fit(X, y, epochs=epochs)
    # the parameter of w and bias
    candidate_w_bs = [(model.layers[-1].kernel_posterior.sample().numpy(),
                       model.layers[-1].bias_posterior.sample().numpy())]

    return model

In [9]:
model = train_model(df_sampled, continuous_col, embed_lookup, x_embed_raw, 10)


Instructions for updating:
Please use `layer.add_weight` method instead.
Epoch 1/10


  if not isinstance(values, collections.Sequence):


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# step3.1 deal with data
def series_row(row, embedding_cols, continuous_col, embed_lookup) :
    
    continuous = np.array(row[continuous_col].values.tolist()).reshape(-1, len(continuous_col))
    input_row = [continuous]
    for key,val in embed_lookup.items():
        input_row.append(np.array([val[row[key]]]))
    return input_row
    

# step3.2 calc revenue
def cal_neg_revnue(price, input_model, row, embed_lookup, continuous_col):
    """ only change price to figure out the accept rate """
    continuous = np.append(np.array(price),row[0][0][1:]).reshape(-1,len(continuous_col))
    new_price_row = [continuous]
    for i in range(1,len(row)):
        new_price_row.append(row[i])
    prob = input_model.predict(new_price_row)
    return -1 * price * prob[0][0]


# step3.3 estimate price and prob
def optimal_sellable_capacity(model=None, row=None, method="Bounded",
                              min_search_bound=1, max_search_bound=10,
                              embed_lookup=None, continuous_col=None):
    func = partial(
        cal_neg_revnue,
        input_model=model,
        row=row,
        embed_lookup=embed_lookup,
        continuous_col=continuous_col

    )

    min_ = minimize_scalar(
        func, bounds=(min_search_bound, max_search_bound), method=method
    )

    best_price = min_.x

    x0 = np.append(np.array(best_price), row[0][0][1:]).reshape(-1, len(continuous_col))
    new_price_row = [x0]
    for i in range(1, len(row)):
        new_price_row.append(np.array(row[i]))
    prob = model.predict(new_price_row)
    return best_price, prob[0][0]

# step3.4 combined
def wrapper_price_func(model=None, row=None, continuous_col=None, embed_lookup= None,
                       min_search_bound=1, max_search_bound=10):
    input_row = series_row(row, embedding_cols, continuous_col, embed_lookup)
    est_price, prob = optimal_sellable_capacity(model=model, row=input_row,
                                                min_search_bound=min_search_bound,
                                                max_search_bound=max_search_bound,
                                                continuous_col=continuous_col,
                                                embed_lookup=embed_lookup)
    return est_price, prob


# step 3.5 apply to dataframe
def run_estimate_price(df,model,continuous_col,embedding_cols, embed_lookup):
    
    
    df_sampled_new = df.reset_index(level=['listing_id','date'])
    index_series = df_sampled_new[['listing_id','date']]
    df_sampled_new = df_sampled_new[continuous_col + embedding_cols + ['listing_id','date']] 
    
    
    ### select sample to test 
    ### date:'2019-05-01' to '2019-6-01' inclusive
    ### test first 20 rows
    sample_num = 20
    df_sampled_new = df_sampled_new[df_sampled_new['date'].between('2019-05-01', '2019-6-01', inclusive=True)]
    df_new = df_sampled_new.iloc[:sample_num, :-2].reset_index(drop=True)
    # estimate best rental price 
    df_new['price_prob'] = df_test.apply(lambda x: wrapper_price_func(min_search_bound=1,max_search_bound=10,
                                                continuous_col=continuous_col, embed_lookup= embed_lookup,
                                                model= model, row = x)
                                        , axis=1)
    df_new['estimate_price'] = df_new['price_prob'].apply(lambda x: '$' + str(round(np.exp(x[0]), 2)))
    df_new['prob'] = df_new['price_prob'].apply(lambda x: str(round(x[1] * 100, 2)) + '%')
    df_new = df_new.drop("price_prob", axis=1)
    
    # get back log columns
    log_cols = ["price", "minimum_nights", "lag"]
    df_new[log_cols] = np.exp(df_new[log_cols])

    # change price, minimum_nights, lag format
    df_new['price'] = df_new['price'].apply(lambda x: '$' + str(round(x, 2)))
    df_new['minimum_nights'] = df_new['minimum_nights'].apply(lambda x: int(x))
    df_new['lag'] = df_new['lag'].apply(lambda x: int(x))

    # reset index
    index_new_listId = index_series.iloc[:sample_num, 0]
    index_new_date = index_series.iloc[:sample_num, 1]
    df_new = df_new.set_index([index_new_listId, index_new_date])
    return df_new



In [37]:
df_new = run_estimate_price(df,model,continuous_col,embedding_cols, embed_lookup)
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,price,minimum_nights,lag,number_of_reviews,calculated_host_listings_count,neighbourhood_group,room_type,month,estimate_price,prob
listing_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8521,2019-05-23,$200.0,3,1,29,3,1,Entire home/apt,5,$21435.83,93.49%
8521,2019-05-24,$200.0,3,2,29,3,1,Entire home/apt,5,$21019.74,92.62%
8521,2019-05-25,$200.0,3,3,29,3,1,Entire home/apt,5,$18186.3,92.41%
8521,2019-05-26,$200.0,3,4,29,3,1,Entire home/apt,5,$20730.06,94.18%
8521,2019-05-27,$200.0,3,5,29,3,1,Entire home/apt,5,$21601.92,94.45%
8521,2019-05-28,$200.0,3,6,29,3,1,Entire home/apt,5,$21752.77,91.84%
8521,2019-05-29,$400.0,3,6,29,3,1,Entire home/apt,5,$18977.57,93.51%
8521,2019-05-30,$400.0,3,8,29,3,1,Entire home/apt,5,$18619.68,90.86%
8521,2019-05-31,$400.0,3,9,29,3,1,Entire home/apt,5,$19560.46,93.05%
8521,2019-06-01,$400.0,3,10,29,3,1,Entire home/apt,6,$21179.34,97.07%


In [None]:
continuous_col = ["price", "minimum_nights", "lag", "number_of_reviews","calculated_host_listings_count"]
embedding_cols = ["neighbourhood_group", "room_type", "month"]