In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import gc
import time
import os
import pickle
from scipy.stats import pointbiserialr


import lightgbm
from lightgbm import early_stopping, log_evaluation

In [8]:
def load_data(file_path):
    """Read a CSV into a DataFrame, logging progress before and after.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file_path``.
    """
    start_banner = '--------Loading original data from file: {}--------'
    done_banner = '--------Data loaded successfully---------'

    # Release whatever garbage is pending before the frame is read.
    gc.collect()
    print(start_banner.format(file_path))
    frame = pd.read_csv(file_path)
    print(done_banner)
    return frame

def get_target(data):
    """Add a graded relevance column ``target`` derived from click/booking flags.

    The label is 5 when ``booking_bool == 1``, 1 when only ``click_bool == 1``
    and 0 otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``click_bool`` and ``booking_bool``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the ``target`` column added.
    """
    target_col = 'target'
    # np.select returns the choice of the FIRST condition that holds, so the
    # stronger signal (booking) has to be tested before the weaker one (click).
    # With the click test first, a row with both flags set was labelled 1.
    conditions = [
        data["booking_bool"] == 1,
        data["click_bool"] == 1,
    ]
    choices = [5, 1]
    data[target_col] = np.select(conditions, choices, default=0)
    return data
    

def merge_similar_data(data):
    """Collapse the per-competitor columns into three row-wise averages.

    For each of the three competitor column families (rate, inv,
    rate_percent_diff) the row-wise mean over the matching ``comp1..comp8``
    columns is stored as ``comp_rate``, ``comp_inv`` and
    ``comp_rate_percent_diff``. Every column whose name starts with
    ``comp1`` .. ``comp8`` is then removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place (columns are added and dropped on this object).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    family_patterns = {
        'comp_rate': r'^comp[12345678]_rate\d*$',
        'comp_inv': r'^comp[12345678]_inv\d*$',
        'comp_rate_percent_diff': r'^comp[12345678]_rate_percent_diff\d*$',
    }

    # Compute every average first, then attach them in a fixed order.
    averages = {
        merged_name: data.filter(regex=pattern).mean(axis=1)
        for merged_name, pattern in family_patterns.items()
    }
    for merged_name, row_mean in averages.items():
        data[merged_name] = row_mean

    # The merged names ("comp_...") do not match this pattern, so they survive.
    per_competitor_cols = data.filter(regex=r'^comp[1-8]').columns
    data.drop(labels=per_competitor_cols, axis=1, inplace=True)
    return data

def drop_low_value_attributes(data):
    """Drop a fixed list of low-value columns, skipping any that are absent.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the low-value columns.
    """
    # Background for the hard-coded list: in a point-biserial correlation
    # analysis against 'click_bool' (run on raw data, '1':'0' == 50:2, and on
    # balanced data, '1':'0' == 1:1), the columns 'prop_review_score',
    # 'prop_location_score2', 'srch_query_affinity_score', 'comp_rate' and
    # 'comp_inv' showed a positive or statistically significant correlation and
    # are therefore kept. The columns below are dropped.
    #
    # The previous version also computed the list of sparse columns (two full
    # isnull() scans of the frame) and never used it; that work is removed.
    low_value_col = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'orig_destination_distance', 'comp_rate_percent_diff', 'gross_bookings_usd']
    low_value_col = [c for c in low_value_col if c in data.columns.values]
    data = data.drop(labels=low_value_col, axis=1)
    return data

def add_date_features(data, datetime_key = 'date_time'):
    """Derive ``month`` and ``quarter`` from a timestamp column, then drop it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the timestamp column. ``month`` and ``quarter`` are
        written onto this object.
    datetime_key : str
        Name of the column to parse with ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A frame with the two new columns and without ``datetime_key``.
    """
    timestamp_parts = pd.to_datetime(data[datetime_key]).dt
    data['month'] = timestamp_parts.month
    data['quarter'] = timestamp_parts.quarter
    # The raw timestamp is no longer needed once the parts are extracted.
    return data.drop(labels=datetime_key, axis=1)


def add_mean_position_features(data):
    """Attach the mean listing position of each (destination, property) pair.

    Only rows with ``random_bool == 0`` contribute to the mean. The mean is
    truncated to an integer before it is joined back, and pairs that never
    occur with ``random_bool == 0`` get a missing ``mean_position``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``srch_destination_id``, ``prop_id``, ``position`` and
        ``random_bool``.

    Returns
    -------
    pandas.DataFrame
        Left-merged frame with the extra ``mean_position`` column.
    """
    join_keys = ["srch_destination_id", "prop_id"]

    non_random_rows = data.loc[data['random_bool'] == 0]
    mean_position = (
        non_random_rows.groupby(join_keys)["position"]
        .mean()
        .astype(int)  # truncates the fractional part of the mean
        .rename("mean_position")
        .reset_index()
    )
    return data.merge(mean_position, how='left', on=join_keys)

def add_diff_features(data, groupby_key, target_col):
    """Add ``<target_col>_diff``: each value minus its group's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place.
    groupby_key : str
        Column that defines the groups.
    target_col : str
        Column to centre within each group.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new column.
    """
    diff_col = f'{target_col}_diff'
    # transform('mean') broadcasts the group mean back onto every row.
    group_mean = data.groupby(groupby_key)[target_col].transform('mean')
    data[diff_col] = data[target_col].sub(group_mean)
    return data

def add_prop_pop_score(data):
    """Add ``pop_score``: a per-property weighted click/booking rate.

    For every ``prop_id`` the score is
    ``(5 * bookings + 1 * clicks) / number_of_rows_for_that_prop_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``prop_id``, ``click_bool`` and ``booking_bool``. Modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the ``pop_score`` column.
    """
    per_property = data.groupby('prop_id')

    # Total clicks and bookings per property, combined with weights 1 and 5.
    totals = per_property[['click_bool', 'booking_bool']].sum()
    weighted_sum = totals['booking_bool'] * 5 + totals['click_bool'] * 1

    # How many rows each property appears in.
    appearances = per_property.size()

    # Weighted total divided by appearances, mapped back onto every row.
    pop_score = weighted_sum / appearances
    data['pop_score'] = data['prop_id'].map(pop_score)
    return data

def normalize_features(input_df, group_key, target_column, take_log10=False):
    """Add a within-group z-score of ``target_column``.

    The new column is named ``<target_column>_norm_by_<group_key>`` and holds
    ``(value - group mean) / group std``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame. When ``take_log10`` is true its ``target_column`` is
        overwritten in place with the log-transformed values.
    group_key : str
        Column defining the groups.
    target_column : str
        Column to normalise.
    take_log10 : bool
        If true, replace ``target_column`` by ``log10(value + 1e-4)`` before
        the statistics are computed.

    Returns
    -------
    pandas.DataFrame
        The result of merging ``input_df`` with the per-group statistics,
        carrying the new normalised column (the helper mean/std columns are
        removed again).
    """
    # Small offset for numerical stability of the logarithm.
    epsilon = 1e-4
    if take_log10:
        input_df[target_column] = np.log10(input_df[target_column] + epsilon)

    mean_col = target_column + "_mean"
    std_col = target_column + "_std"
    norm_col = target_column + "_norm_by_" + group_key

    # One row per group holding that group's mean and standard deviation.
    group_stats = (
        input_df.groupby(group_key)[target_column]
        .agg(["mean", "std"])
        .rename(columns={"mean": mean_col, "std": std_col})
        .reset_index()
    )

    merged = input_df.merge(group_stats, on=group_key)
    centred = merged[target_column] - merged[mean_col]
    merged[norm_col] = centred / merged[std_col]
    merged = merged.drop(labels=[mean_col, std_col], axis=1)

    gc.collect()
    return merged

def normalization(input_df):
    """Apply every within-group normalisation used by the pipeline.

    Runs ``normalize_features`` once per entry of the plan below, in order.
    The first entry log-transforms ``price_usd`` in place, so the later
    per-``prop_id`` normalisation of ``price_usd`` works on the logged values.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing ``srch_id``, ``prop_id`` and the listed columns.

    Returns
    -------
    pandas.DataFrame
        Frame with one ``<column>_norm_by_<group>`` column per plan entry.
    """
    # (group_key, target_column, take_log10)
    plan = (
        ("srch_id", "price_usd", True),
        ("prop_id", "price_usd", False),
        ("srch_id", "prop_starrating", False),
        ("srch_id", "prop_location_score2", False),
        ("srch_id", "prop_location_score1", False),
        ("srch_id", "prop_review_score", False),
    )
    for group_key, target_column, use_log in plan:
        input_df = normalize_features(
            input_df,
            group_key=group_key,
            target_column=target_column,
            take_log10=use_log,
        )
    return input_df
    

def train_data_preprocessing(data, kind='train'):
    """Run the full cleaning and feature-engineering pipeline on a raw frame.

    Steps, in order: merge the competitor columns, drop low-value columns,
    derive date features, add per-search difference features, build the
    ``target`` label (training only), add normalised features, sort by
    ``srch_id`` and finally remove ``click_bool``/``booking_bool`` when present.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data. Note that ``merge_similar_data`` edits this object in place.
    kind : str
        ``'train'`` builds the ``target`` column from the click/booking flags;
        any other value skips that step.

    Returns
    -------
    pandas.DataFrame
        The processed frame, sorted by ``srch_id`` with a fresh index.
    """
    print('--------Preprocessing training data--------')
    # Step 1: merge columns of the same kind (per-competitor columns).
    data = merge_similar_data(data)
    # Step 2: drop columns that have many missing values and only a weak
    # relation to 'click_bool'.
    data = drop_low_value_attributes(data)
    print('--------Low value cols have been dropped---------')

    # ---- feature engineering ----
    # Step 3: extract date features.
    data = add_date_features(data)

    # Step 4: within one search, people tend to click or book hotels that are
    # cheaper, have higher star ratings, better reviews or a better location,
    # so express each property attribute relative to the search average.
    prop_attr = ['price_usd', 'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2']
    for col in prop_attr:
        data = add_diff_features(data, 'srch_id', col)

    # NOTE: the mean-position and popularity features
    # (add_mean_position_features / add_prop_pop_score) are intentionally not
    # applied in this version of the pipeline.

    # Step 5: build the target column (0, 1, 5) from click/booking; a larger
    # value means a higher weight.
    if kind == 'train':
        data = get_target(data)

    # Step 6: within-group normalisation.
    data = normalization(data)

    # Sort by "srch_id" so the row order matches the group order that the
    # ranking model expects, then rebuild the index.
    data = data.sort_values("srch_id")
    data = data.reset_index(drop=True)

    # Remove click/booking flags, which the test set does not have. Only the
    # columns that actually exist are dropped, so calling this function on the
    # test set no longer raises KeyError.
    label_source_cols = [c for c in ['click_bool', 'booking_bool'] if c in data.columns.values]
    data = data.drop(labels=label_source_cols, axis=1)
    print('--------Data Preprocessing Completed--------')
    return data

def get_categorical_index(df):
    """Return the positional indices of the categorical columns present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        ``df.columns.get_loc`` of each known categorical column that exists in
        ``df``, in the order the names are listed below.
    """
    known_categoricals = (
        "month",
        "quarter",
        "prop_country_id",
        "site_id",
        "visitor_location_country_id",
    )
    available = df.columns.values
    return [
        df.columns.get_loc(name)
        for name in known_categoricals
        if name in available
    ]

def remove_columns(x1, ignore_column=["srch_id", "prop_id", "position", "random_bool"]):
    """Return ``x1`` without the listed columns; names not in ``x1`` are skipped.

    Parameters
    ----------
    x1 : pandas.DataFrame
        Input frame; it is not modified.
    ignore_column : list of str
        Column names to remove when present.

    Returns
    -------
    pandas.DataFrame
        A new frame lacking the ignored columns.
    """
    # errors="ignore" skips labels that are not columns of x1.
    return x1.drop(columns=list(ignore_column), errors="ignore")

def split_train_data(data_for_training):
    """Split the processed training frame into train and validation parts.

    The first ~80% of rows become the training part and the rest the
    validation part. The cut is moved forward to the next ``srch_id`` boundary
    so that one search is never divided between the two parts; a row-exact
    80% cut could leave the same query group in both sets.

    Assumes rows of the same ``srch_id`` are contiguous and in ascending
    order (``train_data_preprocessing`` sorts by ``srch_id``), because the
    group sizes are returned in sorted-``srch_id`` order.

    Parameters
    ----------
    data_for_training : pandas.DataFrame
        Must contain ``srch_id`` and ``target``. Not modified.

    Returns
    -------
    tuple
        ``(x1, x2, y1, y2, groups, eval_groups)``: feature frames, label
        arrays and per-``srch_id`` row counts for train and validation.
    """
    n_rows = len(data_for_training)
    cut = int(n_rows * 0.8)

    # Advance the cut while it sits in the middle of a search.
    srch_ids = data_for_training["srch_id"].to_numpy()
    while 0 < cut < n_rows and srch_ids[cut] == srch_ids[cut - 1]:
        cut += 1

    x1 = data_for_training.iloc[:cut]
    x2 = data_for_training.iloc[cut:]
    # Copy the labels so they do not alias the caller's frame.
    y1 = x1['target'].to_numpy(copy=True)
    y2 = x2['target'].to_numpy(copy=True)
    x1 = x1.drop(labels='target', axis=1)
    x2 = x2.drop(labels='target', axis=1)

    # Number of rows per search, ordered by srch_id to match the row order.
    groups = x1["srch_id"].value_counts(sort=False).sort_index()
    eval_groups = x2["srch_id"].value_counts(sort=False).sort_index()

    x1 = remove_columns(x1)
    x2 = remove_columns(x2)

    return (x1, x2, y1, y2, groups, eval_groups)

def train_model(
    x1, x2, y1, y2, groups, eval_groups, lr, method, output_dir, name_of_model=None
):
    """Fit a LightGBM LambdaRank model and pickle it to ``<output_dir>/model.dat``.

    Parameters
    ----------
    x1, x2 : pandas.DataFrame
        Training and validation features.
    y1, y2 : array-like
        Training and validation labels.
    groups, eval_groups : array-like
        Row counts per query group for ``x1`` and ``x2``.
    lr : float
        Learning rate.
    method : str
        Currently unused: the ``boosting`` argument is disabled below.
    output_dir : str
        Directory that receives ``model.dat``; created if it does not exist.
    name_of_model : str, optional
        Accepted for backward compatibility; the output file name is always
        ``model.dat``.

    Returns
    -------
    lightgbm.LGBMRanker
        The fitted model.
    """
    # Create the output directory up front so a missing path fails (or is
    # fixed) before the long training run rather than after it.
    os.makedirs(output_dir, exist_ok=True)

    categorical_features_index = get_categorical_index(x1)
    clf = lightgbm.LGBMRanker(
        objective="lambdarank",
        n_estimators=5000,
        learning_rate=lr,
        random_state=69,
        seed=69,
        # boosting=method,
    )
    gc.collect()

    print("Training on train set with columns: {}".format(x1.columns.values))
    early_stopping_callback = early_stopping(stopping_rounds=200, verbose=True)
    log_evaluation_callback = log_evaluation(period=20)

    clf.fit(
        x1,
        y1,
        eval_set=[(x1, y1), (x2, y2)],
        eval_group=[groups, eval_groups],
        group=groups,
        eval_at=38,
        callbacks=[early_stopping_callback, log_evaluation_callback],
        categorical_feature=categorical_features_index,
    )
    gc.collect()

    # Use a context manager so the file is flushed and closed deterministically.
    with open(os.path.join(output_dir, "model.dat"), "wb") as model_file:
        pickle.dump(clf, model_file)
    return clf

def predict(name_of_model, test_data, output_dir):
    """Score ``test_data`` with the saved model and write ``submission.csv``.

    The model is read from ``<output_dir>/model.dat``. Rows are sorted by
    ``srch_id`` and ``prediction`` (both descending) and the
    ``srch_id``/``prop_id`` pairs are written to
    ``<output_dir>/submission.csv``.

    Parameters
    ----------
    name_of_model : str
        Currently unused; the model file name is always ``model.dat``.
    test_data : pandas.DataFrame
        Processed test frame containing ``srch_id`` and ``prop_id``.
    output_dir : str
        Directory holding ``model.dat`` and receiving ``submission.csv``.

    Returns
    -------
    None
    """
    gc.collect()

    # SECURITY: unpickling executes arbitrary code; only load model files that
    # this pipeline produced itself.
    with open(os.path.join(output_dir, "model.dat"), "rb") as model_file:
        model = pickle.load(model_file)

    # Take an explicit copy: a prediction column is added below, and writing
    # onto a slice of test_data triggers pandas' SettingWithCopyWarning.
    test_data_srch_id_prop_id = test_data[["srch_id", "prop_id"]].copy()

    test_data = remove_columns(test_data)

    categorical_features_numbers = get_categorical_index(test_data)

    print("Predicting on train set with columns: {}".format(test_data.columns.values))
    kwargs = {"categorical_feature": categorical_features_numbers}

    predictions = model.predict(test_data, **kwargs)
    test_data_srch_id_prop_id["prediction"] = predictions
    del test_data
    gc.collect()

    # Within each search, the highest predicted score comes first.
    test_data_srch_id_prop_id = test_data_srch_id_prop_id.sort_values(
        ["srch_id", "prediction"], ascending=False
    )
    print("Saving predictions into submission.csv")
    test_data_srch_id_prop_id[["srch_id", "prop_id"]].to_csv(
        os.path.join(output_dir, "submission.csv"), index=False
    )

In [3]:
# Read the raw training and test sets from CSV files in the working directory.
# NOTE(review): the load_data() helper defined earlier wraps pd.read_csv with
# progress messages but is not used here.
train_data = pd.read_csv('training_set_VU_DM.csv')
test_data = pd.read_csv('test_set_VU_DM.csv')

In [9]:
# Build the model-ready training frame.
# Caution: the pipeline's first step (merge_similar_data) drops the
# per-competitor columns from the frame it receives with inplace=True, so
# `train_data` itself is altered by this call. Re-run the loading cell before
# re-running this one.
train = train_data_preprocessing(train_data)

--------Preprocessing training data--------
--------Low value cols have been dropped---------
--------Data Preprocessing Completed--------


In [10]:
# Split by row order: the leading part (about 80% of rows) is used for training
# and the remaining tail for validation; groups/eval_groups hold the number of
# rows per srch_id in each part.
x1, x2, y1, y2, groups, eval_groups = split_train_data(train)

In [11]:
# Train the ranker with learning rate 0.12 and pickle it into the "model"
# directory. The "dart" argument is passed as `method`, but train_model has its
# `boosting=method` line commented out, so it has no effect on the model.
model = train_model(x1, x2, y1, y2, groups, eval_groups, 0.12, "dart", "model")

Training on train set with columns: ['site_id' 'visitor_location_country_id' 'prop_country_id'
 'prop_starrating' 'prop_review_score' 'prop_brand_bool'
 'prop_location_score1' 'prop_location_score2' 'prop_log_historical_price'
 'price_usd' 'promotion_flag' 'srch_destination_id' 'srch_length_of_stay'
 'srch_booking_window' 'srch_adults_count' 'srch_children_count'
 'srch_room_count' 'srch_saturday_night_bool' 'srch_query_affinity_score'
 'comp_rate' 'comp_inv' 'month' 'quarter' 'price_usd_diff'
 'prop_starrating_diff' 'prop_review_score_diff'
 'prop_location_score1_diff' 'prop_location_score2_diff'
 'price_usd_norm_by_srch_id' 'price_usd_norm_by_prop_id'
 'prop_starrating_norm_by_srch_id' 'prop_location_score2_norm_by_srch_id'
 'prop_location_score1_norm_by_srch_id'
 'prop_review_score_norm_by_srch_id']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno