In [38]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import gc
import time
from scipy.stats import pointbiserialr


In [48]:
def load_data(file_path):
    print('--------Loading original data from file: {}--------'.format(file_path))
    raw_data = pd.read_csv(file_path)
    print('--------Data loaded successfully---------')
    return raw_data
gc.collect()
train = pd.read_csv('training_set_VU_DM.csv')

In [3]:
def show_correlation(data, col1, col2):
    gc.collect()
    tmp = data.dropna(subset=[col1, col2])
    corr, p_value = pointbiserialr(tmp[col1], tmp[col2])
    print(f"Point-Biserial Correlation coefficient: {corr}, P-value: {p_value}", f"{col1} and {col2} ")

In [59]:
def get_target(data):
    target_col = 'target'
    conditions = [
            data["click_bool"] == 1,
            data["booking_bool"] == 1,
        ]
    choices = [1, 2]
    data[target_col] = np.select(conditions, choices, default=0)   
    return data
    

def merge_similar_data(data):
    comp_rate = data.filter(regex=r'^comp[12345678]_rate\d*$').mean(axis=1)
    comp_inv = data.filter(regex=r'^comp[12345678]_inv\d*$').mean(axis=1)
    comp_rate_percent_diff = data.filter(regex=r'^comp[12345678]_rate_percent_diff\d*$').mean(axis=1)
    data['comp_rate'] = comp_rate
    data['comp_inv'] = comp_inv
    data['comp_rate_percent_diff'] = comp_rate_percent_diff
    data.drop(labels = data.filter(regex=r'^comp[1-8]').columns, axis=1, inplace=True)
    return data

def drop_low_value_attributes(data):
    sparse_col = data.isnull().mean()[data.isnull().mean() > 0.1].index.to_list()
    # In the Point-biserial correlation coefficient calculation, the valuable_col shows a positive correlation or statistically significant with 'click_bool'. 
    # Raw and balanced data are used to calculate the correlation coefficient('click_bool' '1' : '0' == 50 : 2; '1' : '0' == 1 : 1).
    valuable_col = ['prop_review_score', 'prop_location_score2', 'srch_query_affinity_score', 'comp_rate', 'comp_inv']
    low_value_col = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'orig_destination_distance', 'comp_rate_percent_diff']
    data = data.drop(labels = [col for col in sparse_col if col not in valuable_col], axis = 1)
    return data

def add_date_features(data, datetime_key = 'date_time'):
    dates = pd.to_datetime(data[datetime_key])
    data['month'] = dates.dt.month
    data['quarter'] = dates.dt.quarter
    data = data.drop(labels=datetime_key, axis = 1)
    return data


def add_mean_position_features(data):
    prop_mean_position = data[data['random_bool'] == 0]
    prop_mean_position = prop_mean_position.groupby(["srch_destination_id", "prop_id"]).agg({"position": "mean"})
    prop_mean_position = prop_mean_position.rename(columns = {"position": "mean_position"}).reset_index()
    prop_mean_position['mean_position'] = prop_mean_position['mean_position'].astype(int) 
    data = data.merge(prop_mean_position, how = 'left', on = ["srch_destination_id", "prop_id"])

    # data['mean_position'] = 1 / data['mean_position']
    # data['position'] = 1 / data['position']
    return data

def add_diff_features(data, groupby_key, target_col):
    average_value_by_srch_id = data.groupby(groupby_key)[target_col].transform('mean')
    data[f'{target_col}_diff'] = data[target_col] - average_value_by_srch_id
    return data

def add_prop_pop_score(data):
    # 按 id 列分组，并计算 'click' 和 'book' 列的总和
    grouped = data.groupby('prop_id')[['click_bool', 'booking_bool']].sum()
    # 计算 'click' 和 'book' 列的总和，并乘以相应的权重
    grouped['weighted_sum'] = (grouped['booking_bool'] * 5 + grouped['click_bool'] * 1)
    # 计算每个 'id' 出现的次数
    id_counts = data['prop_id'].value_counts()
    id_counts.sort_index(inplace=True)
    # 将加权总和除以每个 'id' 出现的次数
    grouped['pop_score'] = grouped['weighted_sum'] / id_counts
    data['pop_score'] = data['prop_id'].map(grouped['pop_score'])
    return data
    

def train_data_preprocessing(data):
    print('--------Preprocessing training data--------')
    data = merge_similar_data(data)
    data = drop_low_value_attributes(data)
    print('--------Low value cols have been dropped---------')
    
    # feature engineering
    data = add_date_features(data)
    data = add_mean_position_features(data)

    # people will prefer to choose the cheaper/higher score props in each search
    # add new features based on expected visitor’s behavior
    prop_attr = ['price_usd', 'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2']
    for col in prop_attr:
        data = add_diff_features(data, 'srch_id', col)
    # add pop_score based on the percentage of booking and clicking
    data = add_prop_pop_score(data)
    
    data = get_target(data)


    # use pop_score to replace 'click_bool' and 'booking_bool', and drop the original columns
    # data = data.drop(labels=['click_bool', 'booking_bool'], axis=1)
    print('--------Data Preprocessing Completed--------')
    return data

In [None]:
def process_before_train(data):
    # 均值化数据 
    # 删除一些列
    return data

In [71]:
def split_train_data(data_for_training):
    sample = data_for_training[0:500000]

    x1 = sample.sample(frac=0.8, replace=False, random_state=42)
    x2 = sample.drop(x1.index)
    y1 = x1['target'].values
    y2 = x2['target'].values
    x1 = x1.drop(labels='target', axis=1)
    x2 = x2.drop(labels='target', axis=1)

    groups = x1["srch_id"].value_counts(sort=False).sort_index()
    eval_groups = x2["srch_id"].value_counts(sort=False).sort_index()
    len(eval_groups), len(x2), len(x1), len(groups)

    return (x1, x2, y1, y2, groups, eval_groups)

In [77]:
df = train_data_preprocessing(train)

--------Preprocessing training data--------
--------Low value cols have been dropped---------
--------Data Preprocessing Completed--------


In [73]:
x1, x2, y1, y2, groups, eval_groups = split_train_data(df)

In [78]:
df

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,month,quarter,mean_position,price_usd_diff,prop_starrating_diff,prop_review_score_diff,prop_location_score1_diff,prop_location_score2_diff,pop_score,target
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,4,2,26.0,-58.948929,-0.071429,0.017857,0.530357,-0.005192,0.107843,0
1,1,12,187,219,10404,4,4.0,1,2.20,0.0149,...,4,2,24.0,7.021071,0.928571,0.517857,-0.099643,-0.034092,0.102916,0
2,1,12,187,219,21315,3,4.5,1,2.20,0.0245,...,4,2,22.0,16.081071,-0.071429,1.017857,-0.099643,-0.024492,0.027223,0
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,4,2,30.0,439.051071,-1.071429,0.517857,0.530357,-0.036492,0.089130,0
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,4,2,6.0,-20.138929,0.928571,0.017857,0.340357,0.075108,0.231579,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,5,219,219,77700,3,4.0,1,1.61,0.0471,...,6,2,8.0,15.500000,0.000000,0.666667,-0.041667,-0.023325,0.276596,0
4958343,332785,5,219,219,88083,3,4.0,1,1.95,0.1520,...,6,2,7.0,-13.500000,0.000000,0.666667,0.298333,0.081575,0.973684,0
4958344,332785,5,219,219,94508,3,3.5,1,1.10,0.0164,...,6,2,27.0,-3.500000,0.000000,0.166667,-0.551667,-0.054025,0.023256,0
4958345,332785,5,219,219,128360,3,5.0,1,1.95,0.0662,...,6,2,5.0,36.500000,0.000000,1.666667,0.298333,-0.004225,0.513514,1


In [45]:
random_seed = 42
random1 = df.sample(n=20, replace=False, random_state=random_seed)
random2 = y.sample(n=20, replace=False, random_state=random_seed)
print(random1['target'], random2)

AttributeError: 'numpy.ndarray' object has no attribute 'sample'

In [None]:
def run(train_csv):
    name_of_model = str(int(time.time()))

    training_data = load_data(train_csv)
    training_data, y = train_data_preprocessing(training_data)

    model = train_model(
        x1, x2, y1, y2, groups, eval_groups, lr, method, output_dir, name_of_model
    )

In [None]:
train_csv = 'training_set_VU_DM.csv'

In [77]:
# 假设 df 是你的数据集，'binary_target' 是二元分类目标列，'numeric_feature' 是数值特征列   接近0，表明几乎无相关性  远高于0.05，说明相关性统计上不显著  0为非常显著
for col in sparse_col:
    tmp = df.dropna(subset=[col])
    corr, p_value = pointbiserialr(tmp[col], tmp['click_bool'])
    print(f"Point-Biserial Correlation coefficient: {corr}, P-value: {p_value}", f"rows of {col}: ", len(tmp))

Point-Biserial Correlation coefficient: 0.0025406329466251686, P-value: 0.20229359422592888 rows of visitor_hist_starrating:  251866
Point-Biserial Correlation coefficient: 0.0038583051537975873, P-value: 0.052301402422990625 rows of visitor_hist_adr_usd:  252988
Point-Biserial Correlation coefficient: 0.023423735769677913, P-value: 0.0 rows of prop_review_score:  4950983
Point-Biserial Correlation coefficient: 0.07380689212761574, P-value: 0.0 rows of prop_location_score2:  3867999
Point-Biserial Correlation coefficient: 0.0417642671842636, P-value: 1.6019829618070852e-122 rows of srch_query_affinity_score:  317406
Point-Biserial Correlation coefficient: 0.0022874264083852978, P-value: 2.8262262480095217e-05 rows of orig_destination_distance:  3350565
Point-Biserial Correlation coefficient: nan, P-value: nan rows of gross_bookings_usd:  138390




Point-Biserial Correlation coefficient: 0.023587286274157557, P-value: 0.0 rows of comp_rate:  3241539
Point-Biserial Correlation coefficient: -0.003432931863442141, P-value: 3.2671789280871503e-10 rows of comp_inv:  3352374
Point-Biserial Correlation coefficient: 0.000252412085076082, P-value: 0.7509962361192315 rows of comp_rate_percent_diff:  1580490


In [33]:
ones = df[(df['target'] == 1) | (df['target'] == 2)]
zeros =  df[df['target'] == 0].sample(n=len(ones), random_state=42)
result = pd.concat([ones, zeros])
for col in df.columns:
    tmp = result.dropna(subset=[col])
    corr, p_value = pointbiserialr(tmp[col], tmp['target'])
    print(f"Point-Biserial Correlation coefficient: {corr}, P-value: {p_value}", f"rows of {col}: ", len(tmp))

Point-Biserial Correlation coefficient: -6.666119517446598e-05, P-value: 0.9645805087810249 rows of srch_id:  443758
Point-Biserial Correlation coefficient: 0.021741914902619116, P-value: 1.5053983240919704e-47 rows of site_id:  443758
Point-Biserial Correlation coefficient: -0.010193061096830432, P-value: 1.1191490872184585e-11 rows of visitor_location_country_id:  443758
Point-Biserial Correlation coefficient: -0.013902886400740685, P-value: 2.009165623041335e-20 rows of prop_country_id:  443758
Point-Biserial Correlation coefficient: -0.0001231490383373367, P-value: 0.9346183227236535 rows of prop_id:  443758
Point-Biserial Correlation coefficient: 0.07752232439098877, P-value: 0.0 rows of prop_starrating:  443758
Point-Biserial Correlation coefficient: 0.061526746849709296, P-value: 0.0 rows of prop_review_score:  443155
Point-Biserial Correlation coefficient: 0.003388892148485753, P-value: 0.02397584049620096 rows of prop_brand_bool:  443758
Point-Biserial Correlation coefficient:

Highly Significant Correlations: ['position', 'mean_position', 'pop_score']
Moderate Correlations: ['prop_location_score2_diff', 'prop_starrating_diff', 'prop_review_score_diff']
Statistically Significant: ['comp_rate', 'comp_inv']
Low or Insignificant Correlations: ['srch_id', 'prop_id', 'month', 'quarter']