# Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score


# Reduce File Size into chunks

In [None]:
# Disabled one-off preprocessing step, kept as a bare string literal so it never
# executes: it reads the full training CSV and writes it back out as consecutive
# chunk_<n>.csv files of `chunk_size` rows each.
"""
# Load the large CSV file
file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Data/training_set_VU_DM.csv'
data = pd.read_csv(file_path)

# Define the size of each chunk
chunk_size = 500000  # This number can change 

# Split the CSV into chunks
for i in range(0, len(data), chunk_size):
    chunk = data.iloc[i:i + chunk_size]
    chunk.to_csv(f'chunk_{i//chunk_size}.csv', index=False)

"""

In [None]:
# Read the first chunk and convert its `date_time` column to real timestamps.
df = pd.read_csv("chunk_0.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'])
)
df.head()

In [None]:
# Report how many columns the loaded chunk has, then display their names.
print(len(df.columns))

df.columns

# Data cleaning

Parts still to be added

1. If a visitor only browses and never clicks, is that visitor's data meaningless for ranking?

## Remove missing values

In [None]:
# Load the already-cleaned version of chunk 0 from disk.
file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/chunk_0_cleaned.csv'

df_cleaned_chunk0 = pd.read_csv(file_path)

In [None]:
# Show the first rows of the cleaned chunk as a quick sanity check.
print(df_cleaned_chunk0.head())

## Classify according to different variable types

In [None]:
# Hand-made grouping of the data set's columns by the kind of information they carry.
Criteria_in_the_search_query = [
    'date_time',
    'site_id',
    'srch_destination_id',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
]

Static_hotel_characteristics = [
    'prop_country_id',
    'prop_id',
    'prop_starrating',
    'prop_review_score',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    'srch_query_affinity_score',
    'orig_destination_distance',
]

Dynamic_hotel_characteristics = [
    'random_bool',
    'position',
    'price_usd',
    'promotion_flag',
    'click_bool',
    'gross_bookings_usd',
    'booking_bool',
]

Visitor_information = [
    'srch_id',
    'visitor_location_country_id',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
]

# comp1_rate, comp1_inv, comp1_rate_percent_diff, ..., comp8_rate_percent_diff
Competitive_information = [
    f'comp{competitor}_{suffix}'
    for competitor in range(1, 9)
    for suffix in ('rate', 'inv', 'rate_percent_diff')
]

Other = []

all_catogories = [
    Criteria_in_the_search_query,
    Static_hotel_characteristics,
    Dynamic_hotel_characteristics,
    Visitor_information,
    Competitive_information,
    Other,
]

# Flatten the groups into one list of column names.
all_variables = [name for category in all_catogories for name in category]

df_variables = df.columns.tolist()

# Columns that appear on only one side: either not categorised yet, or categorised
# but absent from the DataFrame.
differences = set(all_variables) ^ set(df_variables)

if not differences:
    print('All catogorized')
else:
    print(f'Differences between df_variables and catogorized variables: {differences}')


### Criteria_in_the_search_query

In [None]:
# List the columns that were categorised as search-query criteria.
print(Criteria_in_the_search_query)

# Model lambdarank

## Basic model (without parameter tuning or feature engineering)

In [None]:
# Load the training data for the baseline model. The active path is the cleaned
# chunk 0; the commented-out paths are alternative inputs (judging by their file
# names: the cleaned full training set and the raw training set).
file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/chunk_0_cleaned.csv'

#file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/cleaned_training_set_VU_DM.csv'
#file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/training_set_VU_DM.csv'
df = pd.read_csv(file_path)

# Add a column holding the computed relevance score
def assign_scores(row):
    """Return the relevance grade for one search-result row.

    The grade is 5 when ``row['booking_bool'] == 1``, otherwise 1 when
    ``row['click_bool'] == 1``, otherwise 0.
    """
    if row['booking_bool'] == 1:
        return 5
    return 1 if row['click_bool'] == 1 else 0

# Relevance label per row: 5 = booked, 1 = clicked only, 0 = neither.
# Vectorised equivalent of df.apply(assign_scores, axis=1), which is very slow
# on hundreds of thousands of rows.
df['score'] = np.select(
    [df['booking_bool'] == 1, df['click_bool'] == 1],
    [5, 1],
    default=0,
)

# NOTE(review): 'srch_id' and 'prop_id' are identifiers; confirm they are meant
# to be used as model features.
features = ['site_id','srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool','prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price',

       'srch_query_affinity_score','orig_destination_distance','random_bool', 
       'price_usd', 'promotion_flag','srch_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd','comp1_rate', 

       'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff']

# Split the data set on whole searches: every row of a given srch_id goes to the
# same side. (A row-level split stratified on srch_id scatters each search over
# both sets, so the model is validated on queries it has already seen.)
train_ids, test_ids = train_test_split(df['srch_id'].unique(), test_size=0.2, random_state=42)

# LightGBM reads `group` as consecutive block sizes, so rows of one search must be
# contiguous and in the same order as the sizes. A stable sort on srch_id keeps
# the original within-search row order.
train_df = df[df['srch_id'].isin(train_ids)].sort_values('srch_id', kind='stable')
test_df = df[df['srch_id'].isin(test_ids)].sort_values('srch_id', kind='stable')

# Build the LightGBM data structures; group sizes come out in ascending srch_id
# order, matching the sorted rows.
train_data = lgb.Dataset(train_df[features], label=train_df['score'],
                         group=train_df.groupby('srch_id').size().to_numpy())
test_data = lgb.Dataset(test_df[features], label=test_df['score'],
                        group=test_df.groupby('srch_id').size().to_numpy())

# Model parameters
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [3, 5],
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbose': -1
}

# Train the model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])

# Predict on the held-out searches
test_pred = bst.predict(test_df[features])
test_df['predictions'] = test_pred

# Order each search by descending predicted score
test_df = test_df.sort_values(['srch_id', 'predictions'], ascending=[True, False])

# NDCG@5 per search, averaged over the searches that contain more than one
# distinct relevance grade (others carry no ranking signal and are skipped).
grouped = test_df.groupby('srch_id')
ndcg_scores = []

for name, group in grouped:
    true_relevance = group['score'].values
    scores_pred = group['predictions'].values
    if len(np.unique(true_relevance)) > 1:
        ndcg_scores.append(ndcg_score([true_relevance], [scores_pred], k=5))

# Guard against an empty list so no RuntimeWarning is raised when nothing qualified.
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
print(f"Average NDCG Score: {average_ndcg}")


### Result output

In [None]:
# Stream the test set through an iterator instead of loading it in one go.
chunk_size = 10000
reader = pd.read_csv('C:/Users/98398/Desktop/P5/DM-AS2/Data/test_set_VU_DM.csv', chunksize=chunk_size)

# Score every chunk with the trained model and keep only the identifier columns
# next to the prediction. (Pre-processing such as NA filling could be added here.)
predictions = [
    chunk[['srch_id', 'prop_id']].assign(predictions=bst.predict(chunk[features]))
    for chunk in reader
]

# Stack the per-chunk results into one frame.
final_predictions = pd.concat(predictions)

In [None]:
# Within each search, order properties from highest to lowest predicted score
# (the ordering expected for the Kaggle submission), then keep only the two
# identifier columns.
final_predictions = final_predictions.sort_values(
    by=['srch_id', 'predictions'], ascending=[True, False]
)[['srch_id', 'prop_id']]

# Write the submission without the index but with a header row.
final_predictions.to_csv('train=all_cleaned.csv', header=True, index=False)

In [None]:
# Display the final (srch_id, prop_id) ranking that was written to disk.
print(final_predictions)

### All train data

In [None]:
# Score the complete raw training file with the trained model.
# NOTE(review): this file presumably includes the rows the model was fitted on,
# so the resulting NDCG is likely optimistic -- confirm before reporting it.
All_test_df = pd.read_csv('C:/Users/98398/Desktop/P5/DM-AS2/Data/training_set_VU_DM.csv')

All_test_pred = bst.predict(All_test_df[features])
All_test_df['predictions'] = All_test_pred

# Relevance label: 5 = booked, 1 = clicked only, 0 = neither. Vectorised
# equivalent of All_test_df.apply(assign_scores, axis=1); the row-wise apply is
# very slow on the full file.
All_test_df['score'] = np.select(
    [All_test_df['booking_bool'] == 1, All_test_df['click_bool'] == 1],
    [5, 1],
    default=0,
)

# Order each search by descending predicted score.
All_test_df.sort_values(['srch_id', 'predictions'], ascending=[True, False], inplace=True)

# NDCG@5 per search. ndcg_score ranks by the predicted scores itself, so no
# additional per-group sort is needed.
grouped = All_test_df.groupby('srch_id')
ndcg_scores = []

for name, group in grouped:
    true_relevance = group['score'].values
    scores_pred = group['predictions'].values
    # Only searches with more than one distinct relevance grade are scored.
    if len(np.unique(true_relevance)) > 1:
        ndcg_scores.append(ndcg_score([true_relevance], [scores_pred], k=5))

# Mean NDCG over the scored searches (NaN when none qualified).
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
print(f"Average NDCG Score: {average_ndcg}")


## Use all cleaned data without feature engineering

## With feature engineering by Menghan 

### Model build

In [None]:
def assign_scores(row):
    """Map one row to its relevance grade: 5 if booked, 1 if only clicked, else 0.

    ``row`` must support item access for ``'booking_bool'`` and, when the row is
    not a booking, ``'click_bool'``.
    """
    if row['booking_bool'] == 1:
        return 5
    if row['click_bool'] == 1:
        return 1
    return 0

In [None]:
# Read feature_engineered_training_chunk_{i}.csv for i = 0..9 and stack them
# vertically into a single DataFrame.
base_path = '/Users/eva/Documents/Study/Y1S2/DMT/assignment2/'
file_pattern = 'feature_engineered_training_chunk_{}.csv'

# Collect all chunks first and concatenate once: calling pd.concat inside the
# loop re-copies everything accumulated so far on every iteration.
chunks = [pd.read_csv(base_path + file_pattern.format(i)) for i in range(10)]
df = pd.concat(chunks, axis=0)
del chunks  # release the per-chunk frames before further processing

# Relevance label: 5 = booked, 1 = clicked only, 0 = neither. Vectorised
# equivalent of applying assign_scores row by row. np.select returns a plain
# array, so the assignment is positional and unaffected by the repeated index
# values left over from the individual chunks.
df['score'] = np.select(
    [df['booking_bool'] == 1, df['click_bool'] == 1],
    [5, 1],
    default=0,
)
df.head()


In [None]:

#file_path = '/Users/eva/Documents/Study/Y1S2/DMT/assignment2/feature_engineered_training_set_VU_DM.csv'

#file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/cleaned_training_set_VU_DM.csv'
#file_path = 'C:/Users/98398/Desktop/P5/DM-AS2/Test_data/training_set_VU_DM.csv'
#df = pd.read_csv(file_path)

# Add a column holding the computed relevance score (done while loading the chunks above)




In [None]:
columns = df.columns

# Keep every column as a model feature except the timestamp, the target-related
# columns, and anything whose name contains 'gross_bookings_usd' or 'position'.
# NOTE(review): 'srch_id' stays in the feature list here, while the later tuning
# cells exclude it -- confirm which is intended.
excluded_names = {'date_time', 'position', 'click_bool', 'booking_bool', 'score'}
excluded_fragments = ('gross_bookings_usd', 'position')

features = [
    name
    for name in columns
    if name not in excluded_names
    and not any(fragment in name for fragment in excluded_fragments)
]

# Number of selected features versus total number of columns.
print(len(features))
print(len(df.columns))

In [None]:
# Inspect missing values: count the NA entries of every column.
na_counts = df.isna().sum()

# Print the columns that contain NA values together with their NA counts.
print("Columns with NA values and their counts:")
for col, count in na_counts[na_counts > 0].items():
    print(f"{col}: {count}")

# Drop every column that contains at least one NA value.
df = df.dropna(axis=1)

# `features` was built from the column list before this drop; remove the names
# of dropped columns so that later df[features] selections do not raise KeyError.
features = [col for col in features if col in df.columns]


### Default Params

In [None]:
# Baseline LightGBM parameters for the lambdarank models tuned below.
default_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'max_depth': 6,
    'num_leaves': 40,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'min_child_samples': 21,
    'min_child_weight': 0.001,
    # A bagging fraction of 1 uses every row, so bagging_freq has no practical
    # effect with this value.
    'bagging_fraction': 1,
    'bagging_freq': 2,
    'reg_alpha': 0.001,
    'reg_lambda': 8,
    'cat_smooth': 0,
    'num_iterations': 200,
    # 'is_unbalance' was removed: LightGBM documents it as used only by the binary
    # and multiclassova objectives, so it did nothing for lambdarank.
}

### Tune max_depth and num_leaves on whole df

In [None]:

import lightgbm as lgb
import pandas as pd
import itertools
from tqdm import tqdm
import multiprocessing as mp
from functools import partial   
    

def function_to_get_data(df):
    """Build a query-consistent train/validation split from a 30% sample of searches.

    Sampling and splitting are done on ``srch_id`` values rather than on rows, so
    every search is kept whole and lands on exactly one side of the split. Both
    parts are sorted by ``srch_id`` so the returned group sizes line up with the
    row order, which LightGBM requires for ranking.

    Reads the module-level ``features`` list.

    Args:
        df: DataFrame containing ``srch_id``, ``score`` and all feature columns.

    Returns:
        Tuple ``(x_train, x_val, y_train, y_val, group_qids_train, group_qids_val)``
        where the two group objects are Series of rows-per-search indexed by
        ascending ``srch_id``.
    """
    # Sample whole searches; sampling individual rows would tear result lists apart.
    search_ids = pd.Series(df['srch_id'].unique())
    sampled_ids = search_ids.sample(frac=0.3, random_state=42)

    # The original passed stratify=df['srch_id'], whose length differs from the
    # sampled frame and makes train_test_split raise ValueError.
    train_ids, val_ids = train_test_split(sampled_ids, test_size=0.2, random_state=42)

    # A stable sort keeps the original row order inside each search.
    train_df = df[df['srch_id'].isin(train_ids)].sort_values('srch_id', kind='stable')
    test_df = df[df['srch_id'].isin(val_ids)].sort_values('srch_id', kind='stable')

    x_train = train_df[features]
    x_val = test_df[features]
    y_train = train_df['score']
    y_val = test_df['score']

    # Ranking needs the size of every query group, in the same order as the rows.
    group_qids_train = train_df.groupby('srch_id').size()
    group_qids_val = test_df.groupby('srch_id').size()
    return x_train, x_val, y_train, y_val, group_qids_train, group_qids_val
    
def get_experience(x_train, x_val, y_train, y_val, group_qids_train, group_qids_val, ranking_param_grid):
    """Fit one LGBMRanker for a single hyper-parameter combination.

    Args:
        x_train, y_train, group_qids_train: training features, labels and
            per-query group sizes.
        x_val, y_val, group_qids_val: validation features, labels and group sizes.
        ranking_param_grid: sequence whose first five items are, in order,
            ``learning_rate``, ``n_estimators``, ``num_leaves``, ``max_depth``
            and ``boosting_type``.

    Returns:
        The fitted ``lgb.LGBMRanker``; its ``best_score_`` holds NDCG@10/50/100
        on the validation set.
    """
    learning_rate, n_estimators, num_leaves, max_depth, boosting_type = ranking_param_grid[:5]
    params = {
        'learning_rate': learning_rate,
        'n_estimators': n_estimators,
        'num_leaves': num_leaves,
        'max_depth': max_depth,
        'boosting_type': boosting_type,
    }
    gbm = lgb.LGBMRanker(n_jobs=24, **params)

    # eval_at=[10, 50, 100] requests NDCG@10, NDCG@50 and NDCG@100.
    # Early stopping and log silencing go through callbacks: the
    # early_stopping_rounds / verbose keyword arguments of fit() were removed in
    # LightGBM 4, whereas callbacks work on both 3.3+ and 4.x.
    gbm.fit(
        x_train, y_train, group=group_qids_train,
        eval_set=[(x_val, y_val)], eval_group=[group_qids_val],
        eval_metric='ndcg', eval_at=[10, 50, 100],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    return gbm

def fit_lgbm(df):
    """Train Light GBM model: exhaustive grid search over LGBMRanker settings.

    Every combination of the grid below is fitted in a worker process via
    ``get_experience`` and compared on the third validation metric reported by
    the model (the last ``eval_at`` cut-off).

    Args:
        df: DataFrame passed on to ``function_to_get_data``.

    Returns:
        List with the parameter dict(s) of the best-scoring model(s); it holds
        more than one entry only when several combinations tie for the best score.
    """
    ranking_param_grid = {
        'learning_rate': np.arange(0.06, 0.14, 0.02),
        'n_estimators': np.arange(100, 500, 100),
        'num_leaves': range(25, 35, 1),
        'max_depth': [-1, 50, 100, 200],
        'boosting_type': ['gbdt', 'goss']
    }

    x_train, x_val, y_train, y_val, group_qids_train, group_qids_val = function_to_get_data(df)

    # Order must match the positional unpacking done in get_experience.
    list_params = list(itertools.product(ranking_param_grid['learning_rate'],
                                         ranking_param_grid['n_estimators'],
                                         ranking_param_grid['num_leaves'],
                                         ranking_param_grid['max_depth'],
                                         ranking_param_grid['boosting_type']))

    get_experience_each = partial(
        get_experience,
        x_train,
        x_val,
        y_train,
        y_val,
        group_qids_train,
        group_qids_val
    )

    # (score, params) pairs. A list is used instead of a dict keyed by score so
    # that two configurations with the same score do not overwrite each other.
    results = []

    # Multi processing. The context manager shuts the pool down even when a worker
    # raises, and the worker count is capped at the number of available cores.
    # NOTE(review): each worker additionally starts n_jobs=24 LightGBM threads
    # (see get_experience), which may oversubscribe the machine.
    with mp.Pool(processes=min(32, mp.cpu_count())) as process_pool:
        fitted_models = process_pool.imap(get_experience_each, list_params)
        for each_item in tqdm(fitted_models, total=len(list_params)):
            valid_scores = list(each_item.best_score_['valid_0'].values())
            results.append((valid_scores[2], each_item.get_params()))

    best_score = max(score for score, _ in results)
    best_params = [params for score, params in results if score == best_score]
    print(best_params)

    return best_params

best_params = fit_lgbm(df)

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.metrics import make_scorer, ndcg_score

# Hold out 20% of the searches. The split is done on srch_id values so every
# search stays whole, and both parts are sorted by srch_id (stable sort) so that
# rows of one search are contiguous, as LightGBM's ranking API requires.
train_ids, test_ids = train_test_split(df['srch_id'].unique(), test_size=0.2, random_state=42)
train_df = df[df['srch_id'].isin(train_ids)].sort_values('srch_id', kind='stable')
test_df = df[df['srch_id'].isin(test_ids)].sort_values('srch_id', kind='stable')

# Tune on 30% of the training searches (whole searches, not individual rows).
sampled_ids = pd.Series(train_ids).sample(frac=0.3, random_state=42)
sample_df = train_df[train_df['srch_id'].isin(sampled_ids)]
features = [col for col in sample_df.columns if col not in ['date_time', 'position', 'click_bool', 'booking_bool', 'score', 'srch_id'] and 'gross_bookings_usd' not in col and 'position' not in col]


def cross_validated_ndcg(params, frame, feature_names, splitter, k=5):
    """Return the mean NDCG@k of an LGBMRanker over query-grouped CV folds.

    GridSearchCV cannot be used for this: it does not forward the per-fold
    ``group`` sizes to ``LGBMRanker.fit`` and its probability-based scorers need
    a ``predict_proba`` method the ranker does not have.

    Args:
        params: keyword arguments for ``lgb.LGBMRanker``.
        frame: DataFrame sorted by ``srch_id`` holding ``srch_id``, ``score`` and
            the feature columns.
        feature_names: columns used as model input.
        splitter: CV splitter accepting ``groups`` (e.g. ``GroupKFold``).
        k: NDCG cut-off.

    Returns:
        Mean of the per-fold mean NDCG@k, or NaN when no search could be scored.
    """
    fold_scores = []
    for fit_idx, val_idx in splitter.split(frame, groups=frame['srch_id']):
        # The fold indices are ascending, so rows of a search stay contiguous.
        fit_part, val_part = frame.iloc[fit_idx], frame.iloc[val_idx]
        ranker = lgb.LGBMRanker(**params)
        ranker.fit(
            fit_part[feature_names],
            fit_part['score'],
            group=fit_part.groupby('srch_id', sort=False).size().to_numpy(),
        )
        scored = val_part[['srch_id', 'score']].assign(
            pred=ranker.predict(val_part[feature_names])
        )
        # Only searches with more than one distinct relevance grade are scored.
        per_query = [
            ndcg_score([query['score'].to_numpy()], [query['pred'].to_numpy()], k=k)
            for _, query in scored.groupby('srch_id')
            if query['score'].nunique() > 1
        ]
        if per_query:
            fold_scores.append(np.mean(per_query))
    return float(np.mean(fold_scores)) if fold_scores else float('nan')


# GroupKFold guarantees that a query id never appears in both the fitting and
# the validation part of a fold.
gkf = GroupKFold(n_splits=5)

param_grid = {
    'max_depth': [3, 4, 5],
    'num_leaves': [31, 41, 51]
}

# Run the grid search
search_results = []
for max_depth in param_grid['max_depth']:
    for num_leaves in param_grid['num_leaves']:
        candidate = {'max_depth': max_depth, 'num_leaves': num_leaves}
        candidate_score = cross_validated_ndcg({**default_params, **candidate}, sample_df, features, gkf)
        print(f"{candidate}: NDCG@5 = {candidate_score}")
        search_results.append((candidate_score, candidate))

# Report the best parameters and score
best_score, best_candidate = max(search_results, key=lambda item: item[0])
print("Best parameters found: ", best_candidate)
print("Best NDCG score: ", best_score)

# Start from default_params and let the tuned values take precedence (the
# previous best_params.update(default_params) overwrote the tuned values).
best_params = {**default_params, **best_candidate}
print("Best parameters:", best_params)

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import randint, uniform

from sklearn.model_selection import ParameterSampler

# Prepare the data: hold out 20% of the searches (split on srch_id so searches
# stay whole) and sort by srch_id so rows of one search are contiguous.
train_ids, test_ids = train_test_split(df['srch_id'].unique(), test_size=0.2, random_state=42)
train_df = df[df['srch_id'].isin(train_ids)].sort_values('srch_id', kind='stable')
test_df = df[df['srch_id'].isin(test_ids)].sort_values('srch_id', kind='stable')

# Tune on 20% of the training searches.
sampled_ids = pd.Series(train_ids).sample(frac=0.2, random_state=42)
sample_df = train_df[train_df['srch_id'].isin(sampled_ids)]
features = [col for col in sample_df.columns if col not in ['date_time', 'position', 'click_bool', 'booking_bool', 'score', 'srch_id'] and 'gross_bookings_usd' not in col and 'position' not in col]
groups = sample_df['srch_id']

# LightGBM parameter grid from which random combinations are drawn
param_grid = {
    'learning_rate': [0.1],  # can be widened, e.g. [0.05, 0.1, 0.2]
    'n_estimators': [100],  # likewise, add more choices when needed
    'max_depth': [3, 4, 5],
    'num_leaves': [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31],
    'min_child_samples': [30, 40, 50],
    'min_child_weight': [0.01],  # single value; add more to test alternatives
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0.01, 0.1, 1, 10, 100],  # explicit values on a log scale
    'reg_lambda': [0.01, 0.1, 1, 10, 100]  # same as above
}

# Settings shared by every candidate model. subsample_freq=1 is required for the
# sampled `subsample` values to have any effect: LightGBM only performs bagging
# when the bagging frequency is positive.
base_model_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'verbose': -1,
    'random_state': 42,
    'subsample_freq': 1,
}


def cross_validated_ndcg(params, frame, feature_names, splitter, k=5):
    """Return the mean NDCG@k of an LGBMRanker over query-grouped CV folds.

    RandomizedSearchCV cannot be used for this: it does not forward the per-fold
    ``group`` sizes to ``LGBMRanker.fit`` and its probability-based scorers need
    a ``predict_proba`` method the ranker does not have.

    Args:
        params: keyword arguments for ``lgb.LGBMRanker``.
        frame: DataFrame sorted by ``srch_id`` holding ``srch_id``, ``score`` and
            the feature columns.
        feature_names: columns used as model input.
        splitter: CV splitter accepting ``groups`` (e.g. ``GroupKFold``).
        k: NDCG cut-off.

    Returns:
        Mean of the per-fold mean NDCG@k, or NaN when no search could be scored.
    """
    fold_scores = []
    for fit_idx, val_idx in splitter.split(frame, groups=frame['srch_id']):
        # The fold indices are ascending, so rows of a search stay contiguous.
        fit_part, val_part = frame.iloc[fit_idx], frame.iloc[val_idx]
        ranker = lgb.LGBMRanker(**params)
        ranker.fit(
            fit_part[feature_names],
            fit_part['score'],
            group=fit_part.groupby('srch_id', sort=False).size().to_numpy(),
        )
        scored = val_part[['srch_id', 'score']].assign(
            pred=ranker.predict(val_part[feature_names])
        )
        # Only searches with more than one distinct relevance grade are scored.
        per_query = [
            ndcg_score([query['score'].to_numpy()], [query['pred'].to_numpy()], k=k)
            for _, query in scored.groupby('srch_id')
            if query['score'].nunique() > 1
        ]
        if per_query:
            fold_scores.append(np.mean(per_query))
    return float(np.mean(fold_scores)) if fold_scores else float('nan')


# GroupKFold guarantees that a query id never appears in both the fitting and
# the validation part of a fold.
gkf = GroupKFold(n_splits=5)

# Random search: evaluate 100 randomly drawn parameter combinations.
search_results = []
for candidate in ParameterSampler(param_grid, n_iter=100, random_state=42):
    candidate_score = cross_validated_ndcg({**base_model_params, **candidate}, sample_df, features, gkf)
    print(f"{candidate}: NDCG@5 = {candidate_score}")
    search_results.append((candidate_score, candidate))

# Report the best parameters and score
best_score, best_candidate = max(search_results, key=lambda item: item[0])
print("Best parameters found: ", best_candidate)
print("Best NDCG score: ", best_score)

# Parameters to retrain the final model with
best_params = dict(best_candidate)
best_params.update({'objective': 'lambdarank', 'metric': 'ndcg'})


In [None]:
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from timeit import default_timer as timer
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold

import csv

import os
import sys


In [None]:
class LGBBO:
    """Hyperopt (TPE) tuner for a binary LightGBM classifier with pre-assigned CV folds.

    The data passed to ``load_data`` must contain one column ``cv<k>`` per fold
    ``k`` whose values are ``'train'`` or ``'valid'``. Every evaluated parameter
    set is appended as one row to the CSV file at ``fp_path``.

    The model uses the ``cross_entropy`` objective and is scored with ROC-AUC and
    the KS statistic, so the label column must be binary.
    """

    # Header row of the result log; `objective` writes its rows in this order.
    _LOG_COLUMNS = ['loss', 'train_auc', 'valid_auc', 'train_ks', 'valid_ks',
                    'lst_train_auc', 'lst_valid_auc', 'lst_train_ks', 'lst_valid_ks',
                    'params', 'iteration', 'train_time']

    def __init__(self, fp_path, **kwargs):
        """Create the tuner and start a fresh result log.

        Args:
            fp_path: path of the CSV log; an existing file is overwritten.
            **kwargs: optional ``kfold`` (number of ``cv<k>`` columns, default 3)
                and ``n_estimators`` (boosting-round cap, default 800).
        """
        self.fp_path = fp_path
        self.iter = 0
        self.train_set = None
        self.trials = None  # hyperopt Trials of the most recent optimize() call

        self.kfold = kwargs.get('kfold', 3)
        self.n_estimators = kwargs.get('n_estimators', 800)

        # newline='' is what the csv module asks for; without it every row is
        # followed by a blank line on Windows.
        with open(self.fp_path, 'w', newline='') as csv_conn:
            csv.writer(csv_conn).writerow(self._LOG_COLUMNS)

    def load_data(self, df_data, feature_list, label):
        """Register the data set, the feature columns and the label column name."""
        self.df_data = df_data.reset_index(drop=True)
        self.feature_list = feature_list
        self.label = label

    @staticmethod
    def _eval_ks(ytrue, yprob):
        """Return the KS statistic: the largest gap between TPR and FPR."""
        fpr, tpr, _ = roc_curve(ytrue, yprob)
        return max(tpr - fpr)

    @staticmethod
    def _eval_auc(ytrue, yprob):
        """Return the area under the ROC curve."""
        return roc_auc_score(ytrue, yprob)

    def objective(self, params):
        """Evaluate one hyper-parameter set over all folds (hyperopt objective).

        Args:
            params: keyword arguments forwarded to ``lgb.LGBMClassifier``.

        Returns:
            Dict with ``loss`` (negative mean validation KS), the averaged and
            per-fold AUC/KS values, the parameters, the iteration number, the
            run time and ``status``.
        """
        self.iter += 1
        start = timer()
        # verbose=-1 replaces the former silent=True flag, which LightGBM 4 no
        # longer accepts.
        model = lgb.LGBMClassifier(**params,
                                   learning_rate=0.1,
                                   min_child_samples=20000,
                                   objective='cross_entropy',
                                   importance_type='gain',
                                   class_weight='balanced',
                                   boosting_type='gbdt', n_estimators=self.n_estimators,
                                   verbose=-1, n_jobs=1, random_state=0
                                   )

        lst_train_auc, lst_valid_auc = list(), list()
        lst_train_ks, lst_valid_ks = list(), list()

        for k in range(self.kfold):
            df_cv_train = self.df_data[self.df_data[f"cv{k}"] == 'train']
            df_cv_valid = self.df_data[self.df_data[f"cv{k}"] == 'valid']
            print(k, df_cv_train.shape, df_cv_valid.shape)
            print(params)

            x_train, y_train = df_cv_train[self.feature_list], df_cv_train[self.label]
            x_valid, y_valid = df_cv_valid[self.feature_list], df_cv_valid[self.label]

            # Early stopping and periodic logging go through callbacks: the
            # early_stopping_rounds / verbose keyword arguments of fit() were
            # removed in LightGBM 4.
            model.fit(x_train,
                      y_train,
                      eval_set=[(x_train, y_train), (x_valid, y_valid)],
                      eval_metric='auc',
                      callbacks=[lgb.early_stopping(stopping_rounds=50),
                                 lgb.log_evaluation(period=20)])

            yprob = model.predict_proba(x_train)[:, 1]
            train_auc = self._eval_auc(y_train, yprob)
            train_ks = self._eval_ks(y_train, yprob)
            yprob = model.predict_proba(x_valid)[:, 1]
            valid_auc = self._eval_auc(y_valid, yprob)
            valid_ks = self._eval_ks(y_valid, yprob)

            lst_train_auc.append(train_auc)
            lst_valid_auc.append(valid_auc)
            lst_train_ks.append(train_ks)
            lst_valid_ks.append(valid_ks)

            print(train_auc, valid_auc, train_ks, valid_ks)

        run_time = timer() - start

        train_auc_avg = np.mean(lst_train_auc)
        valid_auc_avg = np.mean(lst_valid_auc)
        train_ks_avg = np.mean(lst_train_ks)
        valid_ks_avg = np.mean(lst_valid_ks)

        # hyperopt minimises, so the validation KS is negated.
        loss = -valid_ks_avg

        # Append this evaluation to the log; the context manager closes the
        # handle, which the original left open on every call.
        with open(self.fp_path, 'a', newline='') as csv_conn:
            csv.writer(csv_conn).writerow([loss,
                                           train_auc_avg, valid_auc_avg,
                                           train_ks_avg, valid_ks_avg,
                                           lst_train_auc, lst_valid_auc,
                                           lst_train_ks, lst_valid_ks,
                                           params, self.iter, run_time])

        res = {'loss': loss,
               'train_auc': train_auc_avg, 'valid_auc': valid_auc_avg,
               'train_ks': train_ks_avg, 'valid_ks': valid_ks_avg,
               'lst_train_auc': lst_train_auc, 'lst_valid_auc': lst_valid_auc,
               'lst_train_ks': lst_train_ks, 'lst_valid_ks': lst_valid_ks,
               'params': params, 'iteration': self.iter, 'train_time': run_time,
               'status': STATUS_OK}

        print(self.iter)
        print(res)

        return res

    def optimize(self, max_evals):
        """Run the TPE search for ``max_evals`` evaluations.

        Returns:
            The ``best`` dict produced by ``hyperopt.fmin``. For parameters
            declared with ``hp.choice`` hyperopt reports the index of the chosen
            option, not the option's value.
        """
        self.iter = 0

        space = {
            'max_depth': hp.choice('max_depth', [3, 4, 5]),
            'num_leaves': hp.choice('num_leaves', [5, 6, 7, 12, 13, 14, 15, 28, 29, 30, 31]),
            'subsample': hp.choice('subsample', [0.8, 0.9, 1.0]),
            'colsample_bytree': hp.choice('colsample_bytree', [0.8, 0.9, 1.0]),
            'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1000)),
            'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1000))
        }

        rstate = np.random.default_rng(0)
        # Keep the trials on the instance so the search history can be inspected.
        self.trials = Trials()
        best = fmin(fn=self.objective, space=space, algo=tpe.suggest, max_evals=max_evals,
                    trials=self.trials, rstate=rstate)

        print(best)
        return best

sample_df = sample_df.reset_index(drop=True)

# LGBBO trains a cross-entropy classifier and scores it with ROC-AUC / KS, all of
# which need a binary target, whereas `score` takes the values 0, 1 and 5.
# Derive "was clicked or booked" as the label.
# NOTE(review): confirm that click-or-book is the intended binary target.
sample_df['clicked'] = (sample_df['score'] > 0).astype(int)

# Pre-assign every row to 'train' or 'valid' for each of the three folds; the
# folds stay stratified on the original relevance grade.
kf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
for k, (itrain, ivalid) in enumerate(kf.split(sample_df[features], sample_df['score'])):
    sample_df[f"cv{k}"] = None
    # The index was reset above, so positional fold indices are valid labels.
    sample_df.loc[itrain, f'cv{k}'] = 'train'
    sample_df.loc[ivalid, f'cv{k}'] = 'valid'

print('start tuning...')
bo = LGBBO('param.csv')
bo.load_data(sample_df, features, 'clicked')
bo.optimize(10000)

## From here it's workable

In [None]:


# Split on whole searches so every row of a srch_id lands on one side only, then
# sort by srch_id (stable sort keeps the within-search row order). LightGBM reads
# `group` as consecutive block sizes, so the rows of a search must be contiguous
# and in the same order as the sizes; the previous shuffled row-level split
# combined with srch_id-sorted counts did not satisfy that.
train_ids, test_ids = train_test_split(df['srch_id'].unique(), test_size=0.2, random_state=42)
train_df = df[df['srch_id'].isin(train_ids)].sort_values('srch_id', kind='stable')
test_df = df[df['srch_id'].isin(test_ids)].sort_values('srch_id', kind='stable')

# Build the LightGBM data structures
train_data = lgb.Dataset(train_df[features], label=train_df['score'],
                         group=train_df.groupby('srch_id').size().to_numpy())
test_data = lgb.Dataset(test_df[features], label=test_df['score'],
                        group=test_df.groupby('srch_id').size().to_numpy())

# Model parameters
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [3, 5],
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbose': -1
}

# Train the model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])

# Predict on the held-out searches
test_pred = bst.predict(test_df[features])
test_df['predictions'] = test_pred

# Order each search by descending predicted score
test_df = test_df.sort_values(['srch_id', 'predictions'], ascending=[True, False])

# NDCG@5 per search, averaged over the searches that contain more than one
# distinct relevance grade.
grouped = test_df.groupby('srch_id')
ndcg_scores = []

for name, group in grouped:
    true_relevance = group['score'].values
    scores_pred = group['predictions'].values
    if len(np.unique(true_relevance)) > 1:
        ndcg_scores.append(ndcg_score([true_relevance], [scores_pred], k=5))

# Guard against an empty list so no RuntimeWarning is raised when nothing qualified.
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
print(f"Average NDCG Score: {average_ndcg}")


### Result output

In [None]:
# Stream the feature-engineered test set through an iterator, chunk by chunk.
chunk_size = 10000
reader = pd.read_csv('/Users/eva/Documents/Study/Y1S2/DMT/assignment2/feature_engineered_test_set_VU_DM.csv', chunksize=chunk_size)

# Score every chunk with the trained model and keep only the identifier columns
# next to the prediction. (Pre-processing such as NA filling could be added here.)
predictions = [
    chunk[['srch_id', 'prop_id']].assign(predictions=bst.predict(chunk[features]))
    for chunk in reader
]

# Stack the per-chunk results into one frame.
final_predictions = pd.concat(predictions)

In [None]:
# Within each search, order properties from highest to lowest predicted score
# (the ordering expected for the Kaggle submission), then keep only the two
# identifier columns.
final_predictions = final_predictions.sort_values(
    by=['srch_id', 'predictions'], ascending=[True, False]
)[['srch_id', 'prop_id']]

# Write the submission without the index but with a header row.
final_predictions.to_csv('train=all_featured_cleaned(train+set_all_grouped_by).csv', header=True, index=False)

In [None]:
# Display the final (srch_id, prop_id) ranking that was written to disk.
print(final_predictions)