In [110]:
import re
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from kanjize import kanji2number
import japanize_matplotlib 

import optuna
from optuna.integration import OptunaSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

np.random.seed(42)

#行の表示数の上限を撤廃
pd.set_option('display.max_rows', None)

#列の表示数の上限を撤廃
pd.set_option('display.max_columns', None)


In [56]:
submit_df = pd.read_csv('./data/sample_submit.csv', header=None)

In [4]:
test_df = pd.read_csv('./data/test.csv')

In [5]:
train_df = pd.read_csv('./data/train.csv')

In [6]:
all_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

In [7]:
# id:削除
all_df = all_df.drop("id", axis=1)

In [8]:
# Age:age_numericという数値型の特徴量に変換
# 年齢を表す特徴量に「代」「歳」「才」「漢数字＋歳」などの異なる形式が混在している場合、これらを全て数値型に変換する

# 漢数字を数値に変換する関数
def kanji2number(kanji_str):
    kanji_dict = {'〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}
    num = 0
    ten_multiplier = 1
    
    for char in reversed(kanji_str):
        if char == '十':
            ten_multiplier = 10
        else:
            num += kanji_dict.get(char, 0) * ten_multiplier
            ten_multiplier = 1
            
    return num

def convert_age_to_number(age_str):
    # 入力が文字列かどうかを確認
    if isinstance(age_str, str):
        # 1. 漢数字を数値に変換
        try:
            age_str = re.sub(r'[一二三四五六七八九十〇]+', lambda x: str(kanji2number(x.group())), age_str)
        except Exception:
            pass

        # 2. 数字と単位を取り出す
        match = re.search(r'(\d+)(代|歳|才)?', age_str)
        if match:
            number = int(match.group(1))
            return number  # 数字と単位があればそのまま返す
    elif isinstance(age_str, (int, float)):
        if math.isnan(age_str):  # NaNを確認
            return age_str
        return int(age_str)

    return None

In [9]:
def split_features(text):
    # 区切り文字の正規表現パターンを定義（全角スペースや半角スペースを考慮）
    delimiters = r'[ 　、/,\t／\n\\]+'
    
    # 正規表現を使用してテキストを分割
    features = re.split(delimiters, text, 2)
    
    # 空文字を取り除く
    features = [feature for feature in features if feature.strip()]
    
    return features

# 特徴量を分割して新しい列を作成
# 特徴量の数に応じて動的に列を作成する
def add_features_columns(df, feature_column):
    # 特徴量の分割
    split_data = df[feature_column].apply(split_features)
    
    # 最大の特徴量の数を取得
    max_features = split_data.apply(len).max()
    
    # 新しい列名を作成
    new_columns = [f'特徴{i+1}' for i in range(max_features)]
    
    # 各行の特徴量を新しい列に分割
    split_df = pd.DataFrame(split_data.tolist(), columns=new_columns)
    
    # 元のデータフレームに新しい列を追加
    df = df.join(split_df)
    
    return df

In [10]:
# 特徴２を自動車所持している時を１、していない時を０に分類
def categorize_car(status):
    # 車を所持していない場合のキーワード
    no_car_keywords = [
        "車未所持", "自動車未所有", "自家用車なし", 
        "乗用車なし", "車なし", "車保有なし"
    ]
    
    # 車を所持している場合のキーワード
    has_car_keywords = [
        "車所持", "自動車所有", "自家用車あり", 
        "乗用車所持", "車保有", "車あり"
    ]
    
    if status in no_car_keywords:
        return 0
    elif status in has_car_keywords:
        return 1
    else:
        return None  # 不明な場合はNoneを返す

In [11]:
# 特徴３の整理

# 分類用の関数
def classify_children(text):
    if "なし" in text or "無し" in text or "ゼロ" in text or "無子" in text or "非育児家庭" in text:
        return 0
    elif "1人" in text or "こども1人" in text or "1児" in text or "子供有り(1人)" in text or "子供有り 1人" in text:
        return 1
    elif "2人" in text or "こども2人" in text or "2児" in text or "子供有り 2人" in text or "子供有り(2人)" in text:
        return 2
    elif "3人" in text or "こども3人" in text or "3児" in text or "子供有り 3人" in text or "子供有り(3人)" in text:
        return 3
    elif "不明" in text or "わからない" in text or "不詳" in text:
        return -1
    else:
        return -1  # 該当しない場合は不明として扱う

In [12]:
# DurationOfPitch：数値型に変換

# 分と秒の混合形式の文字列を秒単位に変換
def convert_to_seconds(time_str):
    if pd.isna(time_str):
        return time_str  # 欠損値はそのまま返す

    if isinstance(time_str, str):
        minutes = 0
        seconds = 0

        if '分' in time_str:
            # 分を抽出
            parts = time_str.split('分')
            minutes = int(parts[0].strip())
            time_str = parts[1] if len(parts) > 1 else '0秒'

        if '秒' in time_str:
            # 秒を抽出
            seconds = int(time_str.split('秒')[0].strip())

        # 総秒数を計算
        total_seconds = minutes * 60 + seconds
        return total_seconds
    else:
        return time_str  # 文字列でない場合はそのまま返す

In [13]:
# Gender分類用の関数
def categorize_gender(status):
    # 車を所持していない場合のキーワード
    male_keywords = [
        "Male", "male", "MALE", 
        "Ｍａｌｅ", "ｍａｌｅ", "ＭＡＬＥ"
    ]
    
    # 車を所持している場合のキーワード
    female_keywords = [
        "Female", "female", "Fe Male", "FEMALE", "Ｆｅｍａｌｅ", "ｆｅｍａｌｅ",
        "fe male", "FE MALE", "Ｆｅ　Ｍａｌｅ", "ＦＥＭＡＬＥ", "ｆｅ　ｍａｌｅ", "ＦＥ　ＭＡＬＥ"
    ]
    
    if status in male_keywords:
        return 1
    elif status in female_keywords:
        return 0
    else:
        return None  # 不明な場合はNoneを返す

In [14]:
# ProductPitched分類用の関数
def categorize_Product(status):
    basic_keywords = [
        "Basic", "basic", "BASIC", "Basıc", "Βasic", "Basi𝘤",
        "Вasic", "𐊡asic", "Basiс", "B𝖺sic", "B𝖺sic", "𐊡asi𝘤",
        "Basıϲ", "Βasıc", "BASIС", "B𝖺si𝘤", "ΒASIС", "basiϲ",
        "В𝖺sic", "BAꓢIC", "BAՏIC", "BΑSIC", "Βası𝘤", "BASΙC",
        "Baｓic", "basıc", "Basiϲ", "Basiϲ", "Basiϲ"
    ]
    
    standard_keywords = [
        "Standard", "standard", "STANDARD", "Stand𝖺rd", "Տtandard", "Staոdard",
        "Standa𝘳d", "S𝘵andard", "Standar𝔡", "St𝖺ndard", "ꓢtandard", "staոdard",
        "Ѕtandard", "Տtanda𝘳d", "STANᗞARD", "STANDARᎠ", "ЅTANDARD"
    ]
    
    deluxe_keywords = [
        "Deluxe", "DELUXE", "deluxe", "De|uxe", "Delu×e", "ᗞeluxe", "𝙳eluxe",
        "DELUXΕ", "Ꭰeluxe", "de|u×e", "ᎠELUXE", "de|uxe", "DΕLUXΕ"
    ]
    
    super_deluxe_keywords = [
    "Super Deluxe", "super deluxe", "SUPER DELUXE", "Super De|uxe", "Super ᗞeluxe",
    "Super Ꭰeluxe", "ꓢuper Deluxe", "Ѕuper Deluxe", "Super 𝙳eluxe", "SUPER DΕLUXE",
    "ꓢuper De|uxe", "Տuper Deluxe ", "super de|uxe", "ｓuper deluxe", "Տuper Deluxe"
    ]
    
    king_keywords = [
    "King", "KING", "king", "Kıոg", "Kiոg", "Kıng"
    ]
    
    if status in basic_keywords:
        return 0
    elif status in standard_keywords:
        return 1
    elif status in deluxe_keywords:
        return 2
    elif status in super_deluxe_keywords:
        return 3
    elif status in king_keywords:
        return 4
    else:
        return None  # 不明な場合はNoneを返す

In [15]:
# Designation分類用の関数
def categorize_Product(status):
    executive_keywords = [
        "Executive", "Exеcutive", "Exеcutivе", "Executivе", "Executiѵe", "Execuｔive", "Exеcutiѵе",
        "Еxecutive", "Exеcｕtive", "Е×еcutive", "E×ecutive", "Executiѵе", "Еxecutivе", "E×ecｕtive",
        "Exеcｕtivе", "Еxеcutivе", "Execｕtive", "E×еcutiѵe", "Еxecuｔive", "Е×ecutive"
    ]
    
    manager_keywords = [
        "Manager", "Μanager", "Manαger", "Managеr", "Mαnager", "Manage𝙧", "Mαnage𝙧", "Mαnαger",
        "Mαnagеr", "Μanage𝙧", "Manαgеr", "Μanagеr"
    ]
    
    senior_manager_keywords = [
        "Senior Manager", "Senior Managеr", "Senior Manαger", "Sеnior Manager", "Senior Mαnαger",
        "Senior Manage𝙧", "Տenior Manager", "Ѕenior Manager", "Sеnior Managеr", "Senior Mαnager",
        "Senior Mαnager", "Ѕenior Manαger", "Տenior Μanager", "Senio𝙧 Manager", "Senior Managе𝙧",
        "Senior Μanαger"
    ]
    
    avp_keywords = [
    "AVP", "АVP", "ΑVP", "AVＰ"
    ]
    
    vp_keywords = [
    "VP", "VＰ"
    ]
    
    if status in executive_keywords:
        return 0
    elif status in manager_keywords:
        return 1
    elif status in senior_manager_keywords:
        return 2
    elif status in avp_keywords:
        return 3
    elif status in vp_keywords:
        return 4
    else:
        return None  # 不明な場合はNoneを返す

In [16]:
# NumberOfTrips分類用の関数
def categorize_NumberOfTrips(status):

    one = ["1", "年に1回"]
    
    two= ["2", "年に2回", "半年に1回"]
    
    three = ["3", "年に3回"]
    
    four = ["4", "年に4回", "四半期に1回"]
    
    five = ["5", "年に5回"]
    
    six = ["6", "年に6回"]
    
    seven = ["7", "年に7回"]

    eight = ["8", "年に8回"]
    
    if status in one:
        return 1
    elif status in two:
        return 2
    elif status in three:
        return 3
    elif status in four:
        return 4
    elif status in five:
        return 5
    elif status in six:
        return 6
    elif status in seven:
        return 7
    elif status in eight:
        return 8
    else:
        return None

In [17]:
# 関数を適用して新しい列を作成
all_df['Age'] = all_df['Age'].apply(convert_age_to_number)

# 特徴量列を分割して新しい列を作成
all_df = add_features_columns(all_df, 'customer_info')
all_df = all_df.drop('customer_info', axis=1)
all_df = all_df.rename(columns={'特徴1': '結婚歴', '特徴2': '車の有無', '特徴3': '子供の有無'}, copy=False)

all_df['車の有無'] = all_df['車の有無'].apply(categorize_car)

# データを分類
all_df['子供の有無'] = all_df['子供の有無'].apply(classify_children)

all_df['DurationOfPitch'] = all_df['DurationOfPitch'].apply(convert_to_seconds)

all_df['Gender'] = all_df['Gender'].apply(categorize_gender)

all_df['ProductPitched'] = all_df['ProductPitched'].apply(categorize_Product)

all_df['Designation'] = all_df['Designation'].apply(categorize_Product)

all_df['NumberOfTrips'] = all_df['NumberOfTrips'].apply(categorize_NumberOfTrips)

# MonthlyIncome分類用の関数
# 数値以外の文字を削除し、float型に変換
all_df['MonthlyIncome'] = all_df['MonthlyIncome'].str.replace('月収', '').str.replace('万円', '').astype(float) * 10000

In [18]:
all_df.head()

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,結婚歴,車の有無,子供の有無
0,50.0,Self Enquiry,2,900.0,Large Business,1,1.0,4.0,,3.0,5.0,1,4,0.0,2539050000.0,1.0,未婚,0,0
1,56.0,Company Invited,1,840.0,Salaried,1,1.0,4.0,,3.0,2.0,1,4,2.0,4044750000.0,0.0,離婚済み,1,0
2,,Self Enquiry,1,600.0,Large Business,0,1.0,3.0,,3.0,4.0,0,4,0.0,2781450000.0,1.0,結婚済み,0,0
3,37.0,Self Enquiry,2,1080.0,Small Business,0,1.0,3.0,,4.0,1.0,0,5,2.0,3268050000.0,0.0,離婚済み,1,0
4,48.0,Company Invited,3,1020.0,Small Business,0,1.0,3.0,,4.0,4.0,0,4,0.0,2584350000.0,1.0,独身,1,0


In [19]:
# One-Hot Encoding
all_df = pd.get_dummies(all_df, drop_first=True)

In [45]:
new_train_df = all_df[~all_df["ProdTaken"].isnull()]
new_test_df = all_df[all_df["ProdTaken"].isnull()].drop("ProdTaken",axis=1).reset_index(drop=True)


In [46]:
new_test_df.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,Gender,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,車の有無,子供の有無,TypeofContact_Self Enquiry,Occupation_Salaried,Occupation_Small Business,結婚歴_独身,結婚歴_結婚済み,結婚歴_離婚済み
0,48.0,2,780.0,1,1.0,4.0,3.0,7.0,0,3,3.0,4969500000.0,1,0,True,False,True,False,True,False
1,30.0,2,720.0,0,1.0,4.0,3.0,4.0,1,3,2.0,300000.0,0,0,True,False,True,False,True,False
2,25.0,1,540.0,0,1.0,4.0,3.0,1.0,0,3,0.0,260000.0,0,0,True,True,False,False,False,True
3,21.0,2,420.0,1,1.0,4.0,4.0,1.0,0,3,2.0,2598750000.0,1,0,False,True,False,False,False,True
4,41.0,1,420.0,1,1.0,4.0,3.0,1.0,0,4,0.0,2688300000.0,1,0,False,True,False,True,False,False


In [22]:
train_X = new_train_df.drop(["ProdTaken"], axis=1)
train_Y = new_train_df["ProdTaken"]

In [23]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_Y, test_size=0.25, random_state=42)

In [106]:
"""
Training API：Scikit-learn APIより細かな設定が可能なため
Stratified K-Fold: クラス不均衡がある分類問題であるため
パラメータ調整：optuna
今回の評価指標であるroc_auc_scoreは確率からスコアを計算するため、model.predict_proba()を使用
（分類における予測方法には、予測を0と1の二値で判定するmodel.predict()と、確率を計算するmodel.predict_proba()の二つ）

"""
# パラメータ探索（Optuna）
def objective(trial):

    params = {
        'random_seed':42,
        'objective': 'binary',  # 目的に応じて変更 (binary, multiclass, etc.)
        'metric': 'auc',  # メトリックも目的に応じて変更
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_bin': trial.suggest_int("max_bin",50,200),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 256),
        'min_sum_hessian_in_leaf': trial.suggest_int('min_sum_hessian_in_leaf', 1, 10),        
#        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
#        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
#        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
#        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#        'max_depth': trial.suggest_int('max_depth', 3, 8),
#        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
    }

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    model = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                      num_boost_round=100, early_stopping_rounds=100, verbose_eval=False)
    y_prob = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = np.round(y_prob)  # 0.5を閾値として0/1に分類
    score = roc_auc_score(np.round(y_val), np.round(y_pred))

    return score

In [107]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=42))
study.optimize(objective, n_trials=50)
study.best_params

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-09-01 17:26:59,057] A new study created in memory with name: no-name-481bab69-b6d3-4381-8edf-d1b689992207
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:26:59,361] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'num_leaves': 133, 'max_bin': 140, 'bagging_fraction': 0.8248435466776274, 'bagging_freq': 1, 'feature_fraction': 0.9819459112971965, 'min_data_in_leaf': 214, 'min_sum_hessian_in_leaf': 3}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:26:59,466] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.02, 'num_leaves': 38, 'max_bin': 94, 'bagging_fraction': 0.619817105976215, 'bagging_freq': 4, 'feature_fraction': 0.8711055768358081, 'min_data_in_leaf': 52, 'min_sum_hessian_in_leaf': 6}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 243
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 244
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:26:59,654] Trial 2 finished with value: 0.5899832775919732 and parameters: {'learning_rate': 0.02, 'num_leaves': 125, 'max_bin': 95, 'bagging_fraction': 0.45860326840383037, 'bagging_freq': 5, 'feature_fraction': 0.6640914962437607, 'min_data_in_leaf': 33, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:26:59,838] Trial 3 finished with value: 0.535 and parameters: {'learning_rate': 0.01, 'num_leaves': 44, 'max_bin': 196, 'bagging_fraction': 0.8650796940166687, 'bagging_freq': 7, 'feature_fraction': 0.9368964102565893, 'min_data_in_leaf': 154, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:26:59,954] Trial 4 finished with value: 0.51 and parameters: {'learning_rate': 0.02, 'num_leaves': 66, 'max_bin': 92, 'bagging_fraction': 0.7256176498949491, 'bagging_freq': 1, 'feature_fraction': 0.8813181884524238, 'min_data_in_leaf': 21, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 241
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 253
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,164] Trial 5 finished with value: 0.570819397993311 and parameters: {'learning_rate': 0.014, 'num_leaves': 29, 'max_bin': 104, 'bagging_fraction': 0.4695214357150779, 'bagging_freq': 7, 'feature_fraction': 0.7739788760965347, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 1}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,279] Trial 6 finished with value: 0.5 and parameters: {'learning_rate': 0.016, 'num_leaves': 113, 'max_bin': 164, 'bagging_fraction': 0.7367663185416977, 'bagging_freq': 6, 'feature_fraction': 0.6962773578186345, 'min_data_in_leaf': 135, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,375] Trial 7 finished with value: 0.5 and parameters: {'learning_rate': 0.02, 'num_leaves': 52, 'max_bin': 111, 'bagging_fraction': 0.8533306831258292, 'bagging_freq': 2, 'feature_fraction': 0.44618794589727584, 'min_data_in_leaf': 75, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,528] Trial 8 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 90, 'max_bin': 171, 'bagging_fraction': 0.937654779954096, 'bagging_freq': 3, 'feature_fraction': 0.46603115471660606, 'min_data_in_leaf': 60, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:00,637] Trial 9 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'num_leaves': 64, 'max_bin': 192, 'bagging_fraction': 0.5939217592124532, 'bagging_freq': 4, 'feature_fraction': 0.8218113753371068, 'min_data_in_leaf': 94, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,754] Trial 10 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 85, 'max_bin': 57, 'bagging_fraction': 0.5671878785419668, 'bagging_freq': 7, 'feature_fraction': 0.5437371344001835, 'min_data_in_leaf': 38, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 206
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:00,914] Trial 11 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 102, 'max_bin': 145, 'bagging_fraction': 0.7214648104448551, 'bagging_freq': 1, 'feature_fraction': 0.9011814973535428, 'min_data_in_leaf': 83, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:01,030] Trial 12 finished with value: 0.5 and parameters: {'learning_rate': 0.012, 'num_leaves': 42, 'max_bin': 154, 'bagging_fraction': 0.6320412077803225, 'bagging_freq': 7, 'feature_fraction': 0.482512566487596, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:01,216] Trial 13 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 51, 'max_bin': 64, 'bagging_fraction': 0.938329454771996, 'bagging_freq': 7, 'feature_fraction': 0.7798608743639608, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 290
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:01,456] Trial 14 finished with value: 0.52 and parameters: {'learning_rate': 0.01, 'num_leaves': 137, 'max_bin': 141, 'bagging_fraction': 0.4055182309699778, 'bagging_freq': 1, 'feature_fraction': 0.7981010614648335, 'min_data_in_leaf': 3, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 297
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:01,656] Trial 15 finished with value: 0.515 and parameters: {'learning_rate': 0.016, 'num_leaves': 117, 'max_bin': 148, 'bagging_fraction': 0.9095340462965068, 'bagging_freq': 5, 'feature_fraction': 0.7409851620012831, 'min_data_in_leaf': 25, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:01,777] Trial 16 finished with value: 0.5 and parameters: {'learning_rate': 0.012, 'num_leaves': 85, 'max_bin': 137, 'bagging_fraction': 0.6955106162913183, 'bagging_freq': 2, 'feature_fraction': 0.8334712691569033, 'min_data_in_leaf': 73, 'min_sum_hessian_in_leaf': 1}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 286
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:02,000] Trial 17 finished with value: 0.5808193979933111 and parameters: {'learning_rate': 0.014, 'num_leaves': 141, 'max_bin': 114, 'bagging_fraction': 0.9799928914262017, 'bagging_freq': 7, 'feature_fraction': 0.9118056732804161, 'min_data_in_leaf': 77, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.




  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:02,113] Trial 18 finished with value: 0.5241638795986622 and parameters: {'learning_rate': 0.016, 'num_leaves': 32, 'max_bin': 142, 'bagging_fraction': 0.994032310062558, 'bagging_freq': 1, 'feature_fraction': 0.710997791418242, 'min_data_in_leaf': 225, 'min_sum_hessian_in_leaf': 8}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 291
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:02,314] Trial 19 finished with value: 0.6041471571906354 and parameters: {'learning_rate': 0.02, 'num_leaves': 139, 'max_bin': 127, 'bagging_fraction': 0.7009097768123198, 'bagging_freq': 6, 'feature_fraction': 0.7899783584666591, 'min_data_in_leaf': 181, 'min_sum_hessian_in_leaf': 8}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 242
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:02,493] Trial 20 finished with value: 0.5041638795986622 and parameters: {'learning_rate': 0.008, 'num_leaves': 91, 'max_bin': 93, 'bagging_fraction': 0.7544999563414065, 'bagging_freq': 1, 'feature_fraction': 0.4224089132495287, 'min_data_in_leaf': 211, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 280
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:02,660] Trial 21 finished with value: 0.535 and parameters: {'learning_rate': 0.012, 'num_leaves': 89, 'max_bin': 131, 'bagging_fraction': 0.782457940898924, 'bagging_freq': 6, 'feature_fraction': 0.9855112476775207, 'min_data_in_leaf': 133, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:02,732] Trial 22 finished with value: 0.5 and parameters: {'learning_rate': 0.018, 'num_leaves': 111, 'max_bin': 111, 'bagging_fraction': 0.5039765920425074, 'bagging_freq': 2, 'feature_fraction': 0.5501457388987572, 'min_data_in_leaf': 142, 'min_sum_hessian_in_leaf': 8}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 252
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:03,117] Trial 23 finished with value: 0.5699832775919732 and parameters: {'learning_rate': 0.012, 'num_leaves': 52, 'max_bin': 103, 'bagging_fraction': 0.8547076662786215, 'bagging_freq': 1, 'feature_fraction': 0.46964358430414976, 'min_data_in_leaf': 13, 'min_sum_hessian_in_leaf': 1}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:03,236] Trial 24 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 76, 'max_bin': 110, 'bagging_fraction': 0.76951005883133, 'bagging_freq': 5, 'feature_fraction': 0.42718240586322676, 'min_data_in_leaf': 97, 'min_sum_hessian_in_leaf': 7}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_u

[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 259
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:03,390] Trial 25 finished with value: 0.505 and parameters: {'learning_rate': 0.01, 'num_leaves': 96, 'max_bin': 191, 'bagging_fraction': 0.7452845067255274, 'bagging_freq': 3, 'feature_fraction': 0.785972931065412, 'min_data_in_leaf': 118, 'min_sum_hessian_in_leaf': 6}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:03,472] Trial 26 finished with value: 0.5 and parameters: {'learning_rate': 0.012, 'num_leaves': 22, 'max_bin': 64, 'bagging_fraction': 0.8098040640498141, 'bagging_freq': 1, 'feature_fraction': 0.5913853781762568, 'min_data_in_leaf': 217, 'min_sum_hessian_in_leaf': 1}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 241
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:03,671] Trial 27 finished with value: 0.586638795986622 and parameters: {'learning_rate': 0.018, 'num_leaves': 125, 'max_bin': 92, 'bagging_fraction': 0.5064637262678338, 'bagging_freq': 6, 'feature_fraction': 0.8841008435603583, 'min_data_in_leaf': 254, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:03,929] Trial 28 finished with value: 0.5966387959866221 and parameters: {'learning_rate': 0.014, 'num_leaves': 118, 'max_bin': 65, 'bagging_fraction': 0.94153174400774, 'bagging_freq': 4, 'feature_fraction': 0.895874479664645, 'min_data_in_leaf': 83, 'min_sum_hessian_in_leaf': 9}. Best is trial 0 with value: 0.5.




  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:04,001] Trial 29 finished with value: 0.5 and parameters: {'learning_rate': 0.02, 'num_leaves': 95, 'max_bin': 145, 'bagging_fraction': 0.6690673131869919, 'bagging_freq': 3, 'feature_fraction': 0.5971987272219496, 'min_data_in_leaf': 173, 'min_sum_hessian_in_leaf': 8}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 251
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:04,192] Trial 30 finished with value: 0.5291638795986622 and parameters: {'learning_rate': 0.008, 'num_leaves': 136, 'max_bin': 102, 'bagging_fraction': 0.47024020985656356, 'bagging_freq': 2, 'feature_fraction': 0.8569063790304834, 'min_data_in_leaf': 159, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:04,414] Trial 31 finished with value: 0.6024749163879599 and parameters: {'learning_rate': 0.014, 'num_leaves': 149, 'max_bin': 106, 'bagging_fraction': 0.6223852882401346, 'bagging_freq': 6, 'feature_fraction': 0.9683491464303152, 'min_data_in_leaf': 253, 'min_sum_hessian_in_leaf': 8}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:04,560] Trial 32 finished with value: 0.51 and parameters: {'learning_rate': 0.018, 'num_leaves': 84, 'max_bin': 51, 'bagging_fraction': 0.6811963851964757, 'bagging_freq': 1, 'feature_fraction': 0.4712907497608432, 'min_data_in_leaf': 31, 'min_sum_hessian_in_leaf': 7}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:04,783] Trial 33 finished with value: 0.5433277591973245 and parameters: {'learning_rate': 0.012, 'num_leaves': 146, 'max_bin': 51, 'bagging_fraction': 0.9819272960245834, 'bagging_freq': 1, 'feature_fraction': 0.9346858682188427, 'min_data_in_leaf': 136, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:04,873] Trial 34 finished with value: 0.5 and parameters: {'learning_rate': 0.012, 'num_leaves': 102, 'max_bin': 138, 'bagging_fraction': 0.9406948062945933, 'bagging_freq': 1, 'feature_fraction': 0.5685779137553382, 'min_data_in_leaf': 244, 'min_sum_hessian_in_leaf': 9}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 287
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:05,080] Trial 35 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'num_leaves': 30, 'max_bin': 197, 'bagging_fraction': 0.9917264466877618, 'bagging_freq': 5, 'feature_fraction': 0.7216578198064723, 'min_data_in_leaf': 80, 'min_sum_hessian_in_leaf': 9}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:05,216] Trial 36 finished with value: 0.505 and parameters: {'learning_rate': 0.016, 'num_leaves': 74, 'max_bin': 190, 'bagging_fraction': 0.9196383337002451, 'bagging_freq': 1, 'feature_fraction': 0.41582018469835125, 'min_data_in_leaf': 97, 'min_sum_hessian_in_leaf': 9}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:05,268] Trial 37 finished with value: 0.5 and parameters: {'learning_rate': 0.008, 'num_leaves': 81, 'max_bin': 112, 'bagging_fraction': 0.5640442431584237, 'bagging_freq': 1, 'feature_fraction': 0.918833425753032, 'min_data_in_leaf': 209, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bag

[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 343
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:05,462] Trial 38 finished with value: 0.5416555183946489 and parameters: {'learning_rate': 0.008, 'num_leaves': 36, 'max_bin': 194, 'bagging_fraction': 0.763704780670528, 'bagging_freq': 2, 'feature_fraction': 0.803020410643514, 'min_data_in_leaf': 159, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 331
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:05,578] Trial 39 finished with value: 0.505 and parameters: {'learning_rate': 0.018, 'num_leaves': 93, 'max_bin': 182, 'bagging_fraction': 0.6420897197274382, 'bagging_freq': 1, 'feature_fraction': 0.41726960578800343, 'min_data_in_leaf': 194, 'min_sum_hessian_in_leaf': 7}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:05,910] Trial 40 finished with value: 0.5741471571906355 and parameters: {'learning_rate': 0.008, 'num_leaves': 77, 'max_bin': 186, 'bagging_fraction': 0.6089532802139802, 'bagging_freq': 4, 'feature_fraction': 0.8701918076446858, 'min_data_in_leaf': 103, 'min_sum_hessian_in_leaf': 7}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 273
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:06,114] Trial 41 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'num_leaves': 148, 'max_bin': 124, 'bagging_fraction': 0.597250966172505, 'bagging_freq': 5, 'feature_fraction': 0.5440873712669159, 'min_data_in_leaf': 21, 'min_sum_hessian_in_leaf': 2}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:06,256] Trial 42 finished with value: 0.51 and parameters: {'learning_rate': 0.02, 'num_leaves': 82, 'max_bin': 150, 'bagging_fraction': 0.5033919227209779, 'bagging_freq': 2, 'feature_fraction': 0.42452116975988735, 'min_data_in_leaf': 45, 'min_sum_hessian_in_leaf': 3}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:06,358] Trial 43 finished with value: 0.5 and parameters: {'learning_rate': 0.02, 'num_leaves': 110, 'max_bin': 55, 'bagging_fraction': 0.8796462393454256, 'bagging_freq': 5, 'feature_fraction': 0.4490554191693232, 'min_data_in_leaf': 224, 'min_sum_hessian_in_leaf': 10}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:06,590] Trial 44 finished with value: 0.5 and parameters: {'learning_rate': 0.012, 'num_leaves': 83, 'max_bin': 143, 'bagging_fraction': 0.6213481837418635, 'bagging_freq': 4, 'feature_fraction': 0.8484825628802539, 'min_data_in_leaf': 11, 'min_sum_hessian_in_leaf': 3}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:06,798] Trial 45 finished with value: 0.52 and parameters: {'learning_rate': 0.01, 'num_leaves': 51, 'max_bin': 90, 'bagging_fraction': 0.6263704978627735, 'bagging_freq': 1, 'feature_fraction': 0.593247499349907, 'min_data_in_leaf': 55, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:06,964] Trial 46 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'num_leaves': 90, 'max_bin': 138, 'bagging_fraction': 0.8472636845105981, 'bagging_freq': 4, 'feature_fraction': 0.47654818167733826, 'min_data_in_leaf': 74, 'min_sum_hessian_in_leaf': 4}. Best is trial 0 with value: 0.5.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 287
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
[I 2024-09-01 17:27:07,151] Trial 47 finished with value: 0.5 and parameters: {'learning_rate': 0.014, 'num_leaves': 40, 'max_bin': 87, 'bagging_fraction': 0.49640882395573344, 'bagging_freq': 2, 'feature_fraction': 0.5710571012163083, 'min_data_in_leaf': 46, 'min_sum_hessian_in_leaf': 9}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:07,332] Trial 48 finished with value: 0.5799832775919732 and parameters: {'learning_rate': 0.014, 'num_leaves': 133, 'max_bin': 173, 'bagging_fraction': 0.5547416962269639, 'bagging_freq': 2, 'feature_fraction': 0.8011859319546586, 'min_data_in_leaf': 238, 'min_sum_hessian_in_leaf': 6}. Best is trial 0 with value: 0.5.
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785


[I 2024-09-01 17:27:07,574] Trial 49 finished with value: 0.5566555183946489 and parameters: {'learning_rate': 0.012, 'num_leaves': 51, 'max_bin': 67, 'bagging_fraction': 0.7663720254649795, 'bagging_freq': 3, 'feature_fraction': 0.7487429328535673, 'min_data_in_leaf': 41, 'min_sum_hessian_in_leaf': 5}. Best is trial 0 with value: 0.5.


Number of finished trials: 50
Best trial: {'learning_rate': 0.01, 'num_leaves': 133, 'max_bin': 140, 'bagging_fraction': 0.8248435466776274, 'bagging_freq': 1, 'feature_fraction': 0.9819459112971965, 'min_data_in_leaf': 214, 'min_sum_hessian_in_leaf': 3}


In [102]:
"""
# 得られたパラメータを設定(１回目)
lgbm_params = {
    'learning_rate': 0.018,
     'num_leaves': 125,
     'max_bin': 92,
     'bagging_fraction': 0.5064637262678338,
     'bagging_freq': 6,
     'feature_fraction': 0.8841008435603583,
     'min_data_in_leaf': 254,
     'min_sum_hessian_in_leaf': 5
}
"""

"# 得られたパラメータを設定(１回目)\nlgbm_params = {\n    'learning_rate': 0.018,\n     'num_leaves': 125,\n     'max_bin': 92,\n     'bagging_fraction': 0.5064637262678338,\n     'bagging_freq': 6,\n     'feature_fraction': 0.8841008435603583,\n     'min_data_in_leaf': 254,\n     'min_sum_hessian_in_leaf': 5\n}\n"

In [108]:
# optuna修正後
lgbm_params = {
    # 二値分類問題
    'objective': 'binary',
    # AUC の最大化を目指す
    'metric': 'auc',
    'learning_rate': 0.01, 'num_leaves': 133, 'max_bin': 140, 'bagging_fraction': 0.8248435466776274, 'bagging_freq': 1, 'feature_fraction': 0.9819459112971965, 'min_data_in_leaf': 214, 'min_sum_hessian_in_leaf': 3
}

In [111]:
models = []
aucs = []
oof = np.zeros(len(train_X))

# クロスバリデーションの設定
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, val_index in skf.split(train_X, train_Y):
    X_train = train_X.iloc[train_index]
    X_val = train_X.iloc[val_index]
    y_train = train_Y.iloc[train_index]
    y_val = train_Y.iloc[val_index]
    
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval,
                      num_boost_round=100, early_stopping_rounds=20, verbose_eval=10)
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    #y_pred_label = np.rint(y_pred)  # 0.5を閾値として0/1に分類
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)
    tmp_auc = auc(fpr, tpr)
    print(tmp_auc)
    
    models.append(model)
    aucs.append(tmp_auc)
    oof[val_index] = y_pred



[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
Training until validation scores don't improve for 20 rounds
[10]	valid_0's auc: 0.809038
[20]	valid_0's auc: 0.816028
[30]	valid_0's auc: 0.816012
Early stopping, best iteration is:
[18]	valid_0's auc: 0.820192
0.8201923076923077
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 288
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785



In [112]:
# ROC曲線をプロット
plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

TypeError: must be real number, not function

In [113]:
sum(aucs)/len(aucs)

0.8006513592381344

In [114]:
# クロスバリデーション毎の各モデルで予測値を算出
preds = []

for model in models:
    pred = model.predict(new_test_df)
    preds.append(pred)
    
preds_array = np.array(preds)
preds_mean = np.mean(preds_array, axis=0)

In [115]:
# 予測値を元のスケールに戻す
# preds_exp = np.exp(preds_mean)

In [116]:
preds_mean

array([0.13333572, 0.13997995, 0.17507951, ..., 0.24897999, 0.17856732,
       0.10823259])

In [117]:
submit_df.head()

Unnamed: 0,0,1
0,3489,0.088587
1,3490,0.104305
2,3491,0.26471
3,3492,0.1605
4,3493,0.308525


In [118]:
submit_df[1] = preds_mean

In [119]:
submit_df.to_csv("./submit/submission_2.csv", index=False, header=None)

### 

In [43]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# パラメータの探索空間の定義
param_distributions = {
    'n_estimators': optuna.distributions.IntDistribution(2, 150),
    'max_depth': optuna.distributions.IntDistribution(1, 32, log=True)
}
# モデルの定義
clf = RandomForestClassifier()
# OptunaSearchCVの設定
optuna_search = OptunaSearchCV(
    clf,
    param_distributions,
    n_trials=100, # トライアル数（この例では100回）だけハイパーパラメータの組み合わせを試す
    scoring="accuracy",
    cv=cv,
    random_state=42,
    error_score='raise'
)
# モデルの最適化実行
optuna_search.fit(X_train, y_train)
# 最適化後のベストなパラメータとスコアの表示
print(f"Best parameters: {optuna_search.best_params_}")
print(f"Best cross-validation score: {optuna_search.best_score_:.3f}")
# テストデータに対する評価
test_score = optuna_search.score(X_val, y_val)
print(f"Test accuracy: {test_score:.3f}")
# 最適化されたモデルの特徴を取得して表示
best_estimator = optuna_search.best_estimator_
print(f"Best estimator: {best_estimator}")
# さらに、Optunaのスタディオブジェクトを取得して、詳細な最適化の結果を分析することも可能です。
study = optuna_search.study_
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print(f"  Params: {trial.params}")
# Optunaのビジュアライゼーション機能を使って、探索の過程を視覚的に確認することもできます。
# 以下は、すべてのトライアルの結果をプロットする例です。
optuna.visualization.plot_optimization_history(study)

  optuna_search = OptunaSearchCV(
[I 2024-08-30 02:16:19,850] A new study created in memory with name: no-name-cc0586e9-4e25-4698-8045-32a556ae9987
[W 2024-08-30 02:16:19,859] Trial 0 failed with parameters: {'n_estimators': 69, 'max_depth': 16} because of the following error: ValueError('Input X contains NaN.\nRandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values').
Traceback (most recent call last):
  File "/Users/ishidzukama

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
"""
Training API：Scikit-learn APIより細かな設定が可能なため
Stratified K-Fold: クラス不均衡がある分類問題であるため
パラメータ調整：optuna
"""
# クロスバリデーションの設定
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# パラメータ探索（Optuna）
def objective(trial, data=X, target=y):

    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25, random_state=42)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        'objective': 'binary',  # 目的に応じて変更 (binary, multiclass, etc.)
        'metric': 'binary_logloss',  # メトリックも目的に応じて変更
        'boosting_type': 'gbdt',
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
    }

    # クロスバリデーションによる評価
    cv_scores = []
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        model = lgb.train(param, train_data, valid_sets=[valid_data], early_stopping_rounds=100, verbose_eval=False)
        preds = model.predict(X_valid, num_iteration=model.best_iteration)
        pred_labels = np.rint(preds)  # 0.5を閾値として0/1に分類
        cv_scores.append(accuracy_score(y_valid, pred_labels))

    return np.mean(cv_scores)


In [None]:
# Optunaによる最適化の実行
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 最適なハイパーパラメータ
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# ベストパラメータでの最終モデルの学習
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X, y)

# モデルによる予測
predictions = final_model.predict(X)

In [None]:
# kfoldの分割数
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

#目的変数
col = "ProdTaken"
#学習データの目的変数を設定
y = df[col]
#学習データの説明変数を設定（目的変数を除く）
X = df.drop(col, axis=1)
feature_names = df.columns.to_list()

auc_list = []
precision_list = []
recall_list = []

#lgbに渡す用のパラメータを設定
params = {
    # task: デフォルトは’train’。’train’, ‘prediction’, ‘refit’などから選択
    'task': 'train',
    # boosting_type: ‘gbdt’がデフォルトでGradient Boosting Decision Tree。他には’rf’とすることでランダムフォレストを選択する事も可能
    'boosting_type': 'gbdt',
    # objective: 何を目的としたモデルを訓練するのかを指定。’regression’がデフォルトで回帰。二値分類を目的とする’binary’や多クラス分類を行うための’multiclass’もある
    'objective': 'binary',
    # 訓練の指標とする損失関数を指定。二値分類は’binary_logloss’を選択。回帰のためのモデルだと’rmse’。多クラス分類だと’multi_logloss’などを指定
    'metric': 'binary_logloss',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    # verbose: どれくらい詳しく訓練の途中経過を表示するかを指定
    'verbose': 0,
}

for train_index, test_index in tqdm(skf.split(X, y), total=k, desc="K-Fold Cross-Validation"):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))  
        
    # optunaでサーチしたパラメータ
    trial.params['objective'] = 'binary'
    lgbm_params = trial.params

    # ここではvalidをモデル評価、evalをフォールドアウト検証に使う・・・分割の大きさはデータセットと相談
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=42, 
                                                        shuffle=True, stratify=y_test, test_size=0.2)

    # データセットを生成する
    lgb_train = lgb.Dataset(X_train, y_train)

    # モデル評価用
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    #model = lgb.train(パラメータ, 学習データ, 検証データ, 学習木の数,学習が止まるまでの最大試行回数)
    model = lgb.train(params=lgbm_params, 
                      train_set=lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=10000, #num_boost_round=1000:決定木を直列に結んで学習を行う回数
                      early_stopping_rounds=100)# early_stopping_rounds=100:過学習を防ぐため学習を打ち切るもの、100回行って推定精度があがらなかった場合とまる

    predict_proba = model.predict(X_test, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]

    auc, precision, recall = get_evaluate(y_test, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

# kfoldの平均値を取得
print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list), 
                                                         np.mean(precision_list), 
                                                         np.mean(recall_list)))

[I 2024-08-14 19:46:03,233] A new study created in memory with name: no-name-73f7d64b-1fe6-46b0-9ed1-782a343db695
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0)
  param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
  param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
[I 2024-08-14 19:46:03,634] Trial 0 finished with value: 0.8581661891117478 and parameters: {'boosting': 'dart', 'num_leaves': 53, 'learning_rate': 1.1723208851755019e-06, 'drop_rate': 1.3237185189900724e-05, 'skip_drop': 0.00010599318312542955}. Best is trial 0 with value: 0.8581661891117478.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0)
  param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
  param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
[I 2024-08-14 19:46:03,783] Trial 1 finished with value: 0.8581661891117478 and parameters: {'boosting': 'dart', 'num_leaves': 242, 'learning_rate': 2.07303303

Number of finished trials: 100
Best trial:
  Value: 0.8825214899713467
  Params: 
    boosting: dart
    num_leaves: 368
    learning_rate: 0.05097448086895073
    drop_rate: 0.13412471855446875
    skip_drop: 0.0009530910150196744
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[1]	valid_0's binary_logloss: 0.393647
[2]	valid_0's binary_logloss: 0.381092
[3]	valid_0's binary_logloss: 0.371201
[4]	valid_0's binary_logloss: 0.362775
[5]	valid_0's binary_logloss: 0.362746
[6]	valid_0's binary_logloss: 0.364578
[7]	valid_0's binary_logloss: 0.35507
[8]	valid_0's binary_logloss: 0.348791
[9]	vali

In [None]:
# 訓練したモデルをファイルとして保存
model.save_model('model.txt', num_iteration=bst.best_iteration)

In [None]:
#検証データの予測値の算出
#予測値を返す
y_pred = model.predict(X_val, num_iteration=model.best_iteration)

roc = roc_curve(y_val, y_pred)

In [None]:
#予測結果を出力
predict = model.predict(df_test)
df_test["取引価格(総額)_log"] = predict
df_test[["取引価格(総額)_log"]].to_csv("submit_test.csv")

In [None]:
#特徴量重要度を示す
#feature_importance:それぞれの特徴量重要度が格納されている
#index=val_x.columns 検証データの説明変数が入っているval_xを持ってくる
pd.DataFrame(model.feature_importance(), index=val_x.columns, columns=["importance"]).sort_values("importance", ascending=False)

#決定木の分岐に使われた回数を示すものであるため、数値間の単純比較はできない
#線形回帰モデルなら偏回帰係数で行える

In [None]:
#別のpythonファイルで関数化したものを呼び出してデータ加工、モデル構築を行えるようにする
#テキストファイル(txt→py)に記述
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#新たな関数のインポート
import test_func as tf
%matplotlib inline

# 今後の方針
dropna
fillna
平均などで缺損値補完
標準化