In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings

warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

#表示するテーブルの最大行と最大列を指定
pd.options.display.max_columns = 100
pd.options.display.max_rows = 999

seed = 42
# 乱数初期化
random.seed(seed)
np.random.seed(seed)

# データの読み込み
train = pd.read_csv('src/train.csv', index_col=0)
test = pd.read_csv('src/test.csv', index_col=0)
sample_submit = pd.read_csv('src/sample_submit.csv')

In [2]:
# 訓練データの欠損値の個数をカウント
print('train data')
print(train.isnull().sum())

train data
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    4
disease     0
dtype: int64


In [3]:
# テストデータの欠損値の個数をカウント
print('test data')
print(test.isnull().sum())

test data
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    0
dtype: int64


In [4]:
# データ処理のためtrainとtestを結合
train_data = pd.concat([train, test])
print('trainサイズ:', len(train))
print('testサイズ:', len(test))
print('train_dataサイズ:', len(train_data))

trainサイズ: 891
testサイズ: 382
train_dataサイズ: 1273


In [5]:
train_data.dtypes

Age           int64
Gender       object
T_Bil       float64
D_Bil       float64
ALP         float64
ALT_GPT     float64
AST_GOT     float64
TP          float64
Alb         float64
AG_ratio    float64
disease     float64
dtype: object

In [6]:
# オブジェクトカラムをラベルエンコーディング
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for column in object_columns:
    train_data[column] = le.fit_transform(train_data[column])
    # train[column] = train[column].astype('cateogyr')
train_data[object_columns] = train_data[object_columns].astype('category')
train_data.dtypes

Age            int64
Gender      category
T_Bil        float64
D_Bil        float64
ALP          float64
ALT_GPT      float64
AST_GOT      float64
TP           float64
Alb          float64
AG_ratio     float64
disease      float64
dtype: object

In [7]:
#ハイパーパラメータのチューニング
import optuna
from sklearn.metrics import mean_squared_error

# MeanF1スコアを格納するリスト
AUC_scores = []

X = train_data.drop('disease', axis=1).iloc[: len(train)]
y = train_data['disease'].iloc[: len(train)]

X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y, shuffle=True,
                                                  train_size=0.7,
                                                  random_state=seed,
                                                  stratify=y)


def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'multy_logloss',
        'random_seed': seed,
        'learning_rate': 0.1,
        'verbosity': -1,
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 50),
    }

    train_lgb = lgb.Dataset(X_train, label=y_train)
    val_lgb = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, train_lgb,
                      valid_sets=[train_lgb, val_lgb],
                      num_boost_round=10000,
                      # callbacks=[lgb.early_stopping(stopping_rounds=10,
                      #                               verbose=False), lgb.log_evaluation(100)],
                      )

    y_pred_valid_prob = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_valid = np.where(y_pred_valid_prob < 0.5, 0, 1)
    score = np.sqrt(mean_squared_error(y_val, y_pred_valid))
    return score


study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=seed))
study.optimize(objective, n_trials=1000)
study.best_params

[I 2024-05-01 21:56:39,478] A new study created in memory with name: no-name-084e79db-ce9f-49e0-98a8-c50bf99bd7c7
[I 2024-05-01 21:56:45,719] Trial 0 finished with value: 0.34554737023254406 and parameters: {'feature_fraction': 0.4370861069626263, 'num_leaves': 244, 'bagging_fraction': 0.839196365086843, 'bagging_freq': 5, 'max_bin': 293, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_data_in_leaf': 44}. Best is trial 0 with value: 0.34554737023254406.
[I 2024-05-01 21:56:48,436] Trial 1 finished with value: 0.3814739677352853 and parameters: {'feature_fraction': 0.6410035105688879, 'num_leaves': 182, 'bagging_fraction': 0.41235069657748147, 'bagging_freq': 7, 'max_bin': 459, 'lambda_l1': 8.148018307012941e-07, 'lambda_l2': 4.329370014459266e-07, 'min_data_in_leaf': 13}. Best is trial 0 with value: 0.34554737023254406.
[I 2024-05-01 21:56:52,272] Trial 2 finished with value: 0.34554737023254406 and parameters: {'feature_fraction': 0.373818018663584, 'num

{'feature_fraction': 0.8055746654550305,
 'num_leaves': 102,
 'bagging_fraction': 0.7876281664579576,
 'bagging_freq': 2,
 'max_bin': 484,
 'lambda_l1': 2.8023944085522116e-05,
 'lambda_l2': 0.03678686662899775,
 'min_data_in_leaf': 5}

In [None]:
from sklearn.metrics import roc_auc_score

#交差検証

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'learning_rate': 0.1,  # default = 0.1
    'random_state': seed,
    'feature_fraction': 0.8055746654550305,
    'num_leaves': 102,
    'bagging_fraction': 0.7876281664579576,
    'bagging_freq': 2,
    'max_bin': 484,
    'lambda_l1': 2.8023944085522116e-05,
    'lambda_l2': 0.03678686662899775,
    'min_data_in_leaf': 5
}

# クラスの比率が同じになるように分割
cv = StratifiedKFold(n_splits=200, random_state=seed, shuffle=True)

# AUCスコアを格納するリスト
AUC_scores = []

# モデルを保存するリスト
models = []

X = train_data.drop('disease', axis=1).iloc[: len(train)]
y = train_data['disease'].iloc[: len(train)]

flag = True

for train_index, val_index in cv.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # LightGBM用データセット
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    num_boost_round = 1000000
    if flag:
        lgb_model_first = lgb.train(params, lgb_train,
                                    num_boost_round=num_boost_round,
                                    valid_sets=[lgb_train, lgb_val],
                                    callbacks=[
                                        lgb.early_stopping(stopping_rounds=100, verbose=False),
                                        lgb.log_evaluation(100)],
                                    )
        flag = False
    else:
        lgb_model_first = lgb.train(params, lgb_train,
                                    num_boost_round=num_boost_round,
                                    valid_sets=[lgb_train, lgb_val],
                                    callbacks=[
                                        lgb.early_stopping(stopping_rounds=100, verbose=False),
                                        lgb.log_evaluation(100)],
                                    init_model=lgb_model_first
                                    )

    models.append(lgb_model_first)
    y_pred_prob = lgb_model_first.predict(X_train)
    y_val_pred_prob = lgb_model_first.predict(X_val)
    # forで2回目以降のLightGBMを繰り返してみる
    # y_pred_prob = []
    # y_val_pred_prob = []

    # 1段目の訓練データ予測とテストデータ予測を格納する
    # y_pred_prob.append(pd.DataFrame(y_pred_prob_first))
    # y_val_pred_prob.append(pd.DataFrame(y_val_pred_prob_first))

    # repeat = 0  #50
    # for _ in range(repeat):
    #     lgb_train = lgb.Dataset(y_pred_prob[-1], y_train)
    #     lgb_model = lgb.train(params, lgb_train,
    #                           # num_boost_round=10000,
    #                           valid_sets=[lgb_train],
    #                           callbacks=[
    #                               lgb.early_stopping(stopping_rounds=100, verbose=False),
    #                               lgb.log_evaluation(100)],
    #                           # feval=f1_macro,
    #                           )
    #     #前のモデルが訓練データから予測した結果
    #     y_pred_prob.append(pd.DataFrame(lgb_model.predict(y_pred_prob[-1])))
    # 
    #     #前のモデルがテストデータから予測した結果
    #     y_val_pred_prob.append(pd.DataFrame(lgb_model.predict(y_val_pred_prob[-1])))
    # 
    #     models.append(lgb_model)

    # # 検証データで予測
    # y_pred = np.where(y_val_pred_prob[-1] < 0.1, 0, 1)
    y_pred = y_val_pred_prob
    # AUC scoreを計算してリストに追加
    AUC_score = roc_auc_score(y_val, y_pred)
    AUC_scores.append(AUC_score)

# スコアを表示
print('Mean AUC scores:', AUC_scores)
print('Average AUC Score:', np.mean(AUC_scores))


[100]	training's binary_logloss: 0.00302862	valid_1's binary_logloss: 0.479111
[100]	training's binary_logloss: 0.0038298	valid_1's binary_logloss: 0.0028667
[200]	training's binary_logloss: 0.000528507	valid_1's binary_logloss: 0.00204648
[300]	training's binary_logloss: 0.000248286	valid_1's binary_logloss: 0.00159812
[300]	training's binary_logloss: 0.000256221	valid_1's binary_logloss: 0.000211103
[400]	training's binary_logloss: 0.000157866	valid_1's binary_logloss: 0.000176524
[500]	training's binary_logloss: 0.000116605	valid_1's binary_logloss: 0.000154016
[600]	training's binary_logloss: 9.25893e-05	valid_1's binary_logloss: 0.000136118
[700]	training's binary_logloss: 7.80435e-05	valid_1's binary_logloss: 0.000135834
[700]	training's binary_logloss: 7.83723e-05	valid_1's binary_logloss: 0.000139792
[700]	training's binary_logloss: 7.87203e-05	valid_1's binary_logloss: 4.15313e-06
[800]	training's binary_logloss: 6.86521e-05	valid_1's binary_logloss: 3.39323e-06
[900]	training

In [369]:
X_test = train_data.drop('disease', axis=1).iloc[len(train):]
lgb_train = lgb.Dataset(X, y)

y_pred_prob = pd.DataFrame(lgb_model_first.predict(X))
y_test_pred_prob = pd.DataFrame(lgb_model_first.predict(X_test))
# # forで2回目以降のLightGBMを繰り返してみる
# y_pred_prob = []
# y_test_pred_prob = []
# 
# # 1段目の訓練データ予測とテストデータ予測を格納する
# y_pred_prob.append(y_pred_prob_first)
# y_test_pred_prob.append(y_test_pred_prob_first)

y_pred = np.where(y_test_pred_prob < 0.5, 0, 1)

print('判定結果')
print('0:', np.count_nonzero(y_pred == 0), '1:', np.count_nonzero(y_pred == 1))

判定結果
0: 215 1: 167


In [370]:
submit = pd.DataFrame(train_data['disease'].iloc[len(train):])
submit['disease'] = y_pred
submit = submit.astype('int')
submit.to_csv('lgb_submit.csv', index=True, header=False)
submit

      disease
id           
891         0
892         1
893         0
894         1
895         0
896         1
897         0
898         0
899         0
900         0
901         1
902         0
903         1
904         1
905         0
906         0
907         1
908         1
909         0
910         0
911         0
912         0
913         0
914         0
915         1
916         0
917         0
918         1
919         0
920         0
921         1
922         0
923         0
924         1
925         0
926         0
927         1
928         1
929         0
930         0
931         0
932         0
933         0
934         0
935         0
936         1
937         1
938         1
939         0
940         0
941         1
942         0
943         1
944         0
945         1
946         0
947         0
948         0
949         1
950         0
951         0
952         0
953         0
954         1
955         1
956         0
957         0
958         1
959         1
960   

Unnamed: 0_level_0,disease
id,Unnamed: 1_level_1
891,0
892,1
893,0
894,1
895,0
896,1
897,0
898,0
899,0
900,0


In [143]:
# submission.csvの提出
# !signate submit --competition-id=1247 submission.csv --note LGB**51 # ここにコメント残せる