### Modeling

In [1]:
import shap
import pandas as pd
from xgboost import XGBRegressor
from sklearn.svm import LinearSVR
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
df = pd.read_csv("./new_data/lstrip_tarin.csv")

In [3]:
X = df.drop(labels=["Income"], axis=1)
y = df["Income"]

In [4]:
X = X.rename(columns={"Working_Week (Yearly)" : "Working_Week_Yearly"})

In [5]:
X[["Working_Week_Yearly", "Age"]] = StandardScaler().fit_transform(X[["Working_Week_Yearly", "Age"]])

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size=0.2)
model = XGBRegressor()  # when use LGBMRegressor : force_row_wise=True
param_grid = {
    "n_estimators" : [2000], 
    "learning_rate" : [0.1, 0.01, 0.001],
    "reg_alpha" : [0.001, 0.01, 0.1, 1, 10, 100],  # L1 규제
    "reg_lambda" : [0.001, 0.01, 0.1, 1, 10, 100],  # L2 규제
    "max_depth" : [3, 4, 5, 6],
    "gamma" : [0.001, 0.01, 0.1, 1, 10, 100],  # use when training with xgbregressor
    "colsample_bytree" : [0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    # "num_leaves" : [50, 60, 70, 80, 90],  # use when training with LBGMRegressor
}

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)  # 편향적 샘플링 방지 목적
grid = RandomizedSearchCV(model, cv=skf, param_distributions=param_grid, n_jobs=-1, random_state=11)
grid.fit(X_train, y_train)



ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 633, in input_data
    self.proxy.set_info(
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 946, in set_info
    self.feature_names = feature_names
  File "c:\Users\admin\Desktop\CudaTest\test_cuda\lib\site-packages\xgboost\core.py", line 1322, in feature_names
    raise ValueError(
ValueError: feature_names must be string, and may not contain [, ] or <


In [31]:
grid.best_params_

{'reg_lambda': 1,
 'reg_alpha': 100,
 'num_leaves': 50,
 'n_estimators': 2000,
 'max_depth': 6,
 'learning_rate': 0.01,
 'colsample_bytree': 0.2}

In [32]:
y_pred = grid.best_estimator_.predict(X_test)
y_true = y_test

In [33]:
mean_squared_error(y_true, y_pred)**0.5   # 1st training = 488.23452  2nd training = 487.97  3rd training = 487.69  4th 487.16 5th 486.37

496.25551923140574

In [102]:
# import numpy as np
# from xgboost import XGBRegressor
# from sklearn.ensemble import VotingRegressor

# xgb_model = XGBRegressor()
# lgbm_model = LGBMRegressor(verbosity=0)

# voting_regressor = VotingRegressor([('xgb', xgb_model), ('lgbm', lgbm_model)], n_jobs=-1)
# param_distributions = {
#     "xgb__n_estimators" : [2000], 
#     "xgb__learning_rate" : [0.1, 0.01, 0.001],
#     "xgb__reg_alpha" : [0.001, 0.01, 0.1, 1, 10, 100],  # L1 규제
#     "xgb__reg_lambda" : [0.001, 0.01, 0.1, 1, 10, 100],  # L2 규제
#     "xgb__max_depth" : [6, 7, 8, 9],
#     "xgb__gamma" : [0.001, 0.01, 0.1, 1, 10, 100],  # use when training with xgbregressor
#     "xgb__colsample_bytree" : [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     "lgbm__n_estimators" : [2000], 
#     "lgbm__learning_rate" : [0.1, 0.01, 0.001],
#     "lgbm__reg_alpha" : [0.001, 0.01, 0.1, 1, 10, 100],  # L1 규제
#     "lgbm__reg_lambda" : [0.001, 0.01, 0.1, 1, 10, 100],  # L2 규제
#     "lgbm__max_depth" : [6, 7, 8, 9],
#     "lgbm__colsample_bytree" : [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     "lgbm__num_leaves" : [50, 60, 70, 80, 90],  # use when training with LBGMRegressor

# }

# vote = RandomizedSearchCV(voting_regressor, cv=skf, param_distributions=param_distributions, n_jobs=-1, n_iter=50, random_state=11)
# vote.fit(X_train, y_train)



In [103]:
# vote.best_params_

{'xgb__reg_lambda': 0.01,
 'xgb__reg_alpha': 100,
 'xgb__n_estimators': 2000,
 'xgb__max_depth': 9,
 'xgb__learning_rate': 0.001,
 'xgb__gamma': 0.001,
 'xgb__colsample_bytree': 0.5,
 'lgbm__reg_lambda': 0.001,
 'lgbm__reg_alpha': 10,
 'lgbm__num_leaves': 90,
 'lgbm__n_estimators': 2000,
 'lgbm__max_depth': 7,
 'lgbm__learning_rate': 0.01,
 'lgbm__colsample_bytree': 0.5}

In [104]:
# y_pred_vote = vote.best_estimator_.predict(X_test)
# y_true_vote = y_test

In [105]:
# mean_squared_error(y_true_vote, y_pred_vote)**0.5   # 1st training = 488.23452  2nd training = 487.97  3rd training = 487.69

489.1639735071626

In [100]:
# test = pd.read_csv("./new_data/test_without_loss_gain.csv")  # 데이터 정제 단계에서 분할했던 Test셋
# test = test.rename(columns={"Working_Week (Yearly)" : "Working_Week_Yearly"})
# submit = pd.read_csv("./data/sample_submission (2).csv")  # 제출파일
# test[["Age", "Working_Week_Yearly"]] = StandardScaler().fit_transform(test[["Age", "Working_Week_Yearly"]])
# submit["Income"] = grid.best_estimator_.predict(test)  # 예측 후 제출파일의 Income 컬럼에 추가
# submit.to_csv("../486_37.csv", index=False, sep=",", encoding="utf8")  # 저장