In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx + 1
    series = series.map(my_dict)

    return series

In [3]:
import pandas as pd
import numpy as np
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_all = pd.concat([df_train, df_test], axis = 0)
df_all.shape, df_train.shape, df_test.shape

((30000, 23), (20000, 23), (10000, 22))

In [4]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      30000 non-null  object 
 1   Age                     30000 non-null  int64  
 2   Gender                  30000 non-null  object 
 3   Education_Status        30000 non-null  object 
 4   Employment_Status       30000 non-null  object 
 5   Working_Week (Yearly)   30000 non-null  int64  
 6   Industry_Status         30000 non-null  object 
 7   Occupation_Status       30000 non-null  object 
 8   Race                    30000 non-null  object 
 9   Hispanic_Origin         30000 non-null  object 
 10  Martial_Status          30000 non-null  object 
 11  Household_Status        29999 non-null  object 
 12  Household_Summary       30000 non-null  object 
 13  Citizenship             30000 non-null  object 
 14  Birth_Country           30000 non-null  obje

In [5]:
df_all.describe(include = 'int64')

Unnamed: 0,Age,Working_Week (Yearly),Gains,Losses,Dividends
count,30000.0,30000.0,30000.0,30000.0,30000.0
mean,35.736667,35.022633,394.979467,39.278,129.0012
std,17.979206,22.232172,4264.589678,275.037897,1327.460917
min,0.0,0.0,0.0,0.0,0.0
25%,23.0,7.0,0.0,0.0,0.0
50%,35.0,52.0,0.0,0.0,0.0
75%,47.0,52.0,0.0,0.0,0.0
max,90.0,52.0,99999.0,4356.0,99999.0


In [6]:
# 이상치 99999 제거한 후 그 다음 최댓값으로 대체
df_all['Gains'].replace(99999, np.nan, inplace = True)
df_all['Dividends'].replace(99999, np.nan, inplace = True)

df_all['Gains'].fillna(df_all['Gains'].max(), inplace = True)
df_all['Dividends'].fillna(df_all['Dividends'].max(), inplace = True)

In [7]:
df_all.describe(include = ['int64', 'float64'])

Unnamed: 0,Age,Working_Week (Yearly),Gains,Losses,Dividends,Income
count,30000.0,30000.0,30000.0,30000.0,30000.0,20000.0
mean,35.736667,35.022633,289.533067,39.278,127.501233,554.56525
std,17.979206,22.232172,2029.459535,275.037897,1236.946305,701.553155
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,7.0,0.0,0.0,0.0,0.0
50%,35.0,52.0,0.0,0.0,0.0,500.0
75%,47.0,52.0,0.0,0.0,0.0,875.0
max,90.0,52.0,34095.0,4356.0,55000.0,9999.0


In [8]:
# 레이블 인코딩할 칼럼들
label_columns = df_all.select_dtypes(include = 'object').columns.drop('ID')

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

df_all.drop(columns='ID', inplace = True)
df_all.columns = df_all.columns.str.replace(" ", '_')

df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]
df_test.drop(columns = 'Income', axis = 1, inplace = True)

In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(df_train.drop(columns = 'Income', axis = 1), df_train['Income'], test_size= 0.3, random_state = 42)

In [67]:
help(LGBMRegressor)

Help on class LGBMRegressor in module lightgbm.sklearn:

class LGBMRegressor(sklearn.base.RegressorMixin, LGBMModel)
 |  LGBMRegressor(boosting_type: str = 'gbdt', num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, objective: Union[str, Callable[[Optional[numpy.ndarray], numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], Callable[[Optional[numpy.ndarray], numpy.ndarray, Optional[numpy.ndarray]], Tuple[numpy.ndarray, numpy.ndarray]], Callable[[Optional[numpy.ndarray], numpy.ndarray, Optional[numpy.ndarray], Optional[numpy.ndarray]], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = None, class_weight: Union[Dict, str, NoneType] = None, min_split_gain: float = 0.0, min_child_weight: float = 0.001, min_child_samples: int = 20, subsample: float = 1.0, subsample_freq: int = 0, colsample_bytree: float = 1.0, reg_alpha: float = 0.0, reg_lambda: float = 0.0, random_state: Union[int, numpy.random.mtrand.RandomState

In [56]:
lgbm = LGBMRegressor(random_state = 42, verbose = 0)

param_grid = {'n_estimators' : np.arange(100, 501, 50), 'learning_rate' : [0.01, 0.02, 0.03, 0.04],
              'subsample' : [0.4, 0.5, 0.6, 0.7], 'colsample_bytree' : [0.4, 0.5, 0.6, 0.7],
              'reg_lambda': [0, 0.01, 0.05], 'reg_alpha' : [0, 0.01, 0.05, 0.1]}

grid_lgbm = GridSearchCV(lgbm, param_grid, verbose = True, scoring='neg_root_mean_squared_error')
grid_lgbm.fit(X_train, y_train)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


In [59]:
print(grid_lgbm.best_score_)
grid_lgbm.best_params_

-597.6421240623902


{'colsample_bytree': 0.4,
 'learning_rate': 0.02,
 'n_estimators': 250,
 'subsample': 0.4}

In [94]:
lgbm = LGBMRegressor(random_state = 42, verbose = 0)

param_grid = {'n_estimators' : np.arange(100, 501, 50), 'learning_rate' : [0.01, 0.02, 0.03, 0.04],
              'subsample' : [0.3, 0.4, 0.5, 0.6, 0.7], 'colsample_bytree' : [0.3, 0.4, 0.5, 0.6, 0.7],
              'reg_alpha' : [0, 0.01, 0.05, 0.1], 'reg_lambda' : [0, 0.01, 0.05, 0.1]}

grid_lgbm = GridSearchCV(lgbm, param_grid, verbose = True, scoring='neg_root_mean_squared_error')
grid_lgbm.fit(X_train, y_train)

Fitting 5 folds for each of 14400 candidates, totalling 72000 fits


KeyboardInterrupt: 

In [89]:
print(grid_lgbm.best_score_)
grid_lgbm.best_params_

-597.6421240623902


{'colsample_bytree': 0.4,
 'learning_rate': 0.02,
 'n_estimators': 250,
 'subsample': 0.3}

In [90]:
y_pred = grid_lgbm.predict(X_test)
y_pred[y_pred < 0] = 0
print('rmse :', np.sqrt(mean_squared_error(y_pred, y_test)))

rmse : 577.8888341995449


In [91]:
y_pred = grid_lgbm.predict(df_test)
y_pred[y_pred < 0] = 0
y_pred

array([ 18.58434001,  29.47081499, 437.34434645, ..., 388.90826199,
         0.        , 708.48116415])

In [92]:
submission = pd.read_csv('sample_submission.csv')
submission['Income'] = y_pred
submission.to_csv('./baseline_submission.csv', index=False)

## pycaret

In [24]:
pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.0-py3-none-any.whl (485 kB)
                                              0.0/485.9 kB ? eta -:--:--
     ------------------------------------- 485.9/485.9 kB 15.3 MB/s eta 0:00:00
Collecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
                                              0.0/10.6 MB ? eta -:--:--
     -----                                    1.6/10.6 MB 48.3 MB/s eta 0:00:01
     -----------                              3.1/10.6 MB 48.6 MB/s eta 0:00:01
     -----------------                        4.6/10.6 MB 41.4 MB/s eta 0:00:01
     ----------------------                   6.1/10.6 MB 42.8 MB/s eta 0:00:01
     ----------------------------             7.5/10.6 MB 43.8 MB/s eta 0:00:01
     ---------------------------------        9.0/10.6 MB 41.1 MB/s eta 0:00:01
     --------------------------------------  10.5/10.6 MB 40.9 MB/s eta 0:00:01
     -----------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.


In [None]:
from pycaret.regression import *

reg_setup = setup(data=df_train, target='Income')
compare_models(sort = 'MSE')

In [None]:
# extra tree 를 기본모델로 사용 - 추후 앙상블에 여러 모델 사용하자

In [28]:
lgbm_model = create_model('lightgbm')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,315.4206,287124.8909,535.8404,0.3003,3.0968,0.3154
1,346.4105,447949.4454,669.2903,0.2328,3.0558,0.3268
2,324.4778,261500.9345,511.3716,0.2995,3.0881,0.3624
3,340.7171,340960.9598,583.9186,0.3056,3.0509,0.3084
4,327.6572,292624.6058,540.9479,0.3281,3.0711,0.3147
5,328.3274,296769.9516,544.766,0.317,3.1098,0.3132
6,349.7388,453406.4727,673.3546,0.2347,3.0863,0.2956
7,325.6925,302392.1898,549.902,0.3031,3.2025,0.2969
8,337.9864,434888.7023,659.4609,0.2461,3.0778,0.285
9,330.8076,368566.4486,607.0967,0.2473,3.1592,0.2896


In [30]:
# 모델 튜닝
tuned_lgbm = tune_model(lgbm_model)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [32]:
tuned_lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.15,
 'max_depth': -1,
 'min_child_samples': 91,
 'min_child_weight': 0.001,
 'min_split_gain': 0.9,
 'n_estimators': 300,
 'n_jobs': -1,
 'num_leaves': 4,
 'objective': None,
 'random_state': 2045,
 'reg_alpha': 0.2,
 'reg_lambda': 0.01,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'feature_fraction': 0.8,
 'bagging_freq': 3,
 'bagging_fraction': 1.0}

In [None]:
tuned_lgbm