In [138]:
import numpy as np
import random
import os
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed=42
seed_everything(seed) # Seed 고정

#### 데이터 불러오기 및 전처리

In [139]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [140]:
train.head()

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.0,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.2,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02


In [141]:
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])

In [142]:


# Calculate the correlation matrix
correlation_matrix = train.corr()

# Extract the correlation with the target variable
correlation_with_target = correlation_matrix["가격(백만원)"].sort_values(ascending=False)
print(correlation_with_target)

가격(백만원)     1.000000
배터리용량       0.431668
주행거리(km)   -0.035488
연식(년)      -0.058455
보증기간(년)    -0.349962
Name: 가격(백만원), dtype: float64


In [143]:
# # 결측값을 각 열의 평균으로 대체
# train['배터리용량'].fillna(train['배터리용량'].mean(), inplace=True)
# test['배터리용량'].fillna(test['배터리용량'].mean(), inplace=True)

In [144]:
categorical_features = [
    '제조사'
    ,'모델'
    ,'차량상태'
    ,'구동방식'
    ,'사고이력'
]
for i in categorical_features:
    train[i] = train[i].astype('category')
    test[i] = test[i].astype('category')
    
target= train['가격(백만원)']
train = train.drop('가격(백만원)', axis=1)


In [145]:
train_X, valid_X, train_y, valid_y = train_test_split(train, target, test_size=0.2, random_state=seed)

##### lgbm



In [146]:
# LightGBM 모델 정의
lgb_model = lgb.LGBMRegressor(force_row_wise=True)

# 랜덤 서치 하이퍼파라미터 공간 정의
param_dist = {
    'num_leaves': np.arange(5, 31, 5).tolist(),          # 리프 수 감소
    'learning_rate': np.arange(0.01, 0.5, 0.01).tolist(),# 학습 속도 줄임
    'n_estimators': np.arange(50, 301, 50).tolist(),     # 더 많은 부스팅 라운드
    'max_depth': [2, 4, 8, 16],                          # 깊이 제한 완화
    'min_data_in_leaf': np.arange(1, 11, 2).tolist(),    # 리프의 최소 데이터 수 감소
    'min_split_gain': np.arange(0.0, 0.05, 0.01).tolist(), # 최소 분할 이득
    'bagging_fraction': np.arange(0.7, 1.0, 0.1).tolist(), # 데이터 샘플링
    'feature_fraction': np.arange(0.7, 1.0, 0.1).tolist() # 피처 샘플링
}

# 랜덤 서치 객체 생성
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=100,  # 랜덤 샘플링 횟수
    scoring='neg_root_mean_squared_error',  # 평가 지표
    cv=5,  # 교차 검증 폴드 수
    verbose=1,
    random_state=42,
    n_jobs=-1  # 병렬 처리
)

# 랜덤 서치 실행
random_search.fit(train_X, train_y, categorical_feature=categorical_features)

# 최적의 하이퍼파라미터 출력
print("최적 하이퍼파라미터:", random_search.best_params_)

# 최적 모델로 테스트 데이터 예측
best_model = random_search.best_estimator_
y_pred = best_model.predict(valid_X)

# RMSE 계산
rmse = mean_squared_error(valid_y, y_pred)
print(f"테스트 데이터 RMSE: {rmse:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 9
[LightGBM] [Info] Start training from score 62.221487
최적 하이퍼파라미터: {'num_leaves': 10, 'n_estimators': 150, 'min_split_gain': 0.02, 'min_data_in_leaf': 9, 'max_depth': 4, 'learning_rate': 0.25, 'feature_fraction': 0.9999999999999999, 'bagging_fraction': 0.7999999999999999}
테스트 데이터 RMSE: 2.1231


#### 예측값 출력

In [147]:
pred = best_model.predict(test)



In [148]:
submit = pd.read_csv('sample_submission.csv')
submit['가격(백만원)'] = pred

In [149]:
import datetime
now = datetime.datetime.now()
formatted_time = now.strftime("%Y%m%d_%H%M")  
file_path = f"submission_{formatted_time}.csv"
submit.to_csv(file_path, index=False)