In [1]:
import numpy as np
import random
import os
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed=42
seed_everything(seed) # Seed 고정

#### 데이터 불러오기 및 전처리

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train = train.drop('ID',axis=1)
test = test.drop('ID',axis=1)

In [4]:
# 1. 데이터 준비
data_with_battery = train.dropna(subset=['배터리용량']).copy()  # 명시적으로 복사
data_missing_battery = train[train['배터리용량'].isnull()].copy()  # 명시적으로 복사
features = ['제조사', '모델', '차량상태', '가격(백만원)', '연식(년)', '보증기간(년)']
categorical_features = ['제조사', '모델', '차량상태']

# 범주형 변수를 문자열로 변환
for col in categorical_features:
    data_with_battery[col] = data_with_battery[col].astype(str)
    data_missing_battery[col] = data_missing_battery[col].astype(str)

X = data_with_battery[features]
y = data_with_battery['배터리용량']

# 2. K-Fold Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Pool 객체 생성
    train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    test_pool = Pool(X_test, y_test, cat_features=categorical_features)
    
    # 모델 학습
    model = CatBoostRegressor(random_state=42, verbose=0)
    model.fit(train_pool)
    
    # 예측 및 RMSE 계산
    y_pred = model.predict(test_pool)
    rmse = root_mean_squared_error(y_test, y_pred)  # 최신 방식
    rmse_scores.append(rmse)

# 3. 결과 출력
mean_rmse = np.mean(rmse_scores)
print(f"Cross-Validation RMSE Scores: {rmse_scores}")
print(f"Mean RMSE: {mean_rmse:.2f}")

Cross-Validation RMSE Scores: [1.936261154582396, 1.9594501199208065, 1.761134873467221, 1.8639205143160407, 2.1763778290068974]
Mean RMSE: 1.94


In [5]:
# 4. 결측치 대체
# 최종 모델 학습: 전체 데이터 사용
final_train_pool = Pool(X, y, cat_features=categorical_features)

final_model = CatBoostRegressor(random_state=42, verbose=0)
final_model.fit(final_train_pool)

# 결측치 데이터의 예측을 위한 Pool 생성
X_missing = data_missing_battery[features]
missing_pool = Pool(X_missing, cat_features=categorical_features)

# 결측치 예측 및 대체
data_missing_battery['배터리용량'] = final_model.predict(missing_pool)

# 5. 결측치 대체 후 데이터 합치기
train_imp = pd.concat([data_with_battery, data_missing_battery])

In [6]:
# 가격 대비 배터리 효율성 (Price per kWh)
train_imp['Price_per_kWh'] = train_imp['가격(백만원)'] / train_imp['배터리용량']

# 가격-배터리 비율 (Normalized Price by Capacity)
price_per_kWh_mean = train_imp['Price_per_kWh'].mean()
train_imp['Normalized_Price_by_Capacity'] = train_imp['Price_per_kWh'] / price_per_kWh_mean
# train 데이터의 평균 가격 사용
mean_price = train_imp['가격(백만원)'].mean()
test['가격(백만원)'] = mean_price

# Price_per_kWh 및 Normalized_Price_by_Capacity 계산
test['Price_per_kWh'] = test['가격(백만원)'] / test['배터리용량']

price_per_kWh_mean = train_imp['Price_per_kWh'].mean()  # train 데이터의 평균 사용
test['Normalized_Price_by_Capacity'] = test['Price_per_kWh'] / price_per_kWh_mean


In [7]:
categorical_features = [
    '제조사'
    ,'모델'
    ,'차량상태'
    ,'구동방식'
    ,'사고이력'
]
for i in categorical_features:
    train_imp[i] = train_imp[i].astype('category')
    test[i] = test[i].astype('category')
    
target= train_imp['가격(백만원)']
train_imp = train_imp.drop('가격(백만원)', axis=1)


In [8]:
train_X, valid_X, train_y, valid_y = train_test_split(train_imp, target, test_size=0.2, random_state=seed)

#### 모델 학습

##### CatBoost



In [9]:
train_pool = Pool(data=train_X, label=train_y, cat_features=categorical_features)
clf = CatBoostRegressor(random_state=seed, verbose=False)
clf.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x2a5753efd90>

#### 모델 평가

In [None]:
def RMSE(y, pred):
  return np.sqrt(root_mean_squared_error(y, pred))

In [None]:
valid_pool = Pool(data=valid_X, cat_features=categorical_features)
pred = clf.predict(valid_pool)

In [None]:
RMSE(valid_y, pred)

0.5784985102623522

#### 예측값 출력

In [None]:
test_pool = Pool(data=test, cat_features=categorical_features)
pred = clf.predict(test_pool)

In [None]:
submit = pd.read_csv('sample_submission.csv')
submit['가격(백만원)'] = pred

In [None]:
import datetime

# 현재 날짜와 시간 가져오기
now = datetime.datetime.now()
formatted_time = now.strftime("%Y%m%d_%H%M")  # 예: 20231208_153045

# 파일 경로에 날짜와 시간 추가
file_path = f"submission_{formatted_time}.csv"

# DataFrame 저장
submit.to_csv(file_path, index=False)