In [7]:
import numpy as np
import random
import os
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed=42
seed_everything(seed) # Seed 고정

#### 데이터 불러오기 및 전처리

In [8]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
train = train.drop('ID',axis=1)
test = test.drop('ID',axis=1)

In [None]:
# 결측값을 각 열의 평균으로 대체
train['배터리용량'].fillna(train['배터리용량'].mean(), inplace=True)
test['배터리용량'].fillna(test['배터리용량'].mean(), inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   제조사       7497 non-null   object 
 1   모델        7497 non-null   object 
 2   차량상태      7497 non-null   object 
 3   배터리용량     4786 non-null   float64
 4   구동방식      7497 non-null   object 
 5   주행거리(km)  7497 non-null   int64  
 6   보증기간(년)   7497 non-null   int64  
 7   사고이력      7497 non-null   object 
 8   연식(년)     7497 non-null   int64  
 9   가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(5)
memory usage: 585.8+ KB


In [10]:
categorical_features = [
    '제조사'
    ,'모델'
    ,'차량상태'
    ,'구동방식'
    ,'사고이력'
]
for i in categorical_features:
    train[i] = train[i].astype('category')
    test[i] = test[i].astype('category')
    
target= train['가격(백만원)']
train = train.drop('가격(백만원)', axis=1)


In [11]:
train_X, valid_X, train_y, valid_y = train_test_split(train, target, test_size=0.2, random_state=seed)

#### 모델 학습

##### CatBoost



In [12]:
train_pool = Pool(data=train_X, label=train_y, cat_features=categorical_features)
clf = CatBoostRegressor(random_state=seed, verbose=False)
clf.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x26103a81f50>

#### 모델 평가

In [13]:
def RMSE(y, pred):
  return np.sqrt(root_mean_squared_error(y, pred))

In [14]:
valid_pool = Pool(data=valid_X, cat_features=categorical_features)
pred = clf.predict(valid_pool)

In [15]:
RMSE(valid_y, pred)

1.2192647743351863

#### 예측값 출력

In [16]:
test_pool = Pool(data=test, cat_features=categorical_features)
pred = clf.predict(test_pool)

In [17]:
submit = pd.read_csv('sample_submission.csv')
submit['가격(백만원)'] = pred

In [18]:
import datetime

# 현재 날짜와 시간 가져오기
now = datetime.datetime.now()
formatted_time = now.strftime("%Y%m%d_%H%M")  # 예: 20231208_153045

# 파일 경로에 날짜와 시간 추가
file_path = f"submission_{formatted_time}.csv"

# DataFrame 저장
submit.to_csv(file_path, index=False)