# import / 라이브러리 호출

In [11]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
import catboost
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

# Fixed RandomSeed / 랜덤시드 고정

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# Data Load / 데이터 불러오기

In [3]:
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test.csv')

In [4]:
train.head()

Unnamed: 0,ID,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량
0,TRAIN_0000,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28
1,TRAIN_0001,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27
2,TRAIN_0002,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769
3,TRAIN_0003,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27
4,TRAIN_0004,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337


# Feature & Target Split / 독립변수, 종속변수로 나누기

In [5]:
train_x = train.drop(['ID', '수요량'], axis = 1)
train_y = train['수요량']

test_x = test.drop('ID', axis = 1)

In [6]:
train_x.head()

Unnamed: 0,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형
0,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트
1,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트
2,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트
3,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트
4,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트


# Data Preprocessing / 데이터 전처리

In [7]:
ordinal_features = ['쇼핑몰 구분', '도시 유형', '지역 유형', '쇼핑몰 유형', '선물 유형']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])

    # train데이터에서 존재하지 않았던 값이 test 데이터에 존재할 수도 있습니다.
    # 따라서 test 데이터를 바로 변형시키지 않고 고유값을 확인후 test 데이터를 변환합니다.
    # Data Leakage를 발생시키지 않기 위함이니, 반드시 주의해주세요.
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

In [8]:
train_x.head()

Unnamed: 0,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형
0,1,6,212000,0,20,0,1,9
1,2,69,113000,0,13,0,0,15
2,0,6,67000,0,20,0,1,22
3,1,4,206000,0,3,2,0,32
4,1,61,140000,0,7,1,1,30


In [9]:
# train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, shuffle=True, random_state=42)

# Regression Model Definition / 회귀 모델 정의

In [12]:
regressor = CatBoostRegressor(random_state = 42)

# Model fitting / 모델 학습

In [13]:
regressor.fit(x_train, y_train)

Learning rate set to 0.051403
0:	learn: 331.8369570	total: 66ms	remaining: 1m 5s
1:	learn: 327.0731275	total: 83.5ms	remaining: 41.6s
2:	learn: 322.5743568	total: 99.4ms	remaining: 33s
3:	learn: 318.0303126	total: 118ms	remaining: 29.4s
4:	learn: 313.8256451	total: 132ms	remaining: 26.3s
5:	learn: 309.9477522	total: 148ms	remaining: 24.5s
6:	learn: 306.6682639	total: 168ms	remaining: 23.8s
7:	learn: 303.3600030	total: 185ms	remaining: 22.9s
8:	learn: 300.0352358	total: 200ms	remaining: 22.1s
9:	learn: 296.9345518	total: 216ms	remaining: 21.4s
10:	learn: 294.0872622	total: 230ms	remaining: 20.7s
11:	learn: 291.5018836	total: 245ms	remaining: 20.2s
12:	learn: 289.1790030	total: 261ms	remaining: 19.8s
13:	learn: 286.8974790	total: 277ms	remaining: 19.5s
14:	learn: 284.5183597	total: 297ms	remaining: 19.5s
15:	learn: 282.6497752	total: 314ms	remaining: 19.3s
16:	learn: 280.4249738	total: 330ms	remaining: 19.1s
17:	learn: 278.4796604	total: 347ms	remaining: 18.9s
18:	learn: 276.8976784	tota

<catboost.core.CatBoostRegressor at 0x7f34dac9cb80>

# Inference / 추론

In [14]:
preds = regressor.predict(x_valid)

In [15]:
#score
mean_squared_error(y_valid, preds)**0.5

175.07298404676018

# submit / 제출

In [10]:
submission = pd.read_csv('sample_submission.csv')

In [11]:
submission['수요량'] = preds
submission.head()

Unnamed: 0,ID,수요량
0,TEST_0000,55.0
1,TEST_0001,26.0
2,TEST_0002,378.0
3,TEST_0003,109.0
4,TEST_0004,271.0


In [12]:
submission.to_csv('./base_cat.csv', index = False)