In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings(action='ignore')

# Read the six CSV inputs from the working directory in one pass; the
# order of the paths matches the order of the names being assigned.
sales_train, shops, items, item_categories, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './sales_train.csv',
        './shops.csv',
        './items.csv',
        './item_categories.csv',
        './test.csv',
        './sample_submission.csv',
    )
)

### 피처 엔지니어링
1) 피처명 한글화

In [2]:
# Give the sales table Korean column labels (e.g. date_block_num becomes
# '월id', the month id used as a key throughout the notebook).
sales_train = sales_train.rename(columns=dict(
    date='날짜',
    date_block_num='월id',
    shop_id='상점id',
    item_id='상품id',
    item_price='판매가',
    item_cnt_day='판매량',
))
sales_train.head()

Unnamed: 0,날짜,월id,상점id,상품id,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [3]:
# Relabel the shop lookup table; 'shop_id' gets the same Korean label that
# the sales table uses so the two can be joined on it later.
shops = shops.rename({'shop_name': '상점명', 'shop_id': '상점id'}, axis='columns')
shops.head()

Unnamed: 0,상점명,상점id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [4]:
# Relabel the item lookup table: item name, item id and item category id.
items = items.rename(columns=dict(
    item_name='상품명',
    item_id='상품id',
    item_category_id='상품분류id',
))
items.head()

Unnamed: 0,상품명,상품id,상품분류id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [5]:
# Relabel the category lookup table; the id column gets the same label as
# the category id column of the item table.
item_categories = item_categories.rename(
    {'item_category_name': '상품분류명', 'item_category_id': '상품분류id'},
    axis='columns',
)
item_categories.head()

Unnamed: 0,상품분류명,상품분류id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [6]:
# Relabel the test table's shop/item id columns to match the training
# tables; the 'ID' column keeps its original name.
test = test.rename({'shop_id': '상점id', 'item_id': '상품id'}, axis='columns')
test.head()

Unnamed: 0,ID,상점id,상품id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


2) 다운캐스팅(downcasting) : 더 작은 데이터 타입으로 변환

In [7]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and the same object is returned.

    * ``bool`` columns are converted to ``int8``.
    * Integer columns, and float columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast='integer')``.
    * All other numeric columns go through
      ``pd.to_numeric(..., downcast='float')``.
    * Non-numeric columns (strings, datetimes, categoricals, ...) are left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # Checking only for the name 'object' is not enough: datetime,
            # categorical and dedicated string dtypes would otherwise reach
            # round()/to_numeric and either fail or be coerced to integers.
            continue
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            # A column containing NaN never satisfies the whole-number test
            # (NaN != NaN), so it is handled by the float branch instead.
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('{:.1f}% 압축됨'.format(100*(start_mem - end_mem)/start_mem))

    return df

In [8]:
# downcast() modifies the frame it is given, so each table is compressed
# simply by passing it through; the return value is the same object and
# can be ignored here.
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    downcast(df)

62.5% 압축됨
38.6% 압축됨
54.2% 압축됨
39.9% 압축됨
70.8% 압축됨


3) 데이터 조합 생성

In [9]:
# Look at the first rows of the test table again: its rows are shop id /
# item id pairs, the same keys the training grid below is built from.
test.head(n=5)

Unnamed: 0,ID,상점id,상품id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [10]:
from itertools import product

idx_features = ['월id', '상점id', '상품id']  # key columns: month id, shop id, item id

# For each month, pair every shop that appears in that month's sales with
# every item that appears in that month's sales.
monthly_grids = []
for month in sales_train['월id'].unique():
    month_sales = sales_train[sales_train['월id'] == month]
    month_shops = month_sales['상점id'].unique()
    month_items = month_sales['상품id'].unique()
    month_grid = np.array(list(product([month], month_shops, month_items)))
    monthly_grids.append(month_grid)

# Stack the per-month arrays into a single frame keyed by idx_features.
train = pd.DataFrame(np.vstack(monthly_grids), columns=idx_features)

train

Unnamed: 0,월id,상점id,상품id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10913845,33,21,7635
10913846,33,21,7638
10913847,33,21,7640
10913848,33,21,7632


4) 타깃값 추가

In [11]:
# Sum the '판매량' (sales count) column per (month, shop, item) key and expose
# the result as the '월간 판매량' (monthly sales) column.
group = (
    sales_train
    .groupby(idx_features)
    .agg({'판매량': 'sum'})
    .reset_index()
    .rename(columns={'판매량': '월간 판매량'})
)

group

Unnamed: 0,월id,상점id,상품id,월간 판매량
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2
...,...,...,...,...
1609119,33,59,22087,6
1609120,33,59,22088,2
1609121,33,59,22091,1
1609122,33,59,22100,1


In [12]:
# Left-join the monthly totals onto the full grid; grid rows without a
# matching total keep a missing value in '월간 판매량'.
train = pd.merge(train, group, how='left', on=idx_features)
train

Unnamed: 0,월id,상점id,상품id,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,
...,...,...,...,...
10913845,33,21,7635,
10913846,33,21,7638,
10913847,33,21,7640,
10913848,33,21,7632,


In [13]:
# `group` has already been merged into `train`; drop the name and run a
# garbage-collection pass so its memory can be reclaimed.
import gc
del group
gc.collect();

5) 테스트 데이터 이어붙이기

In [14]:
# Tag every test row with month id 34, the month right after the last
# month id (33) present in the training grid. This adds the column to
# `test` in place.
test['월id'] = 34

In [15]:
# Stack the test rows (without their 'ID' column) under the training grid.
# Columns are aligned by name; the test rows have no '월간 판매량' value yet, so
# that column is missing for them after the concat.
# `keys=` was dropped: it labels the concatenated objects (one key per
# object), is not a list of column names, and has no effect together with
# ignore_index=True.
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True)

In [16]:
# Replace every missing value with 0: grid rows that had no matching
# monthly total, and the appended test rows, end up with a target of 0.
all_data = all_data.fillna(value=0)

all_data

Unnamed: 0,월id,상점id,상품id,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0


6) 나머지 데이터 병합

In [17]:
# Attach the shop, item and item-category lookup tables with successive
# left joins. The category join relies on '상품분류id', which arrives with the
# item table, so the order of the joins matters.
all_data = (
    all_data
    .merge(shops, on='상점id', how='left')
    .merge(items, on='상품id', how='left')
    .merge(item_categories, on='상품분류id', how='left')
)

# Compress the merged frame again now that new columns have been added.
all_data = downcast(all_data)

# The lookup tables are no longer referenced after the merge.
del shops, items, item_categories
gc.collect();

26.4% 압축됨


In [18]:
# Remove the three free-text name columns; only the id columns are kept.
all_data = all_data.drop(columns=['상점명', '상품명', '상품분류명'])

7) 마무리
- train : 2013년 1월부터 2015년 9월(월id=32)까지 판매 내역
- valid : 2015년 10월(월id=33) 판매 내역
- test : 2015년 11월(월id=34) 판매 내역

In [22]:
# Row masks for the three time windows, keyed on the month id column:
# months before 33 for training, month 33 for validation, month 34 for test.
train_rows = all_data['월id'] < 33
valid_rows = all_data['월id'] == 33
test_rows = all_data['월id'] == 34

# Features: every column except the monthly-sales target.
X_train = all_data.loc[train_rows].drop(columns=['월간 판매량'])
X_valid = all_data.loc[valid_rows].drop(columns=['월간 판매량'])
X_test = all_data.loc[test_rows].drop(columns=['월간 판매량'])

# Targets, limited to the range 0-20.
y_train = all_data.loc[train_rows, '월간 판매량'].clip(lower=0, upper=20)
y_valid = all_data.loc[valid_rows, '월간 판매량'].clip(lower=0, upper=20)

# The masks are only needed to build the splits.
del train_rows, valid_rows, test_rows

In [23]:
# The feature/target splits have been extracted, so the combined frame is
# no longer needed; drop it and run a garbage-collection pass.
del all_data
gc.collect();

### LightGBM
- 상품id처럼 고윳값 개수가 매우 많은 범주형 데이터는 수치형 데이터로 취급해야 성능이 더 잘 나온다. 고윳값이 너무 많으면 수치형 데이터와 다를 바 없으므로, 상품id는 범주형 피처로 지정하지 않는다.

In [25]:
import lightgbm as lgb

# Training parameters: RMSE as the evaluation metric, 225 leaves per tree,
# a learning rate of 0.01, column-wise histogram building and a fixed seed.
params = {'metric':'rmse',
         'num_leaves':225,
         'learning_rate':0.01,
         'force_col_wise':True,
         'random_state':10}

# Columns LightGBM should treat as categorical (shop id, item category id).
# The item id column is deliberately left out and therefore used as a
# plain numeric feature.
cat_features = ['상점id', '상품분류id']

# Declare the categorical columns on the Dataset itself rather than on
# train(), and tie the validation set to the training set so both share
# the same feature binning and categorical handling.
dtrain = lgb.Dataset(X_train, y_train, categorical_feature=cat_features)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

# The `verbose_eval` argument of train() no longer exists in LightGBM 4.x;
# the log_evaluation callback prints the metric every 50 rounds instead.
lgb_model = lgb.train(params=params, train_set=dtrain,
                      num_boost_round=500, valid_sets=[dtrain, dvalid],
                      callbacks=[lgb.log_evaluation(period=50)])

[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 4
[LightGBM] [Info] Start training from score 0.299125
[50]	training's rmse: 1.1493	valid_1's rmse: 1.06838
[100]	training's rmse: 1.11599	valid_1's rmse: 1.04002
[150]	training's rmse: 1.09891	valid_1's rmse: 1.02832
[200]	training's rmse: 1.08816	valid_1's rmse: 1.02172
[250]	training's rmse: 1.07993	valid_1's rmse: 1.01722
[300]	training's rmse: 1.07233	valid_1's rmse: 1.01389
[350]	training's rmse: 1.06627	valid_1's rmse: 1.01142
[400]	training's rmse: 1.06044	valid_1's rmse: 1.00946
[450]	training's rmse: 1.05544	valid_1's rmse: 1.00812
[500]	training's rmse: 1.05133	valid_1's rmse: 1.00683


In [26]:
# Predict the test month and limit the predictions to the same 0-20 range
# that was applied to the training target.
preds = np.clip(lgb_model.predict(X_test), 0, 20)

# Predictions are written into the sample-submission frame by position and
# saved without the index column.
submission['item_cnt_month'] = preds
submission.to_csv('./submission.csv', index=False)