In [58]:
import warnings

import numpy as np
import pandas as pd

# Silence all library warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from pandas/LightGBM.
warnings.filterwarnings(action='ignore')

# Directory holding the competition CSV files
data_path = './input/'

# Raw competition tables
sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path + 'shops.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

In [59]:
# Replace the English column names of the sales table with Korean labels
sales_train = sales_train.rename(columns=dict(
    date='날짜',
    date_block_num='월ID',
    shop_id='상점ID',
    item_id='상품ID',
    item_price='판매가',
    item_cnt_day='판매량',
))
sales_train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [60]:
# Replace the English column names of the shop table with Korean labels
shops = shops.rename(columns=dict(shop_name='상점명', shop_id='상점ID'))
shops.head()

Unnamed: 0,상점명,상점ID
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [61]:
# Replace the English column names of the item table with Korean labels
items = items.rename(columns=dict(
    item_name='상품명',
    item_id='상품ID',
    item_category_id='상품분류ID',
))
items.head()

Unnamed: 0,상품명,상품ID,상품분류ID
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [62]:
# Replace the English column names of the category table with Korean labels
item_categories = item_categories.rename(columns=dict(
    item_category_name='상품분류명',
    item_category_id='상품분류ID',
))
item_categories.head()

Unnamed: 0,상품분류명,상품분류ID
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [63]:
# Use the same Korean key-column names in the test table as in the sales table
test = test.rename(columns=dict(shop_id='상점ID', item_id='상품ID'))
test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [64]:
# Bytes used by each column before downcasting, followed by the row count
print(sales_train.memory_usage())
print(len(sales_train['월ID'].index))

Index         128
날짜       23486792
월ID      23486792
상점ID     23486792
상품ID     23486792
판매가      23486792
판매량      23486792
dtype: int64
2935849


In [65]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place (columns are reassigned on ``df`` itself)
    and the same object is returned for convenience.

    Per-column rules:
      * ``object`` columns are left untouched.
      * ``bool`` columns become ``int8``.
      * Other non-numeric columns (datetime, category, string, ...) are left
        untouched.
      * Integer columns, and float columns that contain only whole numbers,
        are downcast to the smallest signed integer type.
      * All remaining numeric columns are downcast to the smallest float type.

    Args:
        df: DataFrame to compress.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Memory footprint before conversion, in megabytes
    start_mem = df.memory_usage().sum() / (1024**2)
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            continue
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # round()/to_numeric are not meaningful for datetime, category or
            # string columns; leave them alone instead of raising or
            # coercing them to integers.
            continue
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            # Fractional values, or whole numbers mixed with NaN (NaN never
            # compares equal to itself, so the check above fails)
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / (1024**2)
    if verbose:
        print('{:.1f}% 압축됨'.format(100*(start_mem - end_mem)/ start_mem))

    return df

In [66]:
# downcast() reassigns columns on the frame it receives, so every table is
# compressed in place and the return value does not need to be kept.
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    downcast(df)

62.5% 압축됨
38.6% 압축됨
54.2% 압축됨
39.9% 압축됨
70.8% 압축됨


**데이터 조합 생성**

In [70]:
import numpy as np
from itertools import product 

# Key columns that identify one row: month, shop, item
idx_features = ['월ID', '상점ID', '상품ID']

train = []

# For every month, pair each shop seen that month with each item seen that month
for i in sales_train['월ID'].unique():
    month_rows = sales_train[sales_train['월ID'] == i]
    all_shop = month_rows['상점ID'].unique()
    all_item = month_rows['상품ID'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# Stack the per-month arrays and turn the result into a DataFrame
train = pd.DataFrame(np.vstack(train), columns = idx_features)

train


Unnamed: 0,월ID,상점ID,상품ID
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10913845,33,21,7635
10913846,33,21,7638
10913847,33,21,7640
10913848,33,21,7632


In [71]:
# Sum the daily sales counts per (month, shop, item), move the group keys
# back into ordinary columns, and label the total as the monthly sales count.
group = (
    sales_train
    .groupby(idx_features)
    .agg({'판매량': 'sum'})
    .reset_index()
    .rename(columns={'판매량': '월간 판매량'})
)

group

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2
...,...,...,...,...
1609119,33,59,22087,6
1609120,33,59,22088,2
1609121,33,59,22091,1
1609122,33,59,22100,1


In [72]:
# Left join keeps every grid row; combinations without sales get NaN
train = pd.merge(train, group, how='left', on=idx_features)

In [73]:
# Preview the first rows of the merged training grid
train.head()

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,


In [74]:
import gc

del group # drop the name of a variable that is no longer used
gc.collect()

0

피처 엔지니어링으로 테스트 데이터 이어붙이기

In [75]:
# Tag every test row with month ID 34
test['월ID'] = 34

In [76]:
# Append the test rows beneath the training rows. pd.concat aligns the two
# frames on column names by itself. The former `keys=idx_features` argument was
# not a join key: `keys` supplies one label per concatenated object, here it
# had three labels for two frames, and ignore_index=True discarded it anyway.
all_data = pd.concat([train, test.drop('ID', axis=1)],
                     ignore_index=True)  # ignore the old indexes, renumber from 0

In [77]:
# Replace every missing value in the combined frame with 0
all_data = all_data.fillna(0)

all_data

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0


In [78]:
# Attach the shop, item and item-category tables (left joins keep every row)
all_data = pd.merge(all_data, shops, how='left', on='상점ID')
all_data = pd.merge(all_data, items, how='left', on='상품ID')
all_data = pd.merge(all_data, item_categories, how='left', on='상품분류ID')

# Downcast the merged frame
all_data = downcast(all_data)

# Drop the lookup tables and run garbage collection
del shops, items, item_categories
gc.collect()

26.4% 압축됨


28

In [79]:
# Remove the shop, item and category name columns
all_data = all_data.drop(columns=['상점명', '상품명', '상품분류명'])

피처엔지니어링 마무리

● 훈련 데이터 : 2013년 1월부터 2015년 9월(월ID=32)까지 판매내역  
● 검증 데이터 : 2015년 10월(월ID=33) 판매내역(분석 정리2)  
● 테스트 데이터: 2015년 11월(월ID=34) 판매내역  

In [80]:
# Training features: month IDs below 33
X_train = all_data.loc[all_data['월ID'] < 33].drop(columns='월간 판매량')

# Validation features: month ID 33
X_valid = all_data.loc[all_data['월ID'] == 33].drop(columns='월간 판매량')

# Test features: month ID 34
X_test = all_data.loc[all_data['월ID'] == 34].drop(columns='월간 판매량')

# Training target, limited to the range [0, 20]
y_train = all_data.loc[all_data['월ID'] < 33, '월간 판매량'].clip(lower=0, upper=20)

# Validation target, limited to the range [0, 20]
y_valid = all_data.loc[all_data['월ID'] == 33, '월간 판매량'].clip(lower=0, upper=20)

In [81]:
# The combined frame has been split into X_*/y_*; drop its name and collect garbage
del all_data
gc.collect()

21

In [82]:
import lightgbm as lgb

# Hyperparameters for LightGBM
params = {
    'metric': 'rmse', # evaluation metric (was misspelled 'rsme')
    'num_leaves': 255,
    'learning_rate': 0.01,
    'force_col_wise': True,
    'random_state': 10
}

# Features to be treated as categorical
cat_features = ['상점ID', '상품분류ID']

# Training and validation datasets for LightGBM
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

In [83]:
# Train the LightGBM model.
# Evaluation results are logged every 50 rounds through a callback instead of
# the `verbose_eval` argument, which newer LightGBM releases no longer accept.
# `log_evaluation` is the current callback name; `print_evaluation` is the
# older name, used only when the newer one is missing.
log_every_50 = (getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation)(50)

lgb_model = lgb.train(params=params,
                      train_set=dtrain,
                      num_boost_round=500,
                      valid_sets=[dtrain, dvalid],
                      categorical_feature=cat_features,
                      callbacks=[log_every_50])

[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 4
[LightGBM] [Info] Start training from score 0.299125


In [84]:
import os

# Predict the test month and limit predictions to [0, 20], the same range
# applied to the training and validation targets
preds = lgb_model.predict(X_test).clip(0, 20)

# Build the submission file; create the output directory first so that
# to_csv does not fail when it is missing
submission['item_cnt_month'] = preds
os.makedirs('./output', exist_ok=True)
submission.to_csv('./output/submission.csv',index=False)

In [85]:
# Drop the names of the training artifacts and collect garbage
del X_train, y_train, X_valid, y_valid, X_test, lgb_model, dtrain, dvalid
gc.collect()

121

In [86]:
# Submit the generated CSV to the competition through the Kaggle command-line tool
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f ./output/submission.csv -m submit_in_local

Successfully submitted to Predict Future Sales



  0%|          | 0.00/5.54M [00:00<?, ?B/s]
  0%|          | 8.00k/5.54M [00:00<01:39, 58.5kB/s]
  3%|▎         | 184k/5.54M [00:00<00:06, 898kB/s]  
 17%|█▋        | 944k/5.54M [00:00<00:01, 3.48MB/s]
 35%|███▍      | 1.92M/5.54M [00:00<00:00, 5.80MB/s]
 45%|████▌     | 2.50M/5.54M [00:01<00:02, 1.18MB/s]
 83%|████████▎ | 4.58M/5.54M [00:01<00:00, 2.91MB/s]
 98%|█████████▊| 5.44M/5.54M [00:02<00:00, 3.56MB/s]
100%|██████████| 5.54M/5.54M [00:04<00:00, 1.24MB/s]
