In [1]:
import tensorflow as tf
from tensorflow import keras
import torch

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/weather_bigdata_contest/code

/content/drive/MyDrive/weather_bigdata_contest/code


# 1. 전체 데이터 준비

In [5]:
wea_sale_ohe = pd.read_csv('../data/weather_sale_ohe.csv')
wea_sns_ohe = pd.read_csv('../data/weather_sns_ohe.csv')

In [28]:
wea_sale_ohe.columns[-10:-1]

Index(['20대 남성 판매량(개)', '30대 여성 판매량(개)', '30대 남성 판매량(개)', '40대 여성 판매량(개)',
       '40대 남성 판매량(개)', '50대 여성 판매량(개)', '50대 남성 판매량(개)', '60대 여성 판매량(개)',
       '60대 남성 판매량(개)'],
      dtype='object')

In [6]:
def sale_dataset_total(ohe_df):
    # 상품 목록을 정의합니다. 
    goods = ohe_df.loc[:,'소분류_가열식 가습기':'소분류_히터'].columns.tolist()
    # 상품 및 날짜로 데이터를 정렬합니다.
    ohe_df.sort_values(goods+['날짜'], inplace=True)
    # 정렬로 뒤죽박죽이 된 행 인덱스를 초기화합니다. 
    ohe_df.index = range(0, len(ohe_df))

    # 모델의 성능 향상을 위해 MinMaxScale을 진행합니다.
    sc = MinMaxScaler(feature_range=(0, 1))
    scaled_ohe = sc.fit_transform(ohe_df.loc[:,'평균기온(°C)':'미세먼지(PM10)'].values)

    # 반드시 판매량 데이터를 마지막 열에 두어야 합니다.
    # scale된 데이터를 병합해서 total data를 생성합니다.
    # 상품 소분류 원핫인코딩 열은 MinMaxScale을 진행하지 않았습니다. (0과 1로 이루어졌으므로)
    total = np.hstack([ohe_df.loc[:,'소분류_가열식 가습기':'소분류_히터'].values,
                    scaled_ohe,
                    ohe_df.loc[:,'20대 여성 판매량(개)':'60대 남성 판매량(개)'].values])

    day = 20

    # X, y 생성
    
    df = total[total[:, 0]==1] 
    X = df[day-1:, :-10]
    y = df[day-1:, -10:]

    # good_index는 상품의 인덱스를 의미합니다.
    # 총 '126'개의 상품이 있습니다. 

    for good_index in range(1, 126):
      # 해당 인덱스의 상품에 대한 판매 데이터만 추출합니다.
      df = total[total[:, good_index]==1] 
      X = np.vstack([X, df[day-1:, :-10]])
      y = np.vstack([y, df[day-1:, -10:]])

    return X, y

In [7]:
def sns_dataset_total(ohe_df):
    # 상품 목록을 정의합니다. 
    goods = ohe_df.loc[:,'소분류_가열식 가습기':'소분류_히터'].columns.tolist()

    # 상품 및 날짜로 데이터를 정렬합니다.
    ohe_df.sort_values(goods+['날짜'], inplace=True)

    # 정렬로 뒤죽박죽이 된 행 인덱스를 초기화합니다. 
    ohe_df.index = range(0, len(ohe_df))

    # 모델의 성능 향상을 위해 MinMaxScale을 진행합니다.
    sc = MinMaxScaler(feature_range=(0, 1))
    scaled_ohe = sc.fit_transform(ohe_df.loc[:,'평균기온(°C)':'미세먼지(PM10)'].values)

    # 반드시 판매량 데이터를 마지막 열에 두어야 합니다.
    # scale된 데이터를 병합해서 total data를 생성합니다.
    # 상품 소분류 원핫인코딩 열은 MinMaxScale을 진행하지 않았습니다. (0과 1로 이루어졌으므로)
    total = np.hstack([ohe_df.loc[:,'소분류_가열식 가습기':'소분류_히터'].values,
                    scaled_ohe,
                    ohe_df.loc[:,'SNS언급량'].values.reshape(-1, 1)])

    # 전체 상품 목록 리스트
    goods = ohe_df.loc[:,'소분류_가열식 가습기':'소분류_히터'].columns.tolist()

    day = 20

    df = total[total[:, 0]==1] 
    X = df[day-1:, :-1]
    y = df[day-1:, -1].reshape(-1,1)

    # good_index는 상품의 인덱스를 의미합니다.
    # 총 '126'개의 상품이 있습니다. 

    for good_index in range(1, 126):
      # 해당 인덱스의 상품에 대한 판매 데이터만 추출합니다.
      df = total[total[:, good_index]==1] 
      X = np.vstack([X, df[day-1:, :-1]])
      y = np.vstack([y, df[day-1:, -1].reshape(-1,1)])

    return X, y

#2. 예측 모델 훈련


In [8]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 4.9 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [9]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor

import tensorflow as tf
from tensorflow import keras

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.externals import joblib



##1) 연령, 성별 카테고리별 총 판매량 예측 Final Model : Ensemble Model(LGBM, CatBoost, DNN, LSTM) 

In [None]:
X, y = sale_dataset_total(wea_sale_ohe)

### (1) LGBMRegressor

In [None]:
lgbm_sale = LGBMRegressor(n_estimators = 300)
ensemble_lgbm_sale = MultiOutputRegressor(lgbm_sale)

ensemble_lgbm_sale.fit(X, y)

joblib.dump(ensemble_lgbm_sale, '../data/ensemble_lgbm_sale.pkl')

['../data/ensemble_lgbm_sale.pkl']

In [19]:
ensemble_lgbm_sale = joblib.load('../data/ensemble_lgbm_sale.pkl')

###(2) CatBoostRegressor

In [None]:
cb_sale = CatBoostRegressor(n_estimators = 300)
ensemble_cb_sale = MultiOutputRegressor(cb_sale)

ensemble_cb_sale.fit(X, y)


joblib.dump(ensemble_cb_sale, '../data/ensemble_cb_sale.pkl')

Learning rate set to 0.216024
0:	learn: 47.4395781	total: 59ms	remaining: 17.6s
1:	learn: 41.8789501	total: 68.4ms	remaining: 10.2s
2:	learn: 37.8651885	total: 77.9ms	remaining: 7.71s
3:	learn: 35.0579402	total: 87.4ms	remaining: 6.46s
4:	learn: 33.1133011	total: 96.8ms	remaining: 5.71s
5:	learn: 31.6574419	total: 106ms	remaining: 5.21s
6:	learn: 30.5766299	total: 116ms	remaining: 4.85s
7:	learn: 29.6994870	total: 126ms	remaining: 4.61s
8:	learn: 28.9283952	total: 137ms	remaining: 4.43s
9:	learn: 28.3436859	total: 147ms	remaining: 4.25s
10:	learn: 27.8172997	total: 156ms	remaining: 4.11s
11:	learn: 27.3707053	total: 166ms	remaining: 3.98s
12:	learn: 26.9517619	total: 176ms	remaining: 3.88s
13:	learn: 26.5755015	total: 186ms	remaining: 3.79s
14:	learn: 26.2085189	total: 199ms	remaining: 3.77s
15:	learn: 25.8811533	total: 210ms	remaining: 3.72s
16:	learn: 25.5301907	total: 220ms	remaining: 3.66s
17:	learn: 25.2532699	total: 230ms	remaining: 3.6s
18:	learn: 25.0057837	total: 241ms	remaini

['../data/ensemble_cb_sale.pkl']

In [21]:
ensemble_cb_sale = joblib.load('../data/ensemble_cb_sale.pkl')

###(3) DNN

In [None]:
with tf.device('/device:GPU:0'):
    ensemble_dnn_sale = keras.Sequential([
                        keras.layers.Dense(64, input_dim = 134),
                        keras.layers.BatchNormalization(),
                        keras.layers.ReLU(),
                        keras.layers.Dense(64),
                        keras.layers.BatchNormalization(),
                        keras.layers.ReLU(),
                        keras.layers.Dense(10)
])
    adam = keras.optimizers.Adam(learning_rate = 0.01)
    ensemble_dnn_sale.compile(loss = 'mean_squared_error', optimizer = adam)
    ensemble_dnn_sale.fit(X, y, epochs = 300, batch_size = 1000)

    ensemble_dnn_sale.save('../data/ensemble_dnn_sale.h5')

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [22]:
ensemble_dnn_sale = keras.models.load_model('../data/ensemble_dnn_sale.h5')

###(4) LSTM

In [18]:
ensemble_lstm_sale = keras.models.load_model('../data/lstm_sale.h5')

### Ensemble Model: LGBMRegressor + CatBoostRegressor + DNN + LSTM

In [13]:
def ensemble_sale(base_input, lstm_input):
    pred = []
    pred.append(ensemble_lgbm_sale.predict(base_input))
    pred.append(ensemble_cb_sale.predict(base_input))
    pred.append(ensemble_dnn_sale.predict(base_input))
    pred.append(ensemble_lstm_sale.predict(lstm_input))
    return np.array(pred).mean(axis = 0)

##2) SNS 언급량 예측 Final Model : LSTM

In [23]:
lstm_sns = keras.models.load_model('../data/lstm_sns.h5')

#3. 2021-07-29 날씨 예보(2021-07-28 17시 기준)에 따른 판매량 및 SNS 언급량 예측

In [10]:
weather_input = pd.read_csv('../data/final_weather_input.csv')

In [11]:
weather_input

Unnamed: 0,tm,avgTa,temp_diff,sumRn,avgWs,avgRhm,avgTca,O3,PM10
0,2021-07-09,25.035173,5.770388,2.963365,1.758153,81.953459,7.053429,0.029647,15.430012
1,2021-07-10,25.675155,6.256063,2.532003,2.38835,81.726342,6.932619,0.042718,25.255703
2,2021-07-11,25.88385,5.511758,0.534969,2.314651,84.890819,7.065721,0.041383,25.969001
3,2021-07-12,27.69833,8.133836,0.554214,1.888517,77.807261,6.499966,0.035778,28.591155
4,2021-07-13,28.436104,6.149457,0.0,2.064738,75.8115,6.371003,0.024073,24.613414
5,2021-07-14,29.076756,7.092512,0.0,2.32849,73.145023,6.261285,0.018449,17.400724
6,2021-07-15,28.459964,7.817832,0.369476,2.00042,73.993647,4.867408,0.021857,17.635676
7,2021-07-16,27.888442,9.351953,1.908116,2.149735,73.172179,4.555646,0.034547,20.231486
8,2021-07-17,27.601036,7.408757,1.444012,2.238423,69.565068,5.58088,0.028798,14.176159
9,2021-07-18,28.14021,7.750407,2.07645,2.016302,68.409492,6.066361,0.031989,11.385914


In [12]:
def cat_ohe(category):
    small_cat_lst=['가열식 가습기','감/홍시','감귤/한라봉/오렌지','감말랭이',
                  '감자','건포도','견과류 밤','계란','고추/피망/파프리카','공기정화 용품',
                  '공기청정기','과일류','과채 음료/주스','굴 생물','기능성 아이케어 화장품',
                  '기초 화장용 로션','기초 화장용 미스트','기초 화장용 오일/앰플',
                  '기초 화장용 크림','기타 주스류','김치류','남성 로션','남성 메이크업',
                  '남성 선케어','남성 세트','남성 에센스','냉풍기','네일 메이크업 용품',
                  '다이어트보조식','다이어트용 헬스보충식품','대게/킹크랩','더치커피',
                  '데오드란트','돈풍기','둥굴레차','딸기/복분자/블루베리','딸기우유',
                  '라디에이터','레몬/자몽','멀티형 에어컨','메이크업 브러쉬',
                  '무김치','물김치','미숫가루/곡물가루','바나나/파인애플/망고','바디 보습제',
                  '바디 세트','바디 클렌져','바디케어용 땀패드','바디케어용 때비누',
                  '바디케어용 제모제','배/포도/과일즙','배추김치','베이스 메이크업용 쿠션팩트',
                  '베이스 메이크업용 파우더팩트','벽걸이 에어컨','벽걸이형 선풍기',
                  '복합식 가습기','뷰티 타투','비타민/화이바 음료','색조 메이크업 립밤',
                  '색조 메이크업 립스틱','색조 메이크업 마스카라','색조 메이크업 아이섀도우',
                  '생수','선로션','선스프레이','선케어용 선밤','선크림','스킨케어 코팩',
                  '스탠드형 냉온풍기','스탠드형 에어컨','시금치','아이스티','애프터선',
                  '어린이 음료','업소용 선풍기','에어워셔','에어컨 리모컨','에이드',
                  '오이/가지','옥수수','온수매트','온열매트','온풍기','유자차','유제품 음료',
                  '율무차','음용 식초','이동형 에어컨','이온음료','인스턴트커피',
                  '자연식 가습기','장어','전기온수기','전기장판','전통차','절임배추/김치속',
                  '제습기','조개','차/곡물 음료','참외/메론/수박','초음파식 가습기','카페 푸드',
                  '카페트매트','컨벡터','코코아/핫초코','키위/참다래','탁상/USB 선풍기','탄산수',
                  '탄산음료','태닝용 선크림','토마토','파/양파','풋크림','프로폴리스/로얄젤리',
                  '해초류','핸드크림','허브차','헤어에센스','호박','홍차','황토매트', '회',
                  '휴대용 선풍기','히터']
    ohe = OneHotEncoder()
    ohe.fit(np.reshape(small_cat_lst, (-1, 1)))

    cat_ohe = ohe.transform([[category]]).toarray().astype(int)
    return cat_ohe

In [None]:
# cat_ohe('탄산음료')

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
base_input = np.hstack((cat_ohe('탄산음료'), weather_input[weather_input['tm'] == '2021-07-29'].iloc[:, 1:].values))
lstm_input = np.hstack(([cat_ohe('탄산음료')[0] for i in range(20)], weather_input.iloc[1:, 1:].values)).reshape(1, 20, -1)

In [24]:
def predict_ss(base_input, lstm_input):
    sale_pred = ensemble_sale(base_input, lstm_input)
    sns_pred = lstm_sns(lstm_input).numpy()
    return sale_pred, sns_pred

In [26]:
sale_pred, sns_pred = predict_ss(base_input, lstm_input)

In [32]:
pd.DataFrame(data = sale_pred, columns = wea_sale_ohe.columns[-11:-1], index = ['탄산음료'])

Unnamed: 0,20대 여성 판매량(개),20대 남성 판매량(개),30대 여성 판매량(개),30대 남성 판매량(개),40대 여성 판매량(개),40대 남성 판매량(개),50대 여성 판매량(개),50대 남성 판매량(개),60대 여성 판매량(개),60대 남성 판매량(개)
탄산음료,3342.951199,1778.154984,6172.744303,3305.626503,4056.308783,2236.04309,1655.358437,881.773759,384.585808,249.978117


In [34]:
pd.DataFrame(data = sns_pred, columns = [wea_sns_ohe.columns[-1]], index = ['탄산음료'])

Unnamed: 0,SNS언급량
탄산음료,341.766113
