In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
from scipy import stats
import math

- 외부데이터 추가한 train

In [54]:
# Load the training sheet and keep only rows where 판매단가 < 취급액.
# .copy() makes the filtered frame independent, so later column assignments
# do not raise SettingWithCopyWarning.
train = pd.read_excel('./data/train.xlsx')
train = train[train['판매단가'] < train['취급액']].copy()

In [55]:
# Renumber rows 0..n-1 after filtering. The length is taken from the frame
# itself rather than hard-coded, so this keeps working if the data changes.
train = train.reset_index(drop=True)

In [56]:
# Treat a zero exposure time as missing, then forward-fill it from the
# previous row.
train['노출(분)'] = (
    train['노출(분)']
    .replace(0, math.nan)
    .fillna(method='ffill')
)

- test

In [11]:
# Load the test sheet; header=1 takes the column names from the second row.
# NOTE(review): this path has no ./data/ prefix, unlike the train file — confirm location.
test = pd.read_excel('./test.xlsx', header=1)

## FE

- 판매단가 log 변환

In [78]:
# Replace 판매단가 with its natural log: pop() removes the raw column and the
# logged values are appended as new판매단가.
train['new판매단가'] = np.log(train.pop('판매단가'))

- 주문량 log 변환

In [82]:
# Replace 주문량 with its natural log: pop() removes the raw column and the
# logged values are appended as new주문량.
train['new주문량'] = np.log(train.pop('주문량'))

- 시간대

In [83]:
# Per-hour mean of the numeric columns. numeric_only=True keeps text and
# datetime columns out of the aggregation, which would otherwise raise on
# current pandas.
time = train.groupby('HOUR').mean(numeric_only=True)

# Rank hours by ascending mean new주문량: rank 1 is the hour with the lowest mean.
time_rank = {
    hour: position
    for position, hour in enumerate(time.sort_values(by='new주문량').index, start=1)
}

In [84]:
# Look up each row's hour rank with a vectorised map instead of a Python-level
# iterrows loop. An hour missing from time_rank becomes NaN rather than
# raising KeyError.
prime_time = train['HOUR'].map(time_rank).tolist()

train['prime_time'] = prime_time

- 요일

In [85]:
# Per-weekday mean of the numeric columns (non-numeric columns are excluded so
# the aggregation does not raise on current pandas).
day = train.groupby('요일').mean(numeric_only=True)

# Rank weekdays by ascending mean new주문량: rank 1 is the weekday with the lowest mean.
day_rank = {
    weekday: position
    for position, weekday in enumerate(day.sort_values(by='new주문량').index, start=1)
}

# Vectorised lookup; a weekday missing from day_rank becomes NaN rather than
# raising KeyError.
prime_day = train['요일'].map(day_rank).tolist()

train['prime_day'] = prime_day

- test랑 겹치는 마더코드 기준 : 노출시간 대비보다 좋음

In [86]:
# Mother codes present in both test and train, and the train rows that carry them.
intercode = set(test['마더코드'].unique()) & set(train['마더코드'].unique())
interitem = train.loc[train['마더코드'].isin(intercode)]

In [87]:
# Per-mother-code mean of the numeric columns, restricted to codes shared with
# test (non-numeric columns are excluded so the aggregation does not raise on
# current pandas).
code = interitem.groupby('마더코드').mean(numeric_only=True)

# Rank shared mother codes by ascending mean new주문량 (1 = lowest mean).
code_rank = {
    mother_code: position
    for position, mother_code in enumerate(code.sort_values(by='new주문량').index, start=1)
}

# Vectorised lookup; mother codes that were not ranked (absent from test) get 0.
top_code = train['마더코드'].map(code_rank).fillna(0).astype(int).tolist()

train['top_code'] = top_code

- 분류 기준 top_cat

In [88]:
# Per-분류 mean of the numeric columns (non-numeric columns are excluded so the
# aggregation does not raise on current pandas).
cat = train.groupby('분류').mean(numeric_only=True)

# Rank each 분류 value by ascending mean new주문량 (1 = lowest mean).
cat_rank = {
    label: position
    for position, label in enumerate(cat.sort_values(by='new주문량').index, start=1)
}

# Vectorised lookup; a value missing from cat_rank becomes NaN rather than
# raising KeyError.
top_cat = train['분류'].map(cat_rank).tolist()

train['top_cat'] = top_cat

- 분류 변수 원핫/pca

In [89]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [90]:
# One-hot encoder for 분류; the encoder expects a 2-D input, so the column is
# reshaped to (n_rows, 1).
enc1 = OneHotEncoder()
category = np.array(train['분류']).reshape(-1, 1)

In [91]:
# Fit the encoder and produce the dense one-hot matrix in one step.
category_ = enc1.fit_transform(category).toarray()

In [92]:
# Project the one-hot matrix onto two principal components.
# random_state makes the result repeatable if a randomized SVD solver is used.
pca = PCA(n_components=2, random_state=1234)
# Reuse train's index so the later column assignment aligns row-for-row even
# if train is not indexed 0..n-1.
res = pd.DataFrame(
    pca.fit_transform(category_),
    columns=['x1_cat', 'x2_cat'],
    index=train.index,
)

In [93]:
# Copy both PCA components into train (the assignment aligns on index).
for pca_col in ('x1_cat', 'x2_cat'):
    train[pca_col] = res[pca_col]

- 실제 날씨(서울)

In [94]:
# Per-weather-label mean of the numeric columns (non-numeric columns are
# excluded so the aggregation does not raise on current pandas).
real = train.groupby('실제_서울_날씨').mean(numeric_only=True)

# Rank each 실제_서울_날씨 value by ascending mean new주문량 (1 = lowest mean).
real_rank = {
    weather: position
    for position, weather in enumerate(real.sort_values(by='new주문량').index, start=1)
}

# Vectorised lookup; a value missing from real_rank becomes NaN rather than
# raising KeyError.
top_real_weather = train['실제_서울_날씨'].map(real_rank).tolist()

train['top_real_weather'] = top_real_weather

- 실제 날씨 통합

In [95]:
# Load the merged weather table whose columns are copied into train in the next cell.
real_total_weather = pd.read_excel('./data/날씨통합.xlsx')

In [96]:
# Copy the weather columns into train under a 실제_ prefix.
# The assignment aligns on index, so both frames must share the same row labels.
for weather_col in ('최고기온', '최저기온', '강수량', '평균풍속'):
    train['실제_' + weather_col] = real_total_weather[weather_col]

- train 확인

In [103]:
# Preview the first rows of train with the engineered columns added above.
train.head()

Unnamed: 0.1,Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,취급액,날짜,시간,...,prime_day,top_code,top_cat,x1_cat,x2_cat,top_real_weather,실제_최고기온,실제_최저기온,실제_강수량,실제_평균풍속
0,17373,2019-06-15 00:00:00,20.0,100305,200981,오모떼 미라클쉐이핑 브라팬티 시즌3,inner,16517000,2019-06-15,00:00:00,...,1,0,45,-0.006718,0.009977,1,27.266667,16.766667,0.8,2.2
1,17374,2019-06-15 00:20:00,20.0,100305,200981,오모떼 미라클쉐이핑 브라팬티 시즌3,inner,44829000,2019-06-15,00:20:00,...,1,0,45,-0.006718,0.009977,1,27.266667,16.766667,0.8,2.2
2,17375,2019-06-15 00:40:00,20.0,100305,200981,오모떼 미라클쉐이핑 브라팬티 시즌3,inner,56057000,2019-06-15,00:40:00,...,1,0,45,-0.006718,0.009977,1,27.266667,16.766667,0.8,2.2
3,17376,2019-06-15 01:00:00,30.0,100374,201202,USPA 남성 폴로셔츠 위켄드 컬렉션 3종,cloth,9996000,2019-06-15,01:00:00,...,1,0,32,-0.07689,0.13579,1,27.266667,16.766667,0.8,2.2
4,17377,2019-06-15 01:00:00,30.0,100374,201206,USPA 여성 폴로셔츠 위켄드 컬렉션 3종,cloth,9409000,2019-06-15,01:00:00,...,1,0,32,-0.07689,0.13579,1,27.266667,16.766667,0.8,2.2


### 변수 통합

1) 빈도수 인코딩

In [104]:
# Frequency encoding (to compare against the one-hot + PCA features above)
def add_frequency_encoding(data, column):
    """Add a frequency-encoded copy of ``column`` to ``data`` in place.

    The new column is named ``freq_encode_<column>`` and holds, for every row,
    the share of rows in ``data`` that have the same value in ``column``
    (group size divided by ``len(data)``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    None
        A confirmation message is printed.

    Notes
    -----
    Rows whose value is missing get NaN, because groupby skips missing keys.
    The lookup uses ``Series.map``, so it is vectorised and always matches by
    label, never by position.
    """
    enc_nom = data.groupby(column).size() / len(data)
    data['freq_encode_{}'.format(column)] = data[column].map(enc_nom)
    print("freq_encode column was added")

In [105]:
# Adds the column freq_encode_분류 to train in place.
add_frequency_encoding(train,'분류')

freq_encode column was added


2) 모든 카테고리에 대해 변수 생성

In [228]:
# Create one feature frame (x_<group>) and one target series (y_<group>) per
# product group (상품군), published as notebook-level globals.
category_list = train['상품군'].unique().tolist()
for i in category_list:
    in_group = train['상품군'] == i
    # Features: rows of this product group WITHOUT the target column. The
    # original kept new주문량 inside x_<group>, which leaks the target.
    # NOTE(review): 취급액 is still present; if it is derived from the order
    # quantity it leaks the target as well — confirm and drop it if so.
    globals()['x_{}'.format(i)] = train.loc[in_group].drop(columns=['new주문량'])
    # Target: log order quantity for the same rows.
    globals()['y_{}'.format(i)] = train.loc[in_group, 'new주문량']

## split

1) 카테고리별로 x_train_카테고리 , x_test_카테고리, y_train_카테고리, y_test_카테고리

In [230]:
from sklearn.model_selection import train_test_split
# Split every product group 67/33 with a fixed seed and publish the four parts
# as x_train_<group>, x_test_<group>, y_train_<group>, y_test_<group>.
for i in category_list:
    x_for_split = globals()['x_{}'.format(i)]
    y_for_split = globals()['y_{}'.format(i)]
    split_parts = train_test_split(x_for_split, y_for_split, test_size=0.33, random_state=1234)
    for part_name, part in zip(('x_train', 'x_test', 'y_train', 'y_test'), split_parts):
        globals()['{}_{}'.format(part_name, i)] = part
  

2) 카테고리별 20분 단위 3갈 각각 피팅

In [171]:
# Distinct 상품군 values, kept as the array returned by unique().
catlist = train['상품군'].unique()

In [151]:
# Is the gap between consecutive broadcast rows of a group (timestamp minus
# timestamp) within the threshold, e.g. the group's maximum 노출(분)?
# This function checks that and splits the rows accordingly.

def split_shoprow(df, time):
    """Split broadcast rows by whether the next row follows closely in time.

    A row goes to the first frame when the next row's 방송일시 is at most
    ``time`` later; otherwise it goes to the second frame. The last row has no
    successor and therefore always goes to the second frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with a datetime column ``방송일시``. Rows are compared in their
        existing order — assumes they are already sorted by 방송일시 (TODO confirm).
    time : pandas.Timedelta
        Largest gap to the next row that still counts as "followed closely".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(shoprow1, shoprow2)``: the train part and the validation part. Both
        keep the original index, column order and dtypes.

    Notes
    -----
    Frames with zero or one row are handled (the original loop raised on them),
    and the split is a single boolean mask instead of row-by-row
    ``DataFrame.append``, which was quadratic and no longer exists in pandas 2.
    """
    if len(df) == 0:
        return df.copy(), df.copy()

    # Gap from each row to the following row; the last row gets NaT, which
    # compares as False below.
    gap_to_next = df['방송일시'].shift(-1) - df['방송일시']
    followed_closely = (gap_to_next <= time).values

    shoprow1 = df[followed_closely].copy()   # train
    shoprow2 = df[~followed_closely].copy()  # validation

    return shoprow1,shoprow2

- 실험

In [170]:
# Threshold: broadcasts must be at most this far apart to count as the same
# product's sale.
criteria_time = pd.Timedelta(minutes=20)
# Out of three consecutive slots, two go to the big fold and one to the small fold.
# Trial run on mother code 100080.
x1 = train.loc[train['마더코드'] == 100080]
train_big_fold, train_small_fold = split_shoprow(x1, criteria_time)
# The output looks correct.
print(train_big_fold['방송일시'].head(15))
print(train_small_fold['방송일시'].head(10))

175   2019-06-16 15:00:00
176   2019-06-16 15:20:00
306   2019-06-17 22:00:00
307   2019-06-17 22:20:00
339   2019-06-18 12:00:00
340   2019-06-18 12:20:00
466   2019-06-19 22:00:00
467   2019-06-19 22:20:00
567   2019-06-20 23:00:00
568   2019-06-20 23:20:00
730   2019-06-22 23:20:00
731   2019-06-22 23:40:00
761   2019-06-23 11:00:00
762   2019-06-23 11:20:00
915   2019-06-24 22:00:00
Name: 방송일시, dtype: datetime64[ns]
177    2019-06-16 15:40:00
308    2019-06-17 22:40:00
341    2019-06-18 12:40:00
468    2019-06-19 22:40:00
569    2019-06-20 23:40:00
732    2019-06-23 00:00:00
763    2019-06-23 11:40:00
917    2019-06-24 22:40:00
950    2019-06-25 12:40:00
1143   2019-06-27 15:40:00
Name: 방송일시, dtype: datetime64[ns]


- reindex(할필요없을듯)

In [231]:
# Renumber each per-category feature frame 0..n-1 in place.
for cat in catlist:
    frame_for_cat = globals()['x_{}'.format(cat)]
    frame_for_cat.index = np.arange(len(frame_for_cat))

- 카테고리별로 split 하기

In [246]:
# For every product group, split the rows of each of its mother codes into a
# big fold and a small fold and publish them as <group>_bigfold / <group>_smallfold.
for cat in catlist:
    selec_cat = globals()['x_{}'.format(cat)]
    momcode_cat = selec_cat['마더코드'].unique()
    # Use the longest exposure time within the category as the gap threshold.
    # Building the Timedelta from a number avoids formatting the minutes into a
    # clock string.
    max_time = selec_cat['노출(분)'].max()
    criteria_time = pd.Timedelta(minutes=int(max_time))

    # Collect the per-mother-code pieces and concatenate once, instead of
    # growing a DataFrame with append inside the loop (quadratic, and removed
    # in pandas 2).
    big_parts = []
    small_parts = []
    for momcode in momcode_cat:
        mom = train[train['마더코드']==momcode]
        t, v = split_shoprow(mom, criteria_time)
        big_parts.append(t)
        small_parts.append(v)

    T = pd.concat(big_parts) if big_parts else pd.DataFrame()
    V = pd.concat(small_parts) if small_parts else pd.DataFrame()

    globals()['{}_bigfold'.format(cat)] = T
    globals()['{}_smallfold'.format(cat)] = V

#### error

In [225]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before comparison. Without this, an
    ``(n,)`` target and an ``(n, 1)`` prediction (as returned by a
    single-output network) broadcast to an ``(n, n)`` matrix and the result is
    meaningless.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must hold as many elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of elements.

    Notes
    -----
    Zeros in ``y_true`` give inf/nan terms, as with plain NumPy division.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred must have the same number of elements: '
            '{} != {}'.format(y_true.size, y_pred.size)
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#### model 1) rf

In [226]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so repeated runs build the same forest (same seed as the
# train/test split above).
rf = RandomForestRegressor(random_state=1234)

In [None]:
# Fit the random forest and score it with MAPE.
# NOTE(review): this cell has no execution count, and shop_train_x_beauty is
# not defined in any visible cell — TODO confirm where it comes from.
# NOTE(review): beauty_bigfold holds only part of the rows of its mother codes,
# while y_beauty holds the target for every row of the category, so X and y
# probably differ in length — verify before running.
rf.fit(beauty_bigfold, y_beauty)
predictions = rf.predict(shop_train_x_beauty)
mape = mean_absolute_percentage_error(y_train_beauty, predictions)
mape

#### model 2) dnn

- normalization (MinMax scaling)

In [323]:
# Summary statistics of the DNN training features.
# NOTE(review): x_train_1 is not created in any cell visible here — TODO confirm its origin.
x_train_1.describe()

Unnamed: 0,노출(분),holiday(includeSS),실제_서울_최고기온,실제_서울_최저기온,실제_서울_강수량(mm),실제_수원_최고기온,실제_수원_최저기온,실제_수원_강수량(mm),실제_파주_최고기온,실제_파주_최저기온,...,예보_수원_강수량,예보_수원_일최고기온,예보_수원_일최저기온,new판매단가,prime_time,prime_day,top_code,top_cat,x1_cat,x2_cat
count,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,...,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0,5033.0
mean,20.546592,0.317504,30.417803,22.363501,4.956408,30.430836,21.849235,3.882277,29.28186,19.972601,...,2.902976,30.328116,22.036016,11.898127,9.241009,3.908007,10.049672,32.028412,0.00235,0.001141
std,3.596703,0.465552,2.969046,2.591747,12.336896,2.790699,2.806643,13.206496,2.808008,2.950269,...,5.370899,2.209398,2.602339,1.309688,5.875745,1.993451,17.080805,23.656923,0.25041,0.246119
min,5.0,0.0,24.8,16.1,0.0,25.2,15.3,0.0,22.3,13.5,...,0.0,26.263158,16.0,9.4572,1.0,1.0,0.0,1.0,-0.509676,-0.669941
25%,20.0,0.0,28.2,20.2,0.0,28.0,19.7,0.0,27.5,17.8,...,0.131579,28.631579,20.3125,10.913269,4.0,2.0,0.0,11.0,-0.011886,0.005879
50%,20.0,0.0,30.1,22.3,0.0,30.2,21.5,0.0,29.1,19.9,...,1.25,29.947368,21.5,11.502875,9.0,4.0,0.0,27.0,-0.006265,0.007684
75%,20.0,1.0,32.5,24.6,2.9,32.2,24.6,1.0,31.2,23.1,...,2.973684,31.789474,24.9375,12.992255,14.0,6.0,12.0,50.0,-0.004896,0.012942
max,30.0,1.0,36.8,27.9,62.3,36.5,27.9,108.0,34.7,24.7,...,30.789474,35.894737,26.0,15.110238,20.0,7.0,64.0,83.0,0.837735,0.684854


In [329]:
# Columns to rescale before training the network, grouped by source.
reg_col = (
    # 실제_* columns for 서울, 수원, 파주
    ['실제_서울_최고기온', '실제_서울_최저기온', '실제_서울_강수량(mm)',
     '실제_수원_최고기온', '실제_수원_최저기온', '실제_수원_강수량(mm)',
     '실제_파주_최고기온', '실제_파주_최저기온', '실제_파주_강수량(mm)']
    # 예보_서울_* columns
    + ['예보_서울_강수확률', '예보_서울_강수량', '예보_서울_하늘상태',
       '예보_서울_일최고기온', '예보_서울_일최저기온']
    # 예보_일산_* columns
    + ['예보_일산_강수확률', '예보_일산_강수량',
       '예보_일산_일최고기온', '예보_일산_일최저기온']
    # 예보_수원_* columns
    + ['예보_수원_강수확률', '예보_수원_강수량',
       '예보_수원_일최고기온', '예보_수원_일최저기온']
    # log-transformed price
    + ['new판매단가']
)

In [325]:
from sklearn.preprocessing import MinMaxScaler
# Scaler applied to the reg_col columns; with default settings it maps each
# column to the [0, 1] range seen during fit.
scaler = MinMaxScaler()

In [406]:
# Take independent copies first, so assigning the scaled columns does not
# raise SettingWithCopyWarning.
x_train_1 = x_train_1.copy()
x_val_1 = x_val_1.copy()

# Learn the min/max from the training data only, then apply the SAME scaling
# to the validation data. Refitting on validation would put the two sets on
# different scales and leak validation statistics.
x_train_1[reg_col] = scaler.fit_transform(x_train_1[reg_col])
x_val_1[reg_col] = scaler.transform(x_val_1[reg_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [336]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

In [431]:
# Layer sizes of the network.
n_inputs = 31   # number of feature columns fed to the first layer
n_h1 = 50
n_h2 = 50
n_outputs = 1

# Fully connected regression network: two ReLU hidden layers and a linear
# output. Only the first layer declares the input size (now through n_inputs
# instead of a repeated literal); later layers infer it from the layer before.
# NOTE: the model is compiled with optimizer='adam' in the next cell, not SGD.

m1 = Sequential()
m1.add(Dense(n_h1, input_dim=n_inputs, kernel_initializer='normal', activation='relu'))
m1.add(Dense(n_h2, kernel_initializer='normal', activation='relu'))
m1.add(Dense(n_outputs, kernel_initializer='normal'))

In [433]:
# Compile with Adam and MSE loss, then train for 20 epochs in mini-batches of 20.
m1.compile(loss='mean_squared_error', optimizer='adam')

hist1 = m1.fit(
    x_train_1,
    y_train,
    batch_size=20,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [434]:
# Predict on the same features the model was trained on (in-sample predictions).
pred = m1.predict(x_train_1)

In [435]:
# In-sample MAPE of the network.
# `error` is not defined anywhere in this notebook, so the call now uses the
# MAPE helper defined above — assumes `error` was an earlier name for it
# (TODO confirm). The predictions are flattened because the model returns them
# as a column of shape (n, 1), which would otherwise broadcast against y_train.
err = mean_absolute_percentage_error(y_train, np.ravel(pred))
err

70.2866615537398