In [None]:
# Mount Google Drive inside the Colab runtime so the project files are readable.
# NOTE(review): later cells read from './gdrive/My Drive/...', which only
# resolves to this mount point when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import argparse
import numpy as np
import pandas as pd
import random as rn
import tensorflow as tf
import keras.layers as L
from sklearn.preprocessing import MinMaxScaler
from keras import backend as K
from keras.models import load_model,Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.preprocessing import sequence
from keras.utils.data_utils import Sequence
from keras.regularizers import l2

import matplotlib.pyplot as plt
import warnings
%matplotlib inline

In [None]:
# Set seeds for reproducibility: NumPy, Python's `random` module and TensorFlow
# are all seeded with the same value.
seed_num = 913
np.random.seed(seed_num)
rn.seed(seed_num)
tf.random.set_seed(seed_num)

# **1. 데이터 생성**

In [None]:
# Load the modelling table. Later cells rely on it having 'STD_YMD' and
# 'HDONG_NM' columns plus the feature columns they drop or select by name.
# STD_YMD is not parsed as a date here (no parse_dates), unlike the GS sales file.
final_data = pd.read_csv('./gdrive/My Drive/빅콘 대상팀/data/model_data.csv')

In [None]:
# Lookup table between dong (HDONG_NM) and district (HDONG_GU).
gu = pd.read_excel('./gdrive/My Drive/빅콘 대상팀/data/지역데이터/구_동.xlsx')
# Sales amounts, with STD_YMD parsed as datetimes.
gs = pd.read_csv('./gdrive/My Drive/빅콘 대상팀/data/all_amt.csv',parse_dates=['STD_YMD'])
# Keep the first two columns plus every column whose name contains 'GS'.
gs = pd.concat([gs.iloc[:,[0,1]],gs.filter(like='GS')],axis=1)
# Attach the district of each dong (pd.merge defaults to an inner join, so
# dongs missing from the lookup table are dropped).
gs = pd.merge(gs,gu,on='HDONG_NM')
# CITY is the first two characters of the district string.
gs['CITY'] = gs['HDONG_GU'].apply(lambda x: x[0:2])
# Keep Seoul rows whose date compares greater than the string "2020".
gs_seoul = gs.query('CITY == "서울" & STD_YMD > "2020"')

# One table per category, ordered by dong and then date. Columns are picked by
# position. NOTE(review): presumably positions 0 and 1 are STD_YMD / HDONG_NM and
# 3, 4, 5 are GS_식사 / GS_간식 / GS_마실거리 (build_data reads those names) —
# verify against the CSV column order.
gs_eat = gs_seoul.iloc[:,[0,1,3]].sort_values(['HDONG_NM','STD_YMD'])
gs_snack = gs_seoul.iloc[:,[0,1,4]].sort_values(['HDONG_NM','STD_YMD'])
gs_drink = gs_seoul.iloc[:,[0,1,5]].sort_values(['HDONG_NM','STD_YMD'])

In [None]:
# Candidate feature sets derived from final_data (several variants were tried).
# Not used: variant without the binary variables.
#final_data2= final_data.drop(['holiday','event'],axis=1)

# Variant without the COVID columns.
final_data3 = final_data.drop(columns=['COVID_CNT','covid_p1'])

# Feature set used for the final model (passed to retain() in the training loop).
# 'covid_p1' is listed twice; the duplicate has no effect on the result.
final_data4 = final_data.drop(
    columns=['COVID_CNT','covid_p1','sc_m1','cj_m1','covid_p1','최저기온','최고기온','일강수량'])

# Variant that additionally removes every weather / dust column.
final_data5 = final_data.drop(
    columns=['COVID_CNT','covid_p1','sc_m1','cj_m1','covid_p1','기온','최저기온','최고기온','일강수량','일교차','dust'])

# Weather-only view, keyed by date and dong.
final_weather = final_data[['STD_YMD','HDONG_NM','기온','최저기온','최고기온','일강수량','일교차','dust']]

# **2. 함수 정의**

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error

def RMSLE_fun(origin,pred):
  """Root mean squared logarithmic error between actual and predicted values.

  Computes sqrt(mean((log(1 + origin) - log(1 + pred)) ** 2)) over all elements.

  The previous implementation passed ``origin + 1`` and ``pred + 1`` to
  ``mean_squared_log_error``, which already applies ``log(1 + x)`` internally,
  so the metric was effectively evaluated on ``log(2 + x)``. The offset is now
  applied exactly once.

  Args:
    origin: array-like of actual values.
    pred: array-like of predicted values with the same number of elements.

  Returns:
    The RMSLE as a float.

  Raises:
    ValueError: if the inputs are empty, differ in size, or contain values
      <= -1 (where log(1 + x) is undefined).
  """
  origin = np.asarray(origin, dtype=float).ravel()
  pred = np.asarray(pred, dtype=float).ravel()

  if origin.size == 0 or origin.size != pred.size:
    raise ValueError("origin and pred must be non-empty and have the same number of elements")
  if (origin <= -1).any() or (pred <= -1).any():
    raise ValueError("RMSLE is undefined for values <= -1")

  rmsle = np.sqrt(np.mean((np.log1p(origin) - np.log1p(pred)) ** 2))
  return rmsle

In [None]:
def minmax_scalar(X):
  """Min-max scale every column of a DataFrame, keeping its index and columns.

  A new MinMaxScaler is fitted on ``X`` itself on every call, so the fitted
  scaler is not returned and cannot be reused on other data.

  Args:
    X: DataFrame of numeric columns.

  Returns:
    A DataFrame of the scaled values with the same index and column labels.
  """
  scaled_values = MinMaxScaler().fit_transform(X)
  return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [None]:
def split_xy(dataset, time_steps, y_column):
  """Slice a 2-D table into sliding input windows and the targets that follow.

  For every start row ``i`` the input window is rows ``[i, i + time_steps)``
  with all columns (including the last one), and the target is the last
  column of the next ``y_column`` rows.

  Args:
    dataset: 2-D array-like (e.g. DataFrame) whose last column is the target.
    time_steps: number of rows in each input window.
    y_column: number of future rows to predict per window.

  Returns:
    Tuple ``(x, y)`` of arrays with shapes
    ``(n_windows, time_steps, n_columns)`` and ``(n_windows, y_column)``.
    Both are empty 1-D arrays when the table is too short for one window.
  """
  # Convert once; the original re-materialised the array twice per window.
  values = np.array(dataset)
  n_rows = len(dataset)

  x, y = [], []
  for start in range(n_rows):
    x_end = start + time_steps
    y_end = x_end + y_column

    # Stop as soon as the target rows would run past the end of the table.
    if y_end > n_rows:
      break
    x.append(values[start:x_end, :])
    y.append(values[x_end:y_end, -1])
  return np.array(x), np.asarray(y)

In [None]:
def build_data(data,dong,cat):
  """Assemble the feature table for one dong and attach its sales target.

  Args:
    data: feature DataFrame containing 'STD_YMD' and 'HDONG_NM' columns.
    dong: dong name used to filter both the features and the sales table.
    cat: "식사", "간식" or "마실거리"; selects which module-level GS sales
      table (gs_eat / gs_snack / gs_drink) provides the target. Any other
      value leaves the result without a 'y' column.

  Returns:
    DataFrame indexed by STD_YMD, without 'STD_YMD' / 'HDONG_NM' columns and,
    for a known category, with the sales amount appended as the last column 'y'.
  """
  features = data.query('HDONG_NM==@dong').reset_index(drop=True)

  # Choose the sales table and its value column for the requested category.
  if cat == "식사":
    sales, target_col = gs_eat, 'GS_식사'
  elif cat == "간식":
    sales, target_col = gs_snack, 'GS_간식'
  elif cat == "마실거리":
    sales, target_col = gs_drink, 'GS_마실거리'
  else:
    sales, target_col = None, None

  if sales is not None:
    # Both frames carry a fresh 0..n-1 index, so rows are paired by position.
    # NOTE(review): this assumes `data` is ordered by date like the GS tables — confirm.
    features['y'] = sales.query('HDONG_NM==@dong').reset_index(drop=True)[target_col]

  return features.set_index('STD_YMD').drop(columns='HDONG_NM')

In [None]:
def data_pipeline(data, dong, cat, time_steps, y_columns):
  """Build scaled, windowed train / test arrays for one dong and category.

  Steps: build_data -> min-max scale every column (target included) -> drop
  rows with missing values -> split_xy windows -> hold out the last 7 windows.

  Note: the scaler is fitted on the whole series, i.e. including the rows
  that end up in the held-out windows.

  Args:
    data: feature DataFrame passed on to build_data.
    dong: dong name.
    cat: sales category passed on to build_data.
    time_steps: length of each input window.
    y_columns: number of future target values per window.

  Returns:
    ``(X_train, y_train, X_test, y_test, y_min, y_max)`` where ``y_min`` and
    ``y_max`` are the unscaled target range, needed to undo the scaling.
  """
  frame = build_data(data,dong,cat)

  # Remember the raw target range before scaling so predictions can be rescaled.
  y_min = frame['y'].min()
  y_max = frame['y'].max()

  scaled = minmax_scalar(frame).dropna()
  windows, targets = split_xy(scaled,time_steps,y_columns)

  # The final 7 windows form the test set; everything before is training data.
  X_train, y_train = windows[:-7], targets[:-7]
  X_test = windows[-7:].reshape(-1,time_steps,X_train.shape[2])
  y_test = targets[-7:].reshape(-1,y_columns)

  return X_train,y_train,X_test,y_test,y_min,y_max

In [None]:
# Helper that reads the weights of the model's final output layer.
def get_model_parameters(model):
    """Return the kernel and bias of the 'time_distributed_out' layer.

    Args:
        model: a model exposing ``get_layer(name=...)`` whose
            'time_distributed_out' layer returns exactly two weight arrays.

    Returns:
        An object with attributes ``output_weights`` and ``bias``.
    """
    class ModelParameters:
        def __init__(self):
            self.output_weights = None
            self.bias = None

    weights, bias = model.get_layer(name='time_distributed_out').get_weights()

    params = ModelParameters()
    params.output_weights = weights
    params.bias = bias
    return params

# **3. 최종모델**

In [None]:
def retain(data,dong,cat,timesteps,y_columns):
  """Train and evaluate a RETAIN-style two-level attention model for one dong.

  The network computes a per-time-step attention (alpha, softmax over time)
  and a per-feature attention (beta, tanh), multiplies both with the input,
  sums over time into a context vector and maps it to one value with a
  linear dense layer.

  Changes from the earlier version:
    * the context reshape used a hard-coded feature width of 7; it now uses
      the actual number of input features, so other feature sets work;
    * an unused prediction pass over the training set was removed.

  NOTE(review): the held-out windows are used both as ``validation_data`` for
  early stopping (with ``restore_best_weights=True``) and for the reported
  metrics, so the scores are selected on the data they are measured on.

  Args:
    data: feature DataFrame passed on to data_pipeline.
    dong: dong name.
    cat: sales category.
    timesteps: length of each input window.
    y_columns: number of future target values per window.
      NOTE(review): the network emits one value per window, so presumably
      only ``y_columns == 1`` is supported — confirm.

  Returns:
    ``(model, mse, rmse, rmsle)`` with the metrics computed on the held-out
    windows after undoing the min-max scaling of the target.
  """
  X_train,y_train,X_test,y_test,y_min,y_max = data_pipeline(data,dong,cat,timesteps,y_columns)
  n_features = X_train.shape[2]

  def reshape(context):
    # (batch, n_features) -> (batch, 1, n_features)
    return K.reshape(x=context, shape=(-1, 1, n_features))

  inputs = L.Input(shape = (timesteps, n_features), name='input')

  alpha = L.Bidirectional(L.LSTM(n_features,
                                  return_sequences=True, implementation=2),
                                  name='alpha')
  beta = L.Bidirectional(L.LSTM(n_features,
                                  return_sequences=True, implementation=2),
                                  name='beta')

  alpha_dense = L.Dense(1)
  beta_dense = L.Dense(n_features)

  # Alpha: one attention weight per time step, normalised over the time axis.
  alpha_out = alpha(inputs)
  alpha_out = L.TimeDistributed(alpha_dense, name='alpha_dense_0')(alpha_out)
  alpha_out = L.Softmax(axis=1,name='softmax_1')(alpha_out)

  # Beta: one attention weight per feature at every time step.
  beta_out = beta(inputs)
  beta_out = L.TimeDistributed(beta_dense)(beta_out)
  beta_out = L.Activation('tanh',name='beta_dense_0')(beta_out)

  # Context vector: attention-weighted input summed over the time axis.
  c_t = L.Multiply()([alpha_out, beta_out, inputs])
  c_t = L.Lambda(lambda x: K.sum(x, axis=1))(c_t)
  # Reshape to 3-D for consistency between many-to-many and many-to-one variants.
  contexts = L.Lambda(reshape)(c_t)

  # Prediction head.
  contexts = L.Dropout(0.1)(contexts)
  output_layer = L.Dense(1, name='dOut', activation = 'linear')

  # TimeDistributed is used for consistency between many-to-many and
  # many-to-one variants.
  output = L.TimeDistributed(output_layer, name='time_distributed_out')(contexts)
  model = Model(inputs=inputs, outputs=[output])
  model.compile(optimizer='adam', loss='mean_squared_error', sample_weight_mode="temporal",metrics=['mse', 'mae', 'mape'])

  early_stopping = EarlyStopping(monitor='val_loss', patience=150, mode='min',restore_best_weights=True)

  # Saving models: each trained model can be stored individually by adding the
  # `modelsaver` callback below to `callbacks`.
  #modelsaver = ModelCheckpoint("./gdrive/My Drive/빅콘 대상팀/분석 code/RETAIN/models_weather_shifting/{}_{}_retain.hdf5".format(dong,cat),
  #                            monitor = 'val_loss',mode = 'min',verbose=0,save_best_only=True)

  model.fit(X_train, y_train, epochs=5000, batch_size=32, verbose=0, callbacks=[early_stopping],validation_data = (X_test,y_test))

  # Undo the min-max scaling so the metrics are in the original sales units.
  y_pred = model.predict(X_test, batch_size=1)
  y_test = y_test.reshape(-1,1) *(y_max-y_min)+y_min
  y_pred = y_pred.reshape(-1,1) *(y_max-y_min)+y_min

  rmsle = RMSLE_fun(y_test,y_pred)
  mse = mean_squared_error(y_test,y_pred)
  rmse = np.sqrt(mse)

  return model, mse, rmse, rmsle

In [None]:
warnings.filterwarnings("ignore")

# Dongs to model: every dong of Nowon-gu followed by every dong of Jung-gu.
dong = list(gu.query('HDONG_GU =="서울 노원구"').HDONG_NM.unique())+list(gu.query('HDONG_GU =="서울 중구"').HDONG_NM.unique())
cat = ['식사'] # categories to train

# Scores are appended category by category, one entry per trained dong.
RMSLE_RETAIN=[]
MSE_RETAIN=[]
RMSE_RETAIN=[]
for c in cat:
  for d in dong:
    # 상계8동 is excluded from training.
    if d != "상계8동":
      model,mse,rmse,rmsle=retain(final_data4,d,c,7,1)
      print("동:{}, 카테고리:{}, mse:{} rmse:{} rmsle:{}".format(d,c,mse,rmse,rmsle))
      RMSLE_RETAIN.append(rmsle)
      RMSE_RETAIN.append(rmse)
      MSE_RETAIN.append(mse)

동:월계1동, 카테고리:식사, mse:4.9723499995981015 rmse:2.2298766781143082 rmsle:0.07266424240248377
동:월계2동, 카테고리:식사, mse:5.195597914885882 rmse:2.2793854248208842 rmsle:0.06463094686828076
동:월계3동, 카테고리:식사, mse:2.850596488589339 rmse:1.6883709570439012 rmsle:0.05346456101363269
동:공릉2동, 카테고리:식사, mse:1.8837002602952295 rmse:1.3724796028703776 rmsle:0.04496104002196639
동:하계1동, 카테고리:식사, mse:3.6668377359040596 rmse:1.9148988839894547 rmsle:0.05880295846008147
동:하계2동, 카테고리:식사, mse:19.76270065728765 rmse:4.445525914589594 rmsle:0.08805818687289609
동:중계본동, 카테고리:식사, mse:8.928172684198916 rmse:2.988004799895562 rmsle:0.07027335586164703
동:중계1동, 카테고리:식사, mse:12.298370903209959 rmse:3.506903321052629 rmsle:0.1044947566344791
동:중계4동, 카테고리:식사, mse:6.921445937470105 rmse:2.6308641047135266 rmsle:0.07368282017527247
동:상계1동, 카테고리:식사, mse:0.5801182878457597 rmse:0.7616549664026092 rmsle:0.028895695185163196
동:상계2동, 카테고리:식사, mse:3.1160776705594992 rmse:1.7652415332071414 rmsle:0.03990199208071177
동:상계5동, 카테고리:식사, m

In [None]:
# Sanity check: number of MSE scores collected by the training loop.
len(MSE_RETAIN)

99

In [None]:
# Drop the dong that the training loop skipped. Guarded so that re-running this
# cell does not raise once the entry is already gone.
if '상계8동' in dong:
  dong.remove('상계8동')
n=len(dong)

# The score lists hold n entries per category, in the order of `cat`. Build the
# columns from the categories that were actually trained instead of assuming
# all three are present (the fixed slicing produced unequal-length columns
# whenever fewer than three categories were run).
cat_suffix = {'식사':'eat','간식':'snack','마실거리':'drink'}
result_columns = {'dong':dong}
for metric,scores in (('mse',MSE_RETAIN),('rmse',RMSE_RETAIN),('rmsle',RMSLE_RETAIN)):
  for k,c in enumerate(cat):
    result_columns['{}_{}'.format(metric,cat_suffix[c])] = scores[k*n:(k+1)*n]

result_retain = pd.DataFrame(result_columns).reset_index(drop=True)


In [None]:
# Training results
result_retain

Unnamed: 0,dong,mse_eat,mse_snack,mse_drink,rmse_eat,rmse_snack,rmse_drink,rmsle_eat,rmsle_snack,rmsle_drink
0,월계1동,5.26035,1.349331,0.736969,2.293545,1.161607,0.858469,0.074902,0.061438,0.025531
1,월계2동,5.500679,1.576256,8.650181,2.345353,1.25549,2.941119,0.066379,0.057524,0.078072
2,월계3동,1.981008,5.965878,2.21619,1.407483,2.442515,1.488687,0.044684,0.092051,0.030538
3,공릉2동,1.751621,0.828194,3.966682,1.323488,0.910052,1.991653,0.043408,0.042013,0.050953
4,하계1동,2.536506,3.064632,23.434515,1.592641,1.750609,4.840921,0.047699,0.06469,0.10077
5,하계2동,20.872503,6.127504,1.403731,4.568643,2.47538,1.184791,0.09093,0.055432,0.020066
6,중계본동,5.545145,1.680687,9.090574,2.354813,1.296413,3.015058,0.054787,0.036793,0.056594
7,중계1동,12.235006,2.903208,20.457996,3.497857,1.70388,4.523052,0.106377,0.061662,0.096757
8,중계4동,6.7137,4.820049,30.704737,2.591081,2.195461,5.541186,0.073007,0.091432,0.12992
9,상계1동,0.629966,0.765411,7.006757,0.793704,0.874878,2.647028,0.030175,0.043687,0.066163


In [None]:
# Average of every metric column. numeric_only=True skips the string 'dong'
# column, which otherwise raises a TypeError on pandas >= 2.0.
result_retain.mean(axis=0, numeric_only=True)

mse_eat         6.813791
mse_snack       4.708938
mse_drink      10.597769
rmse_eat        2.288579
rmse_snack      1.974223
rmse_drink      2.990546
rmsle_eat       0.064912
rmsle_snack     0.075936
rmsle_drink     0.067753
dtype: float64