### 라이브러리 정의

In [2]:
# 데이터 처리 라이브러리
import pandas as pd
import numpy as np
# 분석알고리즘 DecisionTree 구현 라이브러리
from sklearn.tree import DecisionTreeRegressor 
# 과거데이터를 8:2, 7:3 이나 이런식으로 자동으로 나누어주는 라이브러리
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### 데이터 불러오기

In [5]:
featuresData = \
    pd.read_csv("../dataset/feature_regression_example.csv")
featuresData.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442


### 1. 데이터 전처리

### 1-1. 타입 통합 / 특성 숫자컬럼 추가

### 1-1-1. 데이터 타입 통합

In [6]:
# featuresData.info()

In [7]:
# 주의할 사항은 모든 컬럼에 대해서 고정시키는걸 고려햐는게 나을수도 있다
featuresData.QTY = featuresData.QTY.astype(float)

### 1-1-2. 특성 값 숫자컬럼 변경

In [8]:
featuresData["HOLIDAY_NEW"] = \
    np.where(featuresData.HOLIDAY=="Y",1,0)

In [9]:
featuresData["PROMOTION_NEW"] = \
    np.where(featuresData.PROMOTION=="Y",1,0)

### 1-2. 특성 선정 / 데이터 분리

### 1-2-1. 특성 선정

In [10]:
corrDf = featuresData.corr()

In [11]:
standardLimit = 0.5

In [12]:
features = \
    list(corrDf.loc[ ( abs(corrDf.QTY) > standardLimit ) & 
                    (corrDf.QTY != 1) ].index )

In [13]:
label = ["QTY"]

### 1-2-2. 데이터 분리

In [27]:
standardIndex = 0.8

In [28]:
sortKey = ["REGIONID","ITEM","YEARWEEK"]

In [29]:
sortedData = featuresData.sort_values(sortKey, ignore_index=True)

In [30]:
selectedIndex = int( list( sortedData.shape )[0] * standardIndex )

In [31]:
yearweekStd = sortedData.loc[selectedIndex].YEARWEEK

In [32]:
# 훈련데이터와 테스트데이터를 (문제지와 정답지로 구분해서 정의한다.)
trainingDataFeatures = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, features]
trainingDataLabel = \
    sortedData.loc[sortedData.YEARWEEK <= yearweekStd, label]
testDataFeatures = \
    sortedData.loc[sortedData.YEARWEEK > yearweekStd, features]
testDataLabel = \
    sortedData.loc[sortedData.YEARWEEK > yearweekStd, label]

### 2. 모델적용

### 2-1. 모델 적용

### 2-1-1 학습

In [55]:
# 모델 선언 (Decision, Random)
model_dt = DecisionTreeRegressor(random_state=10, max_depth=1)
model_rf = RandomForestRegressor(random_state=10)
model_lr = LinearRegression()

DecisionTreeRegressor(max_depth=1, random_state=10)

In [56]:
model_dt.fit(X=trainingDataFeatures, y=trainingDataLabel)
model_rf.fit(X=trainingDataFeatures, y=trainingDataLabel)
model_lr.fit(X=trainingDataFeatures, y=trainingDataLabel)

  model_rf.fit(X=trainingDataFeatures, y=trainingDataLabel)


LinearRegression()

### 3. 예측

In [57]:
predictValueDt = model_dt.predict(testDataFeatures)
predictValueRf = model_rf.predict(testDataFeatures)
predictValueLr = model_lr.predict(testDataFeatures)

In [58]:
testDataAll = \
    featuresData.loc[ testDataFeatures.index ] 

In [59]:
testDataAll["PREDICT_DT"] = predictValueDt
testDataAll["PREDICT_RF"] = predictValueRf

In [61]:
testDataAll["PREDICT_LR"] = predictValueLr
testDataAll

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HOLIDAY_NEW,PROMOTION_NEW,PREDICT_DT,PREDICT_RF,PREDICT_LR
85,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43.0,N,4,N,0.0,0,0,826.203125,353.535465,302.382057
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700.0,Y,1,Y,0.308584,1,1,2062.857143,1409.048303,2042.087373
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514.0,Y,1,Y,0.308584,1,1,2062.857143,1409.048303,2042.087373
88,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501.0,Y,1,Y,0.308584,1,1,2062.857143,1409.048303,2042.087373
89,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491.0,N,4,Y,0.308584,0,1,2062.857143,1439.099008,1678.587907
90,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806.0,N,4,Y,0.308584,0,1,2062.857143,1439.099008,1678.587907
91,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111.0,N,4,Y,0.280258,0,1,2062.857143,1731.856825,1480.694437
92,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400.0,N,4,Y,0.280258,0,1,2062.857143,1731.856825,1480.694437
93,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010.0,Y,2,Y,0.280258,1,1,2062.857143,2576.315483,1879.40766
94,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900.0,N,4,Y,0.280258,0,1,2062.857143,1731.856825,1480.694437


In [62]:
predictDtMae = mean_absolute_error(y_true=testDataAll.QTY,
                    y_pred=testDataAll.PREDICT_DT )
predictRfMae = mean_absolute_error(y_true=testDataAll.QTY,
                    y_pred=testDataAll.PREDICT_RF )
predictLrMae = mean_absolute_error(y_true=testDataAll.QTY,
                    y_pred=testDataAll.PREDICT_LR )

In [63]:
errorReportDf = pd.DataFrame( [[ predictDtMae, predictRfMae,predictLrMae     ]],
            columns=["DT_MAE","RF_MAE","LR_MAE"])

In [64]:
errorReportDf

Unnamed: 0,DT_MAE,RF_MAE,LR_MAE
0,522.448884,398.759886,625.493983


In [65]:
features

['HCLUS', 'PRO_PERCENT', 'HOLIDAY_NEW', 'PROMOTION_NEW']

In [66]:
# 대휴일 1 소휴일 4
userInputHCLUS = 1
# 제품 할인 %
userInputProPercent=0.5
# 홀리데이 유무 Y= 1 N = 0
userInputHoilidayYn = 1
# 프로모션 유무 Y=1 N= 0
userInputPromotionYn = 1

In [67]:
futureData = pd.DataFrame([[ userInputHCLUS,
              userInputProPercent,
              userInputHoilidayYn,
              userInputPromotionYn]]   )

In [68]:
# pickle 파일로 저장가능하다.
model_dt.predict(futureData)

array([2062.85714286])

In [69]:
trainingDataFeatures.shape

(85, 4)

In [70]:
treeStep1 = trainingDataFeatures.loc[trainingDataFeatures.PRO_PERCENT > 0.259]

In [71]:
treeStep2 = treeStep1.loc[treeStep1.PRO_PERCENT > 0.294]

In [72]:
treeStep3 = treeStep2.loc[treeStep2.HCLUS > 0.5]

In [73]:
treeStep3.index

Int64Index([77, 78, 79, 80, 81], dtype='int64')

In [74]:
featuresData.loc[  treeStep3.index ]

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HOLIDAY_NEW,PROMOTION_NEW
77,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201625,2016,25,968.0,Y,1,Y,0.308584,1,1
78,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201626,2016,26,1685.0,Y,1,Y,0.308584,1,1
79,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201627,2016,27,1586.0,Y,1,Y,0.308584,1,1
80,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201628,2016,28,1616.0,Y,1,Y,0.308584,1,1
81,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201629,2016,29,1318.0,Y,1,Y,0.308584,1,1


In [75]:
trainingDataLabel.loc[treeStep3.index].QTY.mean()

1434.6

# graphviz 시각화

In [76]:
from sklearn.tree import export_graphviz
import graphviz

export_graphviz(decision_tree=model_rf.estimators_[30],
                out_file="tree.dot",
                impurity=True)
with open("tree.dot") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))

ModuleNotFoundError: No module named 'graphviz'

# 모델 저장

In [77]:
import pickle

In [78]:
filename = 'finalized_model.sav'
pickle.dump(model_dt, open(filename, 'wb'))