In [2]:
import pandas as pd
import numpy as np
from select_feature import make_data
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 성능 평가 함수 생성
def mape(t, y):
    return 100*(np.abs((y-t)/t).sum())/t.shape[0]

In [73]:
data=pd.read_csv("train_0926.csv")
all_data = pd.read_csv("./prepared.csv")

In [23]:
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression

### 사이킷런의 배깅을 통해 앙상블

In [388]:
def feature_selection(data,del_features=[]): # 제거시킬 feature을 인자로 받아 해당 feature을 제거시킴
    data = data.drop(["new주문량","new판매단가"]+del_features, axis=1)
    if not del_features or del_features[0]!="분류":
        #data = pd.concat([data, pd.get_dummies(data["분류"])], axis=1).drop(["분류"], axis=1)
        data["분류"] = LabelEncoder().fit_transform(data["분류"])
    else:
        pass
    for c,x in data.groupby(["상품군"]):
        del x["상품군"]
        # 카테고리 이름, X(표준화한), y 반환
        yield c, x.drop(["주문량"], axis=1), x["주문량"]


In [70]:
def print_result(data , model, log = True): # MAPE 결과를 출력하는 함수

    predict_mapes = {}
    for c, X, y in feature_selection(data):
        
        # cross_val_score
        model = model
        predicted = cross_val_predict(model, X,y, cv=3) # 예측값
        if log:
            error = mape(np.exp(predicted), np.exp(y)) # mape
        else:
            error = mape(predicted, y)
        print(c,error)
        predict_mapes[c]=error
        
    for k in predict_mapes.keys():
        long = data.loc[data["상품군"]==k,:].shape[0]
        predict_mapes[k]*=(long/data.shape[0])
    
    cost = 0
    for i in predict_mapes.values():
        cost += i
    
    print("\nAverage MAPE :" , cost)

In [411]:
print_result(data, BaggingRegressor(LGBMRegressor(),random_state = 1111), log=False)

beauty 35.25752898700034
bedding 56.692580209469654
cloth 50.650422237194626
elec 64.6607880886969
etc 65.00497872179473
food 33.47905150391918
furniture 76.69072571294991
health 42.61605638425123
inner 45.72029861636061
life 79.07949544695398
living 50.06913556107695

Average MAPE : 54.41881001146014


### feature 유형을 고려 -> 스태킹

In [416]:
data.columns

Index(['노출(분)', 'holiday(includeSS)', '계절', '분류', 'Active Users', 'new판매단가',
       'new주문량', 'prime_time', 'prime_day', 'top_code', 'top_cat', 'x1_cat',
       'x2_cat', 'top_real_weather', '실제_최고기온', '실제_최저기온', '실제_강수량', '실제_평균풍속',
       '세일', '판매단가', '주문량', '예보_최고기온', '예보_최저기온', '예보_강수확률', '예보_강수량', '예보_풍속',
       '무이자', '일시불', '상품군', '요일'],
      dtype='object')

In [418]:
# 제거시킬 특성 후보
del_features = {"실제날씨":['실제_최고기온', '실제_최저기온', '실제_강수량', '실제_평균풍속',"top_code"],
               "예보날씨":['예보_최고기온', '예보_최저기온', '예보_강수확률', '예보_강수량', '예보_풍속'],
                "날씨":['실제_최고기온', '실제_최저기온', '실제_강수량', '실제_평균풍속',"top_code",
                      '예보_최고기온', '예보_최저기온', '예보_강수확률', '예보_강수량', '예보_풍속'],
               "top_real_weather":["top_real_weather"],
               "분류":["분류"],
               "cat":["x1_cat","x2_cat"],
                "code":["top_code","top_cat"],
               "요일":["요일"],
               "기온":['실제_최고기온', '실제_최저기온','예보_최고기온', '예보_최저기온'],
               "강수량":["실제_강수량","예보_강수량"],
               "prime":["prime_time","prime_day"]}

In [414]:
# 제거시킬 특성 후보2
del_features = {"실제날씨":['실제_최고기온', '실제_최저기온', '실제_강수량', '실제_평균풍속'],
               "예보날씨":['예보_최고기온', '예보_최저기온', '예보_강수확률', '예보_강수량', '예보_풍속'],
               "top_real_weather":["top_real_weather"],
               "분류":["분류"],
               "cat":["x1_cat","x2_cat"],
               "top_code":["top_code"],
               "top_cat":["top_cat"]}

In [431]:
class feature_ensemble:
    def __init__(self,data,del_features,log=False):
        from lightgbm import LGBMRegressor
        
        self.data = data.copy()
        self.num_data = len(data)
        self.del_features = del_features
        
        self.cate_models = {}
        for c in self.data["상품군"].unique():
            self.cate_models[c]=[]
            for i in range(len(del_features)):
                self.cate_models[c].append(LGBMRegressor(random_seed = 1111))
            self.cate_models[c].append(LGBMRegressor())
        self.make_dataset(log)
        
    def make_dataset(self,log=False):
        self.cate_datasets = {}
        
        for c, data in self.data.groupby(["상품군"]):
            self.cate_datasets[c]=[]
            for del_feature in self.del_features.values():
                if not log:
                    train = data.drop(["new주문량","new판매단가"]+del_feature, axis=1)
                else:
                    train = data.drop(["주문량","판매단가"]+del_feature, axis=1)

                if del_feature!=["분류"]:
                    train["분류"] = LabelEncoder().fit_transform(train["분류"])

                else:
                    pass
                
                del train["상품군"]
                self.cate_datasets[c].append(train)

    def Train(self):
        
        for c, dataset in self.cate_datasets.items():
            num_data = dataset[0].shape[0]
            self.bool_idx = np.zeros(num_data).astype(bool)
            self.bool_idx[np.random.choice(range(num_data), int(num_data*0.7))]=1

            for i in range(len(del_features)):
                x_y = dataset[i][self.bool_idx]
                X_train, y_train = x_y.drop(["주문량"], axis=1).values, x_y["주문량"].values
                
                if i:
                    self.cate_models[c][i].fit(X_train, y_train)
                    train = np.concatenate([train,self.cate_models[c][i].predict(X_train).reshape(-1,1)], axis=1)
                else:
                    self.cate_models[c][i].fit(X_train, y_train)
                    train = self.cate_models[c][i].predict(X_train).reshape(-1,1)
                    
            self.cate_models[c][-1].fit(train, y_train)
            
            self.Test(c,dataset)
            
    def Test(self,c,dataset):
        
        for i in range(len(del_features)):
            x_y = dataset[i][~self.bool_idx]
            X_test, y_test = x_y.drop(["주문량"], axis=1).values, x_y["주문량"].values
            if i:
                test = np.concatenate([test,self.cate_models[c][i].predict(X_test).reshape(-1,1)], axis=1)
            else:
                test = self.cate_models[c][i].predict(X_test).reshape(-1,1)
                
        predicted = self.cate_models[c][-1].predict(test)
        error = mape(y_test, predicted)
        print(c, error)
        
    def predict(self, X, y):
        pass

In [432]:
re = feature_ensemble(data, del_features)
re.Train()

beauty 46.6797488821553
bedding 92.02087483095842
cloth 79.32970253763537
elec 87.4981361412363
etc 112.450149989827
food 44.996375110536015
furniture 94.83850912207856
health 74.22034401667257
inner 77.93881769156656
life 103.5018959416473
living 83.93284357917848


### 전체 데이터셋으로

In [427]:
train_data = data.copy()
train_data["분류"] = LabelEncoder().fit_transform(train_data["분류"])
train_data["상품군"] = LabelEncoder().fit_transform(train_data["상품군"])

In [433]:
X, y = train_data.drop(["new주문량","new판매단가","주문량"], axis=1), train_data["주문량"]
br = BaggingRegressor(LGBMRegressor(),random_state = 1111)
predicted = cross_val_predict(br, X,y, cv=3)
mape(predicted, y)

54.569947597352495