In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

pd.set_option('display.max_row', 500)

pd.set_option('display.max_columns', 100)

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [4]:
from workalendar.asia import SouthKorea
import pendulum

In [5]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [6]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계'],
      dtype='object')

In [7]:
test.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴'],
      dtype='object')

In [8]:
train = train[['일자','요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계']]
test = test[['일자','요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴']]

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [9]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

  train['주'] = pd.DatetimeIndex(train['일자']).week
  test['주'] = pd.DatetimeIndex(test['일자']).week


In [10]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [11]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [12]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [13]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '월(석식)', '월(중식)',
       '요일(석식)', '요일(중식)', '주(중식)', '주(석식)', '공휴일전후', '몇주차', '인원변화'],
      dtype='object')

In [14]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '요일(중식)','인원변화','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수','중식계']]
lunch_test = test[['공휴일전후', '몇주차', '요일(중식)','인원변화','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '요일(석식)','인원변화','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수','석식계']]
dinner_test = test[['공휴일전후', '몇주차', '요일(석식)','인원변화','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [15]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '요일(중식)', '인원변화', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수',
       '중식계'],
      dtype='object')

In [16]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [17]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

## 텍스트

In [18]:
menu_train = train[['조식메뉴','중식메뉴','석식메뉴']]
menu_test = test[['조식메뉴','중식메뉴','석식메뉴']]
print(menu_train.shape)
print(menu_test.shape)

(1187, 3)
(50, 3)


In [19]:
def clean_split(df):
    df = df.split()
    for j in df:
        if '(' in j:
            del df[df.index(j)]
    for i in df:
        if '쌀밥' in i:
            del df[df.index(i)] 
    for q in df:
        if '김치' in q:
            del df[df.index(q)]
    
    return df

In [20]:
menu_train['조식메뉴_prepro'] = menu_train['조식메뉴'].apply(clean_split)
menu_train['중식메뉴_prepro'] = menu_train['중식메뉴'].apply(clean_split)
menu_train['석식메뉴_prepro'] = menu_train['석식메뉴'].apply(clean_split)

menu_test['조식메뉴_prepro'] = menu_test['조식메뉴'].apply(clean_split)
menu_test['중식메뉴_prepro'] = menu_test['중식메뉴'].apply(clean_split)
menu_test['석식메뉴_prepro'] = menu_test['석식메뉴'].apply(clean_split)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  menu_train['조식메뉴_prepro'] = menu_train['조식메뉴'].apply(clean_split)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  menu_train['중식메뉴_prepro'] = menu_train['중식메뉴'].apply(clean_split)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  menu_train['석식메뉴_prepro'] = menu_train['석식메뉴'].apply(clean_split)
A value

In [21]:
lunch_train_txt= menu_train.중식메뉴_prepro.tolist()
lunch_test_txt= menu_test.중식메뉴_prepro.tolist()
dinner_train_txt = menu_train.석식메뉴_prepro.tolist()
dinner_test_txt = menu_test.석식메뉴_prepro.tolist()

## 벡터화

In [22]:
lunch_list_train = []
for i in lunch_train_txt:
    if len(i) == 0:
        continue
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    lunch_list_train.append(string)
lunch_list_train

['쇠고기미역국 주꾸미볶음 맛살계란말이 아삭고추무침',
 '근대된장국 탕수육 새송이버섯조림 무생채 요구르트',
 '부대찌개 가자미튀김 메추리알조림 열무된장나물 깍두기',
 '콩나물국 닭갈비 모둠묵*양념장 톳두부무침',
 '차돌박이찌개 소세지구이 풋마늘초무침 깍두기',
 '북어국 돈육장조림 홍어무침 시금치나물',
 '고추장찌개 닭데리야끼조림 해물파전 취나물',
 '배추된장국 수제돈가스 탕평채 쫄면무침',
 '어묵국 돈육고추장볶음 모둠양채쌈*쌈장 콩나물파채무침',
 '쑥국 소불고기 골뱅이무침*소면 마늘쫑볶음',
 '곤드레밥 미니채소떡갈비 두부계란부침 무나물',
 '사골우거지국 탕수어 파래김*양념장 깻순나물 깍두기',
 '시금치국 훈제오리구이 연근땅콩조림 쌈무/부추생채',
 '꽃게탕 버섯불고기 계란말이 도토리묵무침',
 '쇠고기샤브국 코다리강정 유채나물 깍두기',
 '콩나물밥*달래장 맑은국 치킨텐더*요거트D 땅콩조림 실곤약초무침',
 '북어계란국 쇠고기장조림 오징어초무침 시래기나물',
 '닭개장 자반고등어구이 비엔나볶음 열무나물 배추겉절이',
 '얼갈이된장국 닭갈비 해파리냉채 취나물',
 '옹심이만두국 주꾸미볶음 새송이버섯전 치커리유자청생채',
 '봄새싹비빔밥 쪽파국 오징어튀김 알감자버터구이 오렌지',
 '매운콩나물국 돈육굴소스볶음 동태전 봄동나물',
 '버섯들깨탕 갈치조림 우엉잡채 아삭고추무침',
 '순두부찌개 쇠불고기 브로컬리맛살볶음 풋마늘초무침',
 '북어국 오징어볶음*소면 모둠소시지구이 시금치나물',
 '올갱이아욱국 제육볶음 두부조림 모둠쌈*쌈장',
 '어묵국 매운돼지갈비찜 해물파전 무생채',
 '배추된장국 순살양념치킨 메추리알조림 콩나물무침',
 '시래기국 훈제오리구이 무쌈/양파절임 풋마늘초무침',
 '부대찌개 쇠고기단호박조림 열무나물 시금치나물',
 '수제비국 돈육굴소스볶음 양배추쌈 무생채',
 '냉이된장찌개 닭데리야끼조림 도라지초무침 콩나물무침',
 '배추된장국 돈육강정 콩나물잡채 브로컬리두부무침',
 '육개장 꽁치한마리구이 고기전 해초무침',
 '콩나

In [23]:
lunch_list_test = []
for i in lunch_test_txt:
    if len(i) == 0:
        continue
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    lunch_list_test.append(string)
lunch_list_test

['대구지리 매운돈갈비찜 오꼬노미계란말이 상추무침 양상추샐러드*딸기D',
 '우렁된장찌개 오리주물럭 청양부추전 수제삼색무쌈 양상추샐러드*오미자D',
 '팽이장국 수제돈까스*소스 가자미조림 동초나물무침 양상추샐러드*파인요거트D',
 '배추들깨국 오리대패불고기 시금치프리타타 부추고추장무침 양상추샐러드*망고D',
 '부대찌개 닭살데리야끼조림 버섯탕수 세발나물무침 양상추샐러드*오리엔탈D',
 '아욱국 매콤해물볶음 감자조림 미나리나물 콥샐러드*렌치D',
 '설렁탕 볼어묵굴소스볶음 브로콜리숙회*초장 석박지 양상추샐러드*키위D',
 '북엇국 닭볶음탕 채소전*장 솎음열무나물무침 양상추샐러드*황도D',
 '감자양파국 돈수육*씨앗쌈장 매콤어묵볶음 콩나물파채무침 양상추샐러드*자몽D',
 '장각백숙 적어양념장구이 채소스틱*쌈장 도라지오이초무침 양상추샐러드*참깨D',
 '유니짜장밥 짬뽕국 수제찹쌀꿔바로우 계란후라이 단무지락교무침 그린샐러드*딸기요거트D',
 '떡국 소갈비찜 한식잡채 참나물겉절이 양상추샐러드*블루베리요거트D',
 '육개장 닭살겨자냉채 오이스틱*쌈장 탕평채 깍두기/수박 양상추샐러드*오렌지D',
 '미니쌀국수 삼겹살고추장구이 스프링롤*타르타르D 동초나물무침 양상추샐러드*오리엔탈D',
 '수원왕갈비통닭 두부양념조림 연근깨소스무침 양상추샐러드*파인D',
 '유부장국 해물누룽지탕 마약계란장조림 양상추샐러드*딸기D',
 '호박고추장찌개 안동찜닭 마카로니치즈범벅 세발나물무침 양상추샐러드*감귤D',
 '근대국 감자채전*장 치커리무침 깍두기 파스타샐러드',
 '해물탕 쇠고기숙주볶음 맛살계란말이 물미역초고추장무침 양상추샐러드*석류D',
 '나주곰탕 생선까스*타르타르D 더덕양념구이 방풍나물무침 석박지 그린샐러드*키위요거트D',
 '옹심이국 목살스테이크 베이비크랩강정 양상추샐러드*망고D',
 '아욱국 치즈불닭 베이컨감자볶음 매운콩나물무침 양배추샐러드*사우전D',
 '황태미역국 동파육 느타리버섯볶음 참나물상추겉절이 양상추샐러드*블루베리요거트D',
 '매운쇠고기샤브샤브국 갈치조림 수수부꾸미 쑥

In [24]:
dinner_list_train = []
for i in dinner_train_txt:
    if len(i) == 0:
        i = ['.']
        string = i[0]
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    dinner_list_train.append(string)
dinner_list_train

['된장찌개 버섯불고기 콩나물겨자채 양념깻잎지',
 '멸치주먹밥 우동 떡볶이 군고구마 단무지',
 '오징어국 미트볼조림 옥수수전 부추생채',
 '대구찌개 돈육굴소스볶음 감자채볶음 물파래무침',
 '카레덮밥 가쯔오장국 수제고로케*케찹 과일샐러드 오복지',
 '육개장 생선까스*탈탈소스 야채계란찜 콩나물무침',
 '볶음밥 짬뽕 개성감자만두 타코야끼 꼬들단무지',
 '연두부탕 코다리무조림 고추잡채*꽃빵 봄동겉절이',
 '달래된장찌개 간장찜닭 통도라지구이 참나물생채',
 '짜장잡채덮밥 계란파국 참치야채전 꽃맛살샐러드 오이무침',
 '뼈해장국 해물청경채볶음 버섯메밀전 야채스틱*쌈장',
 '토마토스파게티 양송이스프 미니햄버거 단호박범벅 무피클/음료',
 '닭곰탕 임연수찜 어묵곤약볶음 돌나물초장 석박지',
 '수제비국 아귀콩나물찜 새송이버섯볶음 봄동겉절이',
 '굴소스파인볶음밥 가쯔오장국 찹쌀순대볶음 궁중떡찜 두반장가지나물',
 '알탕 동그랑땡전 마파두부 시금치나물',
 '참치회덮밥 미소장국 군만두 콘샐러드 바나나',
 '냉이된장찌개 누룽지탕수육 감자채볶음 청경채무침',
 '콩비지찌개 유산슬 해물까스*탈탈소스 도라지나물',
 '햄볶음밥 유부장국 돈육씨앗강정 해초무침 깨찰빵',
 '어묵국 청포묵무침 애호박나물 깍두기',
 '주먹밥 잔치국수 또띠아피자 과일샐러드 오이피클',
 '쇠고기미역국 순살깐풍기 계란찜 숙주미나리나물',
 '된장찌개 고등어조림 고기전 부추생채',
 '가쯔오장국 감자치즈구이 꽃맛살샐러드 단무지 깍두기',
 '매운버섯국 찜닭 탕평채 청경채생채',
 '김밥 우동 야채튀김 조각사과 오복지',
 '오징어국 미니함박 멸치호두볶음 치커리사과무침',
 '쇠고기무국 가자미찜 계란야채말이 돌나물오이무침',
 '치킨마요덮밥 유부주머니국 떡볶이 단무지무침 오렌지',
 '비빔밥 팽이버섯장국 새우또띠아 견과류조림 피크닉',
 '들깨미역국 수제돈가스 감자범벅 참나물생채',
 '꽃게탕 유산슬 야채계란찜 두반장가지나물',
 '낙지비빔밥 유부장국 새싹피자 과일샐러드 꼬들단무지',
 '어묵

In [25]:
dinner_list_test = []
for i in dinner_test_txt:
    if len(i) == 0:
        i = ['.']
        string = i[0]
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    dinner_list_test.append(string)
dinner_list_test

['흑미밥 얼큰순두부찌개 쇠고기우엉볶음 버섯햄볶음',
 '충무김밥 우동국물 오징어무침 꽃맛살샐러드 얼갈이쌈장무침 석박지',
 '흑미밥 물만둣국 카레찜닭 숯불양념꼬지어묵 꼬시래기무침',
 '흑미밥 동태탕 돈육꽈리고추장조림 당면채소무침 모자반무침',
 '흑미밥 바지락살국 쇠고기청경채볶음 머위된장무침',
 '오므라이스 가쓰오장국 빌소세지구이*구운채소 단감치커리무침 양념고추지',
 '흑미밥 계란파국 돈육두루치기 감자채파프리카볶음 세발나물오리엔탈무침',
 '유부초밥/추가밥 온메밀소바 국물떡볶이 순대찜*소금 청경채겉절이',
 '흑미밥 냉이국 반반치킨 꼬막채소무침 청경채찜',
 '흑미밥 미역국 매운소불고기 단호박두부탕수 메추리알장조림 석박지',
 '흑미밥 오징어굴소스볶음 차돌비빔국수 건새우무나물',
 '흑미밥 순두부백탕 수제치킨까스 쫄면채소무침 얼갈이나물',
 '흑미밥 손수제비국 쇠고기낙지볶음 카레홍합찜 쑥갓나물',
 '곤드레밥 황태국 찰떡떡갈비조림 계란후라이 재래김*달래양념장 무생채',
 '흑미밥 바지락된장찌개 제육볶음 양배추숙*쌈장 노가리고추조림',
 '흑미밥 버섯들깨탕 아귀콩나물찜 콤비네이션피자 돌나물&된장소스',
 '흑미밥 동태알탕 깐풍육 고사리볶음 오이무침',
 '흑미밥 쇠고기무국 춘전닭갈비 뉴욕핫도그 유채나물된장무침',
 '애플카레라이스 팽이장국 가지탕수 소떡소떡 오복지무침',
 '흑미밥 계란파국 쭈꾸미불고기 모둠채소전*장 씨앗콩자반',
 '흑미밥 삼치구이*와사비장 브로콜리깨소스무침 연근조림',
 '흑미밥 냉이김칫국 해물우동볶음 날치알계란찜 솎음열무나물',
 '흑미밥 매운족발볶음 크래미오이보트샐러드 청경채나물',
 '흑미밥 짬뽕국 쇠고기탕수 고추잡채*꽃빵 해초배무침',
 '샐러드김밥 미소시루 라볶이 단무지채무침',
 '흑미밥 달래된장찌개 코코뱅 고구마치즈구이 치커리무침',
 '흑미밥 맑은콩나물국 수제두부동그랑땡 유채나물무침',
 '흑미밥 순두부백탕 낙지볶음 쇠고기들깨소스무침 쪽파무침',
 '꽁보리밥*볶음고추장 닭칼국수 왕만두찜*양념장 버섯맛살볶음 양파장아찌 얼갈

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
lunch_matrix_train = vectorizer.fit_transform(lunch_list_train)
lunch_df_train = pd.DataFrame(lunch_matrix_train.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
dinner_matrix_train = vectorizer.fit_transform(dinner_list_train)
dinner_df_train = pd.DataFrame(dinner_matrix_train.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
lunch_matrix_test = vectorizer.fit_transform(lunch_list_test)
lunch_df_test = pd.DataFrame(lunch_matrix_test.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
dinner_matrix_test = vectorizer.fit_transform(dinner_list_test)
dinner_df_test = pd.DataFrame(dinner_matrix_test.toarray(), columns=vectorizer.get_feature_names())

In [27]:
# 중복 메뉴 선정
lunch_intersection = list(set(lunch_df_train.columns) & set(lunch_df_test.columns))
print(len(lunch_intersection))
dinner_intersection = list(set(dinner_df_train.columns) & set(dinner_df_test.columns))
print(len(dinner_intersection))

146
109


## 병합

In [28]:
lunch_train_f = pd.concat([lunch_train.reset_index(drop=True),lunch_df_train[lunch_intersection].reset_index(drop=True)],axis=1)
lunch_test_f = pd.concat([lunch_test.reset_index(drop=True),lunch_df_test[lunch_intersection].reset_index(drop=True)],axis=1)


dinner_train_f = pd.concat([dinner_train.reset_index(drop=True),dinner_df_train[dinner_intersection].reset_index(drop=True)],axis=1)
dinner_test_f = pd.concat([dinner_test.reset_index(drop=True),dinner_df_test[dinner_intersection].reset_index(drop=True)],axis=1)

In [29]:
lunch_train_f

Unnamed: 0,공휴일전후,몇주차,요일(중식),인원변화,월(중식),일,주(중식),출근,휴가비율,출장비율,야근비율,재택비율,본사출장자수,본사휴가자수,식사가능자수,본사시간외근무명령서승인건수,중식계,묵은지닭찜,오리주물럭,방풍나물,쫄면채소무침,호박고추장찌개,근대된장국,건새우호박채전,오리엔탈d,열무된장국,적어양념장구이,한식잡채,감자채전,닭살겨자냉채,쑥국,춘천닭갈비,시저d,자몽d,맛살계란말이,소갈비찜,쇠고기미역국,유채나물무침,채소스틱,마카로니치즈범벅,쌈장,키위d,감자조림,흑임자d,순남시래기국,유부채소겨자냉채,탕평채,감자양파국,딸기d,수박,...,바나나,돈갈비찜,아욱국,석박지,오리불고기,떡국,쑥갓두부무침,소스,파스타샐러드,문어꽈리고추조림,해물누룽지탕,쇠고기숙주볶음,취나물무침,두부양념조림,양념장,목살스테이크,어묵매운탕,갈비탕,콩가루배추국,황태미역국,감귤d,매실d,근대국,오꼬노미계란말이,소불고기,씨앗쌈장,동태매운탕,냉이된장국,양념간장,와사비장,치커리무침,요구르트,오미자d,황도d,두부까스,파인d,가지나물,오이생채,수원왕갈비통닭,청경채찜,유부장국,계란후라이,북엇국,김말이강정,상추무침,딸기요거트d,육개장,나주곰탕,오리대패불고기,부추고추장무침
0,1,1,3,23,2,2,47,2315.0,0.048399,0.069360,0.112743,0.000000,182,127,2497.0,261,1127.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.613235,0.0,0.477925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,4,23,2,3,47,2339.0,0.023247,0.085366,0.150064,0.000000,224,61,2563.0,351,1000.0,0.0,0.0,0.0,0.0,0.0,0.447668,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.447668,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,5,23,2,4,47,2294.0,0.031631,0.094131,0.034438,0.000000,247,83,2541.0,79,837.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2,1,23,2,7,49,2379.0,0.019055,0.074314,0.191677,0.000000,195,50,2574.0,456,1326.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,2,23,2,8,49,2365.0,0.017912,0.080793,0.180127,0.000000,212,47,2577.0,426,1026.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182,0,3,3,-7,3,20,31,2319.0,0.025142,0.066376,0.001725,0.131076,198,75,2517.0,4,1093.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.344747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.313065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1183,0,3,4,-7,3,21,31,2309.0,0.030841,0.077439,0.200087,0.117667,231,92,2540.0,462,832.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376471,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.376471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1184,0,3,5,-7,3,22,31,2177.0,0.085484,0.083138,0.000459,0.101576,248,255,2425.0,1,579.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.355831,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1185,0,4,1,-7,3,25,36,2396.0,0.035870,0.051291,0.257095,0.109621,153,107,2549.0,616,1145.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 중식 예측모델

In [None]:
lunch_train.columns

In [37]:
lunch_train_ols = lunch_train_f[['공휴일전후',
                                 '인원변화','몇주차', '요일(중식)','월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '본사출장자수','재택비율', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수',
#                                  '평균 현지기압(hPa)','체감온도',
#                                  '평균 현지기압(hPa)',
                                 '콩나물불고기', '동태매운탕', '쇠고기숙주볶음', '삼색유자청무침', '소불고기', '바나나',
       '황도d', '콩나물파채무침', '버섯매운탕', '돈육간장불고기', '양상추샐러드', '석박지', '수박', '오리대패불고기',
       '오이생채', '고등어구이', '닭볶음탕', '청경채찜', '우렁된장찌개', '요거트d']]

In [38]:
lunch_test_ols = lunch_test_f[['공휴일전후','인원변화','몇주차','월(중식)', '요일(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '본사출장자수','재택비율', '식사가능자수', '본사휴가자수', '본사시간외근무명령서승인건수',
#                                '체감온도','평균 현지기압(hPa)',
#                                '평균 현지기압(hPa)',
                               '콩나물불고기', '동태매운탕', '쇠고기숙주볶음', '삼색유자청무침', '소불고기', '바나나',
       '황도d', '콩나물파채무침', '버섯매운탕', '돈육간장불고기', '양상추샐러드', '석박지', '수박', '오리대패불고기',
       '오이생채', '고등어구이', '닭볶음탕', '청경채찜', '우렁된장찌개', '요거트d']]

## tabnet

In [45]:
from sklearn.model_selection import KFold
from pytorch_tabnet.tab_model import TabNetRegressor 
import torch
import torch.nn as nn

In [46]:
X = lunch_train_ols
y = lunch_train_f['중식계']

In [47]:
X      = X.to_numpy()
y      = y.to_numpy().reshape(-1, 1)
X_test = lunch_test_ols.to_numpy()

In [50]:
torch.cuda.is_available()

False

In [53]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(X):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    regressor = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                                optimizer_params=dict(lr=1e-2),
                                scheduler_params={"step_size":50,
                                                 "gamma":0.9},
                                scheduler_fn=torch.optim.lr_scheduler.StepLR,
                                mask_type='sparsemax',
                                verbose=1,
                                seed=42,
                                device_name='cuda')

    regressor.fit(X_train,y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  batch_size=64, virtual_batch_size=16,
                  patience=100,
                  eval_metric=['mae'],
                  drop_last=False,)
    CV_score_array.append(regressor.best_cost)
    predictions_array.append(regressor.predict(X_test))

predictions = np.mean(predictions_array,axis=0)

Device used : cpu
epoch 0  | loss: 834280.93138| val_0_mae: 888.59778| val_1_mae: 889.14724|  0:00:00s
epoch 1  | loss: 831768.40516| val_0_mae: 885.60044| val_1_mae: 886.51011|  0:00:01s
epoch 2  | loss: 828012.12902| val_0_mae: 882.05166| val_1_mae: 881.92626|  0:00:01s
epoch 3  | loss: 822029.28392| val_0_mae: 877.50259| val_1_mae: 877.71656|  0:00:02s
epoch 4  | loss: 813015.73149| val_0_mae: 868.90889| val_1_mae: 868.70733|  0:00:02s
epoch 5  | loss: 798230.13646| val_0_mae: 857.32089| val_1_mae: 857.12872|  0:00:03s
epoch 6  | loss: 780732.97313| val_0_mae: 849.13819| val_1_mae: 849.56176|  0:00:03s
epoch 7  | loss: 759099.89081| val_0_mae: 835.29886| val_1_mae: 836.04993|  0:00:04s
epoch 8  | loss: 732815.35518| val_0_mae: 821.51256| val_1_mae: 820.87068|  0:00:04s
epoch 9  | loss: 699078.00632| val_0_mae: 800.50634| val_1_mae: 800.41989|  0:00:05s
epoch 10 | loss: 659410.66458| val_0_mae: 777.05862| val_1_mae: 776.68184|  0:00:05s
epoch 11 | loss: 618189.11236| val_0_mae: 753.8

epoch 99 | loss: 10167.52935| val_0_mae: 67.61995| val_1_mae: 77.88476|  0:00:55s
Stop training because you reached max_epochs = 100 with best_epoch = 73 and best_val_1_mae = 77.39639
Best weights from best epoch are automatically used!
Device used : cpu
epoch 0  | loss: 831353.85867| val_0_mae: 885.11407| val_1_mae: 896.51883|  0:00:00s
epoch 1  | loss: 828770.31915| val_0_mae: 882.70878| val_1_mae: 894.06783|  0:00:01s
epoch 2  | loss: 825081.94251| val_0_mae: 878.67805| val_1_mae: 890.0102|  0:00:01s
epoch 3  | loss: 819089.19409| val_0_mae: 876.74718| val_1_mae: 888.14182|  0:00:02s
epoch 4  | loss: 808884.54116| val_0_mae: 861.66669| val_1_mae: 872.165 |  0:00:02s
epoch 5  | loss: 794278.26093| val_0_mae: 850.48464| val_1_mae: 861.21539|  0:00:03s
epoch 6  | loss: 775591.33423| val_0_mae: 820.63494| val_1_mae: 830.95102|  0:00:03s
epoch 7  | loss: 752164.10122| val_0_mae: 800.82605| val_1_mae: 810.64275|  0:00:04s
epoch 8  | loss: 723115.03148| val_0_mae: 790.28536| val_1_mae: 799

epoch 96 | loss: 11288.03157| val_0_mae: 74.39766| val_1_mae: 90.44344|  0:00:51s
epoch 97 | loss: 11140.68679| val_0_mae: 72.52282| val_1_mae: 86.36219|  0:00:51s
epoch 98 | loss: 10624.21249| val_0_mae: 72.30336| val_1_mae: 95.4465 |  0:00:52s
epoch 99 | loss: 11682.58136| val_0_mae: 74.75186| val_1_mae: 99.48215|  0:00:52s
Stop training because you reached max_epochs = 100 with best_epoch = 68 and best_val_1_mae = 82.82463
Best weights from best epoch are automatically used!
Device used : cpu
epoch 0  | loss: 832361.62592| val_0_mae: 886.90046| val_1_mae: 896.49453|  0:00:00s
epoch 1  | loss: 829743.17553| val_0_mae: 883.02829| val_1_mae: 892.58127|  0:00:00s
epoch 2  | loss: 825937.00921| val_0_mae: 878.89995| val_1_mae: 888.94915|  0:00:01s
epoch 3  | loss: 819377.55013| val_0_mae: 872.86806| val_1_mae: 882.25551|  0:00:02s
epoch 4  | loss: 808587.79263| val_0_mae: 864.95843| val_1_mae: 874.67231|  0:00:02s
epoch 5  | loss: 792659.54092| val_0_mae: 856.36564| val_1_mae: 864.423 | 

epoch 93 | loss: 10772.84928| val_0_mae: 68.83642| val_1_mae: 75.6326 |  0:00:49s
epoch 94 | loss: 10843.87311| val_0_mae: 73.26822| val_1_mae: 77.06594|  0:00:49s
epoch 95 | loss: 11406.32526| val_0_mae: 69.39231| val_1_mae: 73.87506|  0:00:50s
epoch 96 | loss: 10130.96021| val_0_mae: 71.9523 | val_1_mae: 77.55818|  0:00:50s
epoch 97 | loss: 10934.81528| val_0_mae: 76.26735| val_1_mae: 79.75874|  0:00:51s
epoch 98 | loss: 10741.78757| val_0_mae: 69.99899| val_1_mae: 76.21338|  0:00:51s
epoch 99 | loss: 10963.68651| val_0_mae: 73.21221| val_1_mae: 77.84682|  0:00:52s
Stop training because you reached max_epochs = 100 with best_epoch = 80 and best_val_1_mae = 73.56305
Best weights from best epoch are automatically used!
Device used : cpu
epoch 0  | loss: 838872.38474| val_0_mae: 897.03313| val_1_mae: 890.22485|  0:00:00s
epoch 1  | loss: 836134.16132| val_0_mae: 887.50139| val_1_mae: 880.77255|  0:00:00s
epoch 2  | loss: 832336.67697| val_0_mae: 883.77785| val_1_mae: 876.85926|  0:00:01

epoch 91 | loss: 9316.00676| val_0_mae: 67.46626| val_1_mae: 74.11577|  0:00:47s
epoch 92 | loss: 9692.83755| val_0_mae: 72.97842| val_1_mae: 79.76843|  0:00:48s
epoch 93 | loss: 9378.18133| val_0_mae: 64.49466| val_1_mae: 75.54877|  0:00:48s
epoch 94 | loss: 9909.64639| val_0_mae: 68.6891 | val_1_mae: 78.22606|  0:00:49s
epoch 95 | loss: 10573.18816| val_0_mae: 66.63871| val_1_mae: 77.64114|  0:00:49s
epoch 96 | loss: 9476.81492| val_0_mae: 70.5728 | val_1_mae: 77.73451|  0:00:50s
epoch 97 | loss: 9708.38844| val_0_mae: 69.34622| val_1_mae: 78.2331 |  0:00:50s
epoch 98 | loss: 9519.2107| val_0_mae: 70.62786| val_1_mae: 76.85348|  0:00:51s
epoch 99 | loss: 9577.24104| val_0_mae: 66.81201| val_1_mae: 78.73176|  0:00:51s
Stop training because you reached max_epochs = 100 with best_epoch = 80 and best_val_1_mae = 72.41736
Best weights from best epoch are automatically used!
Device used : cpu
epoch 0  | loss: 840001.89053| val_0_mae: 891.14784| val_1_mae: 876.6639|  0:00:00s
epoch 1  | los

epoch 88 | loss: 10728.37802| val_0_mae: 70.64478| val_1_mae: 76.42489|  0:01:00s
epoch 89 | loss: 9858.20208| val_0_mae: 69.92139| val_1_mae: 77.19289|  0:01:00s


KeyboardInterrupt: 

In [55]:
predictions_array[0]


array([[ 796.21277],
       [ 770.6365 ],
       [ 640.0823 ],
       [1201.3564 ],
       [1154.8345 ],
       [ 817.65314],
       [1116.5122 ],
       [ 621.88306],
       [1213.7411 ],
       [1177.6799 ],
       [ 802.71643],
       [1264.5591 ],
       [ 796.5654 ],
       [ 838.8812 ],
       [1070.2457 ],
       [ 640.1704 ],
       [1208.0303 ],
       [1209.0906 ],
       [1041.9952 ],
       [1071.9639 ],
       [ 628.1738 ],
       [1085.1509 ],
       [ 800.72723],
       [ 948.1631 ],
       [ 688.9347 ],
       [1114.9521 ],
       [1143.2727 ],
       [ 799.88763],
       [ 935.5468 ],
       [ 660.55493],
       [1103.6953 ],
       [1040.0509 ],
       [ 803.61127],
       [ 903.6698 ],
       [ 650.0372 ],
       [1063.8644 ],
       [ 993.25073],
       [ 746.4576 ],
       [ 838.83307],
       [ 433.6612 ],
       [1008.7748 ],
       [ 984.85913],
       [ 780.31116],
       [ 724.87177],
       [ 651.4395 ],
       [ 944.1112 ],
       [ 926.23486],
       [ 786.

In [38]:
submission = pd.read_csv('../data/sample_submission.csv')

In [483]:
pred_lunch = prep_pipe_lunch.predict(lunch_test_ols)

submission.iloc[:,1] = pred_lunch
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,993.398359,389.235157
1,2021-01-28,922.987336,422.298441
2,2021-01-29,644.993602,227.673868
3,2021-02-01,1290.144309,547.743589
4,2021-02-02,1071.126532,471.184352


## 석식

In [47]:
dinner_train_ols = dinner_train_f[['공휴일전후', '몇주차', '인원변화', '요일(석식)', '월(석식)', '일', '주(석식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수',
#                                    'THI', '체감온도',
                                               '흑미밥', 
                                   '부대찌개',
                                   '오므라이스',
                                   '석식계']]

In [48]:
dinner_test_ols = dinner_test_f[['공휴일전후', '몇주차', '인원변화', '요일(석식)', '월(석식)', '일', '주(석식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수',
#                                    'THI', '체감온도'
                                               '흑미밥', 
                                 '부대찌개',
                                 '오므라이스'
                                ]]

In [50]:
X = dinner_train_ols
y = dinner_train_f['석식계']

In [54]:
dinner_test_ols.shape

(50, 19)

In [57]:
dinner_test_ols

Unnamed: 0,공휴일전후,몇주차,인원변화,요일(석식),월(석식),일,주(석식),출근,휴가비율,출장비율,야근비율,재택비율,본사출장자수,본사휴가자수,식사가능자수,본사시간외근무명령서승인건수,흑미밥,부대찌개,오므라이스
0,0,4,-7,4,11,27,4,2355.0,0.029501,0.061012,0.002123,0.120013,182,88,2537.0,5,0.177077,0.0,0.0
1,0,4,-7,3,11,28,4,2319.0,0.034864,0.071069,0.176369,0.116661,212,104,2531.0,409,0.0,0.0,0.0
2,0,4,-7,5,11,29,4,2170.0,0.090513,0.083473,0.0,0.098558,249,270,2419.0,0,0.153958,0.0,0.0
3,0,1,-59,1,2,1,22,2340.0,0.036936,0.052668,0.229915,0.110123,154,108,2494.0,538,0.153958,0.0,0.0
4,0,1,-59,2,2,2,22,2362.0,0.021204,0.063611,0.192633,0.107387,186,62,2548.0,455,0.177077,0.0,0.0
5,0,1,-59,4,2,3,22,2380.0,0.020178,0.068057,0.002101,0.097811,199,59,2579.0,5,0.0,0.0,0.400875
6,0,1,-59,3,2,4,22,2364.0,0.020862,0.072161,0.201354,0.098495,211,61,2575.0,476,0.159931,0.0,0.0
7,0,1,-59,5,2,5,22,2247.0,0.057798,0.086183,0.0,0.087551,252,169,2499.0,0,0.0,0.0,0.0
8,0,2,-59,1,2,8,46,2333.0,0.030096,0.059508,0.295757,0.112517,174,88,2507.0,690,0.153958,0.0,0.0
9,0,2,-59,2,2,9,46,2318.0,0.032148,0.062585,0.233822,0.112517,183,94,2501.0,542,0.140567,0.0,0.0


In [56]:
dinner_test_ols

Unnamed: 0,공휴일전후,몇주차,인원변화,요일(석식),월(석식),일,주(석식),출근,휴가비율,출장비율,야근비율,재택비율,본사출장자수,본사휴가자수,식사가능자수,본사시간외근무명령서승인건수,흑미밥,부대찌개,오므라이스
0,0,4,-7,4,11,27,4,2355.0,0.029501,0.061012,0.002123,0.120013,182,88,2537.0,5,0.177077,0.0,0.0
1,0,4,-7,3,11,28,4,2319.0,0.034864,0.071069,0.176369,0.116661,212,104,2531.0,409,0.0,0.0,0.0
2,0,4,-7,5,11,29,4,2170.0,0.090513,0.083473,0.0,0.098558,249,270,2419.0,0,0.153958,0.0,0.0
3,0,1,-59,1,2,1,22,2340.0,0.036936,0.052668,0.229915,0.110123,154,108,2494.0,538,0.153958,0.0,0.0
4,0,1,-59,2,2,2,22,2362.0,0.021204,0.063611,0.192633,0.107387,186,62,2548.0,455,0.177077,0.0,0.0
5,0,1,-59,4,2,3,22,2380.0,0.020178,0.068057,0.002101,0.097811,199,59,2579.0,5,0.0,0.0,0.400875
6,0,1,-59,3,2,4,22,2364.0,0.020862,0.072161,0.201354,0.098495,211,61,2575.0,476,0.159931,0.0,0.0
7,0,1,-59,5,2,5,22,2247.0,0.057798,0.086183,0.0,0.087551,252,169,2499.0,0,0.0,0.0,0.0
8,0,2,-59,1,2,8,46,2333.0,0.030096,0.059508,0.295757,0.112517,174,88,2507.0,690,0.153958,0.0,0.0
9,0,2,-59,2,2,9,46,2318.0,0.032148,0.062585,0.233822,0.112517,183,94,2501.0,542,0.140567,0.0,0.0


In [51]:
X      = X.to_numpy()
y      = y.to_numpy().reshape(-1, 1)
X_test = dinner_test_ols.to_numpy()

In [52]:
kf = KFold(n_splits=12, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(X):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    regressor = TabNetRegressor(verbose=1,seed=42)
    regressor.fit(X_train,y_train,
              eval_set=[(X_valid, y_valid)],
              patience=50, max_epochs=1000,
              eval_metric=['mae'],)
    CV_score_array.append(regressor.best_cost)
    predictions_array.append(regressor.predict(X_test))

predictions = np.mean(predictions_array,axis=0)

Device used : cpu
epoch 0  | loss: 231595.29228| val_0_mae: 478.03845|  0:00:00s
epoch 1  | loss: 230856.00735| val_0_mae: 456.16075|  0:00:00s
epoch 2  | loss: 230167.82812| val_0_mae: 472.65372|  0:00:01s
epoch 3  | loss: 229501.51654| val_0_mae: 467.60018|  0:00:01s
epoch 4  | loss: 228901.7307| val_0_mae: 464.17259|  0:00:01s
epoch 5  | loss: 228160.0671| val_0_mae: 451.87514|  0:00:02s
epoch 6  | loss: 227359.6875| val_0_mae: 446.6206|  0:00:02s
epoch 7  | loss: 226542.29412| val_0_mae: 439.10714|  0:00:02s
epoch 8  | loss: 225579.81434| val_0_mae: 435.97501|  0:00:03s
epoch 9  | loss: 224540.05055| val_0_mae: 432.22016|  0:00:03s
epoch 10 | loss: 223476.86397| val_0_mae: 428.64514|  0:00:03s
epoch 11 | loss: 222224.03676| val_0_mae: 424.4893|  0:00:04s
epoch 12 | loss: 220940.05147| val_0_mae: 421.64225|  0:00:04s
epoch 13 | loss: 219547.69026| val_0_mae: 418.46669|  0:00:05s
epoch 14 | loss: 218001.48254| val_0_mae: 423.99463|  0:00:05s
epoch 15 | loss: 216423.90993| val_0_mae: 

RuntimeError: running_mean should contain 19 elements not 20

In [35]:
pred_lunch = prep_pipe_lunch.predict(dinner_test_ols)

submission.iloc[:,2] = pred_lunch
submission.head()

NameError: name 'prep_pipe_lunch' is not defined

In [104]:
submission = pd.read_csv('../data/sample_submission.csv')

In [249]:
submission.iloc[:,1] = predicts_Auto_lunch
submission.iloc[:,2] = predicts_Auto_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,984.501164,393.523061
1,2021-01-28,919.408014,413.598578
2,2021-01-29,622.868644,246.65625
3,2021-02-01,1280.056279,517.615221
4,2021-02-02,1076.343886,444.709699


In [111]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_lgbm_autoML.csv', index =False)

오늘 날짜 : 20210716


# 저장

In [472]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_pycaret.csv', index =False)

오늘 날짜 : 20210719


In [250]:
answer = pd.read_csv('../submission/20210701_pycaret_(2)-69.8998.csv')

In [251]:
best_submission = pd.read_csv('../data/sample_submission.csv')
best_submission.iloc[:,1:] = submission.iloc[:,1:]*5/9 +  answer.iloc[:,1:]*4/9
best_submission.to_csv(f'../submission/{today}_lgbm_autoML_ensenble_3.csv', index =False)