In [2]:
# Mount Google Drive inside the Colab VM so files under MyDrive can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [4]:
# Read the train and test tables from the mounted Drive folder.
TRAIN_CSV = 'drive/MyDrive/data/구내식당/train.csv'
TEST_CSV = 'drive/MyDrive/data/구내식당/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [4]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계
0,2016-02-01,월,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039.0,331.0
1,2016-02-02,화,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867.0,560.0
2,2016-02-03,수,2601,56,180,111,0.0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",1017.0,573.0
3,2016-02-04,목,2601,104,220,355,0.0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",978.0,525.0
4,2016-02-05,금,2601,278,181,34,0.0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",925.0,330.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,중식계,석식계
count,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0
mean,2807.815768,157.913693,241.142739,274.117012,43.506224,890.33444,461.772614
std,171.264404,144.190572,43.532298,246.239651,109.9374,209.505057,139.179202
min,2601.0,23.0,41.0,0.0,0.0,296.0,0.0
25%,2645.0,71.0,217.0,4.0,0.0,758.0,406.0
50%,2760.0,105.0,245.0,299.0,0.0,879.0,483.0
75%,2962.0,185.0,272.0,452.0,0.0,1032.0,545.0
max,3305.0,1224.0,378.0,1044.0,533.0,1459.0,905.0


In [5]:
def _add_date_and_ratio_features(frame, int_columns):
    """Cast the given columns to int and add calendar/attendance features in place.

    Args:
        frame: DataFrame holding the raw '일자' column and the head-office
            headcount, vacation, business-trip, overtime and remote-work columns.
        int_columns: Names of the columns to convert with ``astype('int')``.

    The new columns are appended in this order: '년', '월', '출근', '휴가비율',
    '출장비율', '야근비율', '재택비율'. The existing '요일' column is overwritten
    with the integer weekday taken from '일자'.
    """
    for column in int_columns:
        frame[column] = frame[column].astype('int')

    frame['일자'] = pd.to_datetime(frame['일자'])
    date_parts = frame['일자'].dt
    frame['년'] = date_parts.year
    frame['월'] = date_parts.month
    frame['요일'] = date_parts.weekday

    headcount = frame['본사정원수']
    on_vacation = frame['본사휴가자수']
    on_trip = frame['본사출장자수']
    remote = frame['현본사소속재택근무자수']

    # Staff counted as present: headcount minus vacation, business trip and remote work.
    frame['출근'] = headcount - (on_vacation + on_trip + remote)
    frame['휴가비율'] = on_vacation / headcount
    frame['출장비율'] = on_trip / headcount
    # Overtime approvals are normalised by the present staff, not by the total headcount.
    frame['야근비율'] = frame['본사시간외근무명령서승인건수'] / frame['출근']
    frame['재택비율'] = remote / headcount


# Only the training table carries the two target columns ('중식계', '석식계').
_add_date_and_ratio_features(train, ['현본사소속재택근무자수', '중식계', '석식계'])
_add_date_and_ratio_features(test, ['현본사소속재택근무자수'])

In [21]:
def split_process(x, q):
    """Split a space-separated menu string into its unique dish tokens.

    Tokens that contain all of '(', ':' and ')' are skipped, and tokens
    containing '/' are split into their individual alternatives.

    Args:
        x: Menu string whose items are separated by spaces.
        q: Unused; kept so existing callers that pass the column name keep working.

    Returns:
        List of unique, non-empty tokens in first-seen order.
    """
    tokens = []
    for chunk in x.split(' '):
        if '(' in chunk and ':' in chunk and ')' in chunk:
            continue
        # str.split returns [chunk] when there is no '/', so one call covers both cases.
        tokens.extend(chunk.split('/'))
    # dict.fromkeys de-duplicates while keeping a deterministic order (a set does not),
    # and filtering avoids the ValueError list.remove('') raised when no empty token existed.
    return [token for token in dict.fromkeys(tokens) if token != '']

# Build the Word2Vec corpus: one token list per breakfast, lunch and dinner menu in train.
food_combinations = []
for menu_column in ['조식메뉴', '중식메뉴', '석식메뉴']:
    food_combinations += [split_process(menu, menu_column) for menu in train[menu_column]]

W2V_MODEL_PATH = 'drive/MyDrive/data/구내식당/food_embedding.model'
TRAIN_W2V = True
try:
    model = Word2Vec.load(W2V_MODEL_PATH)
    print("Model loaded")
except Exception as load_error:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through,
    # and the cause is printed so a corrupt or incompatible file is not mistaken for a missing one.
    print(f"Could not load {W2V_MODEL_PATH}: {load_error!r}")
    if TRAIN_W2V:
        # NOTE(review): `size`/`iter` look like the gensim<4 keyword names
        # (`vector_size`/`epochs` in 4.x) - confirm against the installed version.
        model = Word2Vec(sentences=food_combinations, size=200, window=7, min_count=0, workers=4, sg=0, iter=5000)
        model.save(W2V_MODEL_PATH)
    else:
        # `model` stays undefined on this path; cells that use it will fail.
        print("Model loading failed. Do not train.")

Model loaded


In [22]:
def _menu_tokens(menu):
    """Return the unique, non-empty dish tokens of a menu string in first-seen order."""
    tokens = []
    for chunk in menu.split(' '):
        # Skip tokens that contain all of '(', ':' and ')'.
        if '(' in chunk and ':' in chunk and ')' in chunk:
            continue
        tokens.extend(chunk.split('/'))
    return [token for token in dict.fromkeys(tokens) if token != '']


def get_food_embedding(x, w2v=None):
    """Embed a menu string as the averaged word vector of its dish tokens.

    Args:
        x: Menu string whose items are separated by spaces.
        w2v: Object exposing ``wv.get_vector(token)`` and ``wv.vector_size``.
            Defaults to the notebook-level ``model``.

    Returns:
        1-D numpy array of length ``w2v.wv.vector_size``. Tokens missing from the
        vocabulary add nothing to the sum but still count in the divisor, as in
        the original code. A menu with no tokens yields the zero vector.
    """
    if w2v is None:
        w2v = model
    tokens = _menu_tokens(x)
    vec_ = np.zeros(w2v.wv.vector_size)
    if not tokens:
        # Avoid dividing by zero for an empty menu.
        return vec_
    for token in tokens:
        try:
            vec_ += w2v.wv.get_vector(token)
        except KeyError:
            # Only out-of-vocabulary tokens are skipped; any other error surfaces.
            continue
    vec_ /= len(tokens)
    return vec_

# Source menu column -> name of the column that stores its averaged embedding.
EMBEDDING_COLUMNS = {
    '조식메뉴': '조식메뉴_embedding',
    '중식메뉴': '중식메뉴_embedding',
    '석식메뉴': '석식메뉴_embedding',
}

# Add the three embedding columns to train first, then to test, in the same order.
for frame in (train, test):
    for menu_column, embedding_column in EMBEDDING_COLUMNS.items():
        frame[embedding_column] = frame[menu_column].apply(get_food_embedding)

In [6]:
# Pull out the two regression targets before they are removed from the feature table.
y_lunch, y_dinner = train['중식계'], train['석식계']

# Raw date/menu text is dropped from both tables; the targets exist only in train.
raw_columns = ['일자', '조식메뉴', '중식메뉴', '석식메뉴']
train.drop(columns=raw_columns + ['중식계', '석식계'], inplace=True)
test.drop(columns=raw_columns, inplace=True)

In [24]:
# The first 13 columns are shared by both models; columns 14 and 15 are read as the
# lunch and dinner embeddings, each cell holding one vector.
X_common = train.iloc[:, :13]
common_matrix = X_common.to_numpy()

emb_arr_lunch, emb_arr_dinner = (
    np.array(train.iloc[:, position].to_numpy().tolist())  # Ver 2
    for position in (14, 15)
)

lunch_matrix = np.concatenate((common_matrix, emb_arr_lunch), axis=1)
dinner_matrix = np.concatenate((common_matrix, emb_arr_dinner), axis=1)

# Same seed and hold-out fraction for both targets, so the row split is identical.
holdout = {'test_size': 0.1, 'random_state': 42}
X_train_lunch, X_test_lunch, y_train_lunch, y_test_lunch = train_test_split(lunch_matrix, y_lunch, **holdout)
X_train_dinner, X_test_dinner, y_train_dinner, y_test_dinner = train_test_split(dinner_matrix, y_dinner, **holdout)

In [7]:
from sklearn.preprocessing import StandardScaler

# Scale only the numeric feature columns. When the embedding cells have run, train/test
# also hold object-dtype columns whose cells are vectors, which StandardScaler cannot
# convert; without them this selects every column, as before.
numeric_columns = train.select_dtypes(include='number').columns

sc = StandardScaler()
train_sc = sc.fit_transform(train[numeric_columns])
# Index test by the same column list so both matrices share one column order.
test_sc = sc.transform(test[numeric_columns])

In [8]:
# Hold out 10% of the scaled rows for validation; the shared seed gives both targets
# the same row split.
split_options = {'test_size': 0.1, 'random_state': 42}
X_train_lunch, X_test_lunch, y_train_lunch, y_test_lunch = train_test_split(train_sc, y_lunch, **split_options)
X_train_dinner, X_test_dinner, y_train_dinner, y_test_dinner = train_test_split(train_sc, y_dinner, **split_options)

In [9]:
# Both regressors use the same hyper-parameters; only the target differs.
lgbm_params = {
    'learning_rate': 0.03,
    'n_estimators': 500,
    'colsample_bytree': 0.95,
    'reg_alpha': 0.01,
}

model_lunch = LGBMRegressor(**lgbm_params)
model_lunch.fit(X_train_lunch, y_train_lunch)

model_dinner = LGBMRegressor(**lgbm_params)
model_dinner.fit(X_train_dinner, y_train_dinner)

# Validate on the held-out rows.
pred_lunch = model_lunch.predict(X_test_lunch)
pred_dinner = model_dinner.predict(X_test_dinner)

lunch_mae = mean_absolute_error(y_test_lunch, pred_lunch)
dinner_mae = mean_absolute_error(y_test_dinner, pred_dinner)
print("lunch mae: ", lunch_mae)
print("dinner mae: ", dinner_mae)

lunch mae:  65.20953166150126
dinner mae:  64.83066538832188


In [11]:
# Predict on the scaled test matrix; the embedding-based test matrices are not built here.
test_pred_lunch, test_pred_dinner = (
    fitted_model.predict(test_sc) for fitted_model in (model_lunch, model_dinner)
)

In [12]:
# Fill the sample submission with the predictions and save it next to the data.
submission_df = pd.read_csv('drive/MyDrive/data/구내식당/sample_submission.csv')
for target_column, predictions in (('중식계', test_pred_lunch), ('석식계', test_pred_dinner)):
    submission_df[target_column] = predictions
submission_df.to_csv('drive/MyDrive/data/구내식당/sub_2.csv', index=False)

In [13]:
# Display the submission frame, whose '중식계' and '석식계' columns were just set from the predictions.
submission_df

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1039.530149,391.461467
1,2021-01-28,983.955703,372.564278
2,2021-01-29,576.334806,196.5735
3,2021-02-01,1184.227532,510.052146
4,2021-02-02,937.690368,400.989487
5,2021-02-03,972.186122,377.318884
6,2021-02-04,927.053408,414.415793
7,2021-02-05,600.291285,274.487357
8,2021-02-08,1197.26432,650.800916
9,2021-02-09,1010.436545,501.886328
