<a href="https://colab.research.google.com/github/GodJiLee/Dacon/blob/main/%EA%B5%AC%EB%82%B4%EC%8B%9D%EB%8B%B9%20%EC%8B%9D%EC%88%98%EC%9D%B8%EC%9B%90%20%EC%98%88%EC%B8%A1/90_1263367339_W2V_%EA%B5%AC%EB%82%B4%EC%8B%9D%EB%8B%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **메뉴를 임베딩하여 모델의 피처로 사용하기 (Word2Vec, LGBM Baseline)**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder # 수치형 자료가 아닌 자료에 대해 모델 학습을 위해 정수형으로 바꿔주는 기능
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor # Tree 기반 학습 알고리즘 (Gradient Boosting 프레임워크)

from gensim.models import Word2Vec

from gensim.models import KeyedVectors # 워드 벡터를 저장하고 관리하는 기능, 유사성 조회
from sklearn.manifold import TSNE # PCA 기반 차원 축소의 문제를 해결하기 위한 방법 (티스니)
from sklearn.decomposition import PCA

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read below live under that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hyperparameters shared by the cells below
class CFG:
    """Static configuration holder; values are read as class attributes."""

    # Length of each Word2Vec token vector (passed to Word2Vec and used to size the zero vector)
    emb_dim: int = 200


# Alias so that later cells can write `args.<name>`
args = CFG

In [None]:
# Both CSV files sit in the same Drive folder; keep the folder in one constant so
# that relocating the data only needs a single edit.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/dacon_contest1/data'

df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')

# **Word Embedding**

In [None]:
# Use the menus of both train and test to build the embedding corpus
# (note: this lets test-set information leak into the features).
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of the two source frames.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Basic text preprocessing
def split_process(x, q):
    """Tokenize one raw menu string into a list of unique dish names.

    The string is split on single spaces. Tokens containing all of '(', ':'
    and ')' are skipped (presumably annotations such as "(쌀:국내산)" --
    confirm against the data), tokens containing '/' are split into their
    parts, and empty tokens (from repeated spaces or stray slashes) are
    discarded.

    Args:
        x: Raw menu string of a single meal.
        q: Unused; kept so callers that pass the column name keep working.

    Returns:
        list[str]: Unique, non-empty tokens in order of first appearance.
    """
    tokens = []
    for word in x.split(' '):
        if '(' in word and ':' in word and ')' in word:
            continue

        if '/' in word:
            tokens.extend(word.split('/'))  # flatten the parts into the token list
        else:
            tokens.append(word)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every interpreter run (set() order depends on string
    # hashing). Filtering '' instead of calling list.remove('') avoids a
    # ValueError when the menu contains no empty token at all.
    return [token for token in dict.fromkeys(tokens) if token != '']

In [None]:
# Build the w2v training corpus (train + test): one token list per
# breakfast / lunch / dinner menu string
food_combinations = []
for menu_col in ['조식메뉴', '중식메뉴', '석식메뉴']:
    tokenized_menus = df_all[menu_col].apply(lambda menu: split_process(menu, menu_col))
    food_combinations.extend(tokenized_menus.to_list())

In [None]:
# Preview only the first few tokenized menus; displaying the whole list floods the cell output
food_combinations[:5]

[['찐빵',
  '쥐어채무침',
  '주스',
  '계란후라이',
  '포기김치',
  '된장찌개',
  '두유',
  '모닝롤',
  '우유',
  '호두죽',
  '쌀밥'],
 ['호박젓국찌개',
  '단호박샌드',
  '주스',
  '시래기조림',
  '계란후라이',
  '두유',
  '팥죽',
  '모닝롤',
  '우유',
  '포기김치',
  '쌀밥'],
 ['느타리호박볶음',
  '콩나물국',
  '주스',
  '계란후라이',
  '표고버섯죽',
  '두유',
  '쌀밥',
  '모닝롤',
  '우유',
  '포기김치',
  '베이글'],
 ['멸치볶음',
  '근대국',
  '주스',
  '계란후라이',
  '토마토샌드',
  '두유',
  '닭죽',
  '모닝롤',
  '우유',
  '포기김치',
  '쌀밥'],
 ['주스', '계란후라이', '두유', '쇠고기죽', '와플', '재첩국', '모닝롤', '우유', '방풍나물', '포기김치', '쌀밥'],
 ['찐빵',
  '팬케익',
  '견과류죽',
  '주스',
  '계란후라이',
  '명엽채무침',
  '감자찌개',
  '두유',
  '우유',
  '포기김치',
  '쌀밥'],
 ['주스',
  '계란후라이',
  '고구마죽',
  '두유',
  '봄동된장국',
  '야채샌드',
  '모닝롤',
  '우유',
  '숙주나물',
  '포기김치',
  '쌀밥'],
 ['치즈프레즐',
  '민물새우찌개',
  '주스',
  '잣죽',
  '콩조림',
  '계란후라이',
  '두유',
  '모닝롤',
  '우유',
  '포기김치',
  '쌀밥'],
 ['단호박죽', '김구이', '주스', '어묵국', '계란후라이', '두유', '모닝롤', '우유', '마늘빵', '포기김치', '쌀밥'],
 ['무생채',
  '참치샌드',
  '주스',
  '흑임자죽',
  '계란후라이',
  '두유',
  '북어계란국',
  '모닝롤',
  '우유',
  '포기김치',
  '쌀밥'],
 ['선지해장국', '주스'

In [None]:
# Train or load w2v model
# Set TRAIN_W2V to False to forbid (re)training when no saved model can be loaded.
TRAIN_W2V = True
try:
    model = Word2Vec.load('food_embedding.model')
    print('Model loaded')

except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt /
    # SystemExit raised while loading are not mistaken for "no saved model"
    # and do not silently start the long training run below.
    if TRAIN_W2V:
        print('Training w2v')
        # NOTE(review): `size` and `iter` are the gensim 3.x parameter names;
        # gensim 4 renamed them to `vector_size` and `epochs` -- confirm the
        # installed version before upgrading.
        # min_count=0 keeps every token in the vocabulary; sg=0 selects CBOW.
        model = Word2Vec(sentences=food_combinations, size=args.emb_dim, window=7, min_count=0, workers=4, sg=0, iter=5000)
        model.save('food_embedding.model')

    else:
        # `model` is left undefined on this path, so later cells that use it will fail.
        print('Model loading failed. Do not train')

Training w2v


In [None]:
# w2v demo: list the tokens whose vectors are most similar to '된장찌개'
model.wv.most_similar('된장찌개')

[('차돌박이찌개', 0.5681772828102112),
 ('민물새우찌개', 0.5379247665405273),
 ('소고기무국', 0.5364925861358643),
 ('동태탕', 0.5347940921783447),
 ('북어계란국', 0.5309130549430847),
 ('오징어국', 0.529982328414917),
 ('고추장찌개', 0.5242332816123962),
 ('감자국', 0.513068437576294),
 ('어묵국', 0.512531042098999),
 ('얼갈이국', 0.5036494731903076)]

# **Preprocess**

In [None]:
def process_date(df):
    """Replace the '일자' date column with numeric year / month / day columns.

    The passed frame is modified along the way ('일자' is converted to
    datetime and the three new columns are added to it); the returned frame
    is a copy without '일자'.
    """
    parsed = pd.to_datetime(df['일자'], format="%Y-%m-%d")
    df['일자'] = parsed
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed.dt, part)
    return df.drop(columns='일자')

def get_food_embedding(x):
    """Return the mean Word2Vec vector of the unique tokens in one menu string.

    Tokenization follows the same rules as split_process: split on spaces,
    skip tokens containing all of '(', ':' and ')', split tokens on '/',
    then drop duplicates and empty tokens.

    Relies on the notebook-level `model` (trained Word2Vec) and
    `args.emb_dim` (vector length).

    Args:
        x: Raw menu string of a single meal.

    Returns:
        numpy.ndarray of length args.emb_dim. When no token survives the
        filtering the vector is all NaN, which is the value the 0/0 division
        used to produce, now without the runtime warning.
    """
    tokens = []
    for word in x.split(' '):
        if '(' in word and ':' in word and ')' in word:
            continue
        if '/' in word:
            tokens.extend(word.split('/'))
        else:
            tokens.append(word)

    # Filtering '' (rather than list.remove('')) avoids a ValueError for menus
    # without any empty token; dict.fromkeys keeps a stable summation order.
    tokens = [token for token in dict.fromkeys(tokens) if token != '']
    if not tokens:
        return np.full(args.emb_dim, np.nan)

    total = np.zeros(args.emb_dim)
    for token in tokens:
        total += model.wv.get_vector(token)
    return total / len(tokens)

In [None]:
# General preprocessing: expand the date column, then label-encode the weekday
df_train = process_date(df_train)

# Fit the encoder on the training weekdays; it is reused for the test set later
day_encoder = LabelEncoder().fit(df_train['요일'])
df_train['요일'] = day_encoder.transform(df_train['요일'])

In [None]:
# Add one averaged-embedding column per meal (breakfast, lunch, dinner)
for menu_col in ('조식메뉴', '중식메뉴', '석식메뉴'):
    df_train[menu_col + '_embedding'] = df_train[menu_col].apply(get_food_embedding)

In [None]:
# Pull the two targets out of the frame (pop returns the column and removes it),
# then drop the raw menu text that the embeddings have replaced
y_lunch = df_train.pop('중식계')
y_dinner = df_train.pop('석식계')
df_train.drop(columns=['조식메뉴', '중식메뉴', '석식메뉴'], inplace=True)

In [None]:
# Non-menu features shared by the lunch and dinner models: every column except the
# three embedding columns. Selecting by name replaces the positional slice `:9`,
# which silently picks the wrong columns if the column order ever changes.
X_common = df_train.drop(columns=['조식메뉴_embedding', '중식메뉴_embedding', '석식메뉴_embedding'])

In [None]:
# Inspect the first two rows of the processed training frame
df_train.head(2)

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,year,month,day,조식메뉴_embedding,중식메뉴_embedding,석식메뉴_embedding
0,3,2601,50,150,238,0.0,2016,2,1,"[-1.1181462992998688, -0.5244570374488831, -0....","[-0.2748572360724211, 0.17433152347803116, 0.1...","[-0.5140441499949832, -0.5150682266269412, -0...."
1,4,2601,50,173,319,0.0,2016,2,2,"[-0.9256239181215112, -0.625089555978775, -0.6...","[0.24973702430725098, 0.762308269739151, -0.11...","[0.03318041066328684, -0.0709381935497125, 0.0..."


In [None]:
# Inspect the first two rows of the shared (non-embedding) features
X_common.head(2)

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,year,month,day
0,3,2601,50,150,238,0.0,2016,2,1
1,4,2601,50,173,319,0.0,2016,2,2


In [None]:
# Two feature variants were tried:
#   Ver 1 - breakfast embedding added to the lunch (or dinner) embedding
#   Ver 2 - lunch (or dinner) embedding on its own
# Ver 2 gave the better result, so it is the one used here.
# Columns are selected by name rather than by position (10 / 11) so the cell does
# not depend on the column order of df_train.
emb_arr_lunch = np.array(df_train['중식메뉴_embedding'].tolist())
emb_arr_dinner = np.array(df_train['석식메뉴_embedding'].tolist())

# Final design matrices: shared features followed by the meal embedding
X_train_lunch = np.concatenate((X_common.to_numpy(), emb_arr_lunch), axis=1)
X_train_dinner = np.concatenate((X_common.to_numpy(), emb_arr_dinner), axis=1)

In [None]:
# Hold out 10% of the rows for validation, using the same seed for both targets
split_options = {'test_size': 0.1, 'random_state': 42}

X_train_lunch, X_test_lunch, y_train_lunch, y_test_lunch = train_test_split(
    X_train_lunch, y_lunch, **split_options
)
X_train_dinner, X_test_dinner, y_train_dinner, y_test_dinner = train_test_split(
    X_train_dinner, y_dinner, **split_options
)

# **Modeling**

In [None]:
# Baseline: one LGBMRegressor per target, default hyperparameters (no tuning)
model_lunch = LGBMRegressor()
model_dinner = LGBMRegressor()

model_lunch.fit(X_train_lunch, y_train_lunch)
model_dinner.fit(X_train_dinner, y_train_dinner)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Score each model on its held-out split with mean absolute error
pred_lunch = model_lunch.predict(X_test_lunch)
print('lunch mae:', mean_absolute_error(y_test_lunch, pred_lunch))

pred_dinner = model_dinner.predict(X_test_dinner)
print('dinner mae:', mean_absolute_error(y_test_dinner, pred_dinner))

lunch mae: 81.78454477307534
dinner mae: 51.09935581350808


# **Inference**

In [None]:
# Reload the raw test set and run it through the same steps as the training frame
df_test = process_date(
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon_contest1/data/test.csv')
)
df_test['요일'] = day_encoder.transform(df_test['요일'])  # reuse the encoder fitted on train

# Averaged embedding per meal, then drop the raw menu text
menu_cols = ['조식메뉴', '중식메뉴', '석식메뉴']
for menu_col in menu_cols:
    df_test[menu_col + '_embedding'] = df_test[menu_col].apply(get_food_embedding)
df_test.drop(columns=menu_cols, inplace=True)

# First nine columns; assumes the same column layout as the training frame -- verify
X_test_common = df_test.iloc[:, :9]

In [None]:
# NOTE(review): these two lines repeat the assignments made before the train/validation
# split and read df_train, not df_test; no later cell shown here uses the values.
emb_arr_lunch = np.array(df_train.iloc[:, 10].to_numpy().tolist()) # Ver 2
emb_arr_dinner = np.array(df_train.iloc[:, 11].to_numpy().tolist()) # Ver 2

In [None]:
# Stack the per-row embedding vectors into 2-D arrays (Ver 2: meal embedding only).
# Columns are selected by name rather than by position (10 / 11) so the cell does
# not depend on the column order of df_test.
test_emb_arr_lunch = np.array(df_test['중식메뉴_embedding'].tolist())
test_emb_arr_dinner = np.array(df_test['석식메뉴_embedding'].tolist())

# Concat: shared features followed by the meal embedding, as in training
test_lunch = np.concatenate((X_test_common.to_numpy(), test_emb_arr_lunch), axis=1)
test_dinner = np.concatenate((X_test_common.to_numpy(), test_emb_arr_dinner), axis=1)

In [None]:
# Predict the lunch and dinner targets for the test set
test_pred_lunch, test_pred_dinner = (
    model_lunch.predict(test_lunch),
    model_dinner.predict(test_dinner),
)

# Load the submission template that the predictions are written into
sample_submission_path = '/content/drive/MyDrive/Colab Notebooks/dacon_contest1/data/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)

In [None]:
# Inspect the submission template before filling it in
submission_df.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,0,0
1,2021-01-28,0,0
2,2021-01-29,0,0
3,2021-02-01,0,0
4,2021-02-02,0,0


In [None]:
# Overwrite the placeholder target columns with the model predictions
for target_col, predictions in (('중식계', test_pred_lunch), ('석식계', test_pred_dinner)):
    submission_df[target_col] = predictions

In [None]:
# Save the filled-in submission to the working directory, without the index column

submission_df.to_csv('sub_2.csv', index = False)