# 메뉴 예측 모델

### 데이터 전처리

In [None]:
import pandas as pd

# Location of the raw CSV export, given as a path relative to the working directory.
url = './data/freetips.level0.csv'

# Keep the frame exactly as loaded so later cells can restart from untouched data.
df_ori = pd.read_csv(url)
# Work on an independent copy: a bare `[:]` slice does not guarantee that the
# result owns its data, so edits could leak back into df_ori or raise
# SettingWithCopyWarning when columns are added later.
df = df_ori.copy()
print('데이터 {} 개'.format(len(df)))
df.head()

데이터 19305 개


Unnamed: 0,_id,timestamp,cafe_id,seat,gender,act,group,menu
0,64ed854c241a11ac3221126c,2023-08-02T14:40:24.265Z,20,60.0,"{'male': 0, 'female': 0}",chatting,"{'average': 3.0, 'cluster': [4, 0, 2, 3, 0]}",ice
1,64ed854c241a11ac3221126d,2023-08-02T14:41:24.680Z,20,100.0,"{'male': 50.0, 'female': 50.0}",chatting,"{'average': 3.2, 'cluster': [4, 2, 2, 4, 4]}",ice
2,64ed854c241a11ac3221126e,2023-08-02T14:42:25.622Z,20,80.0,"{'male': 0, 'female': 0}",chatting,"{'average': 2.75, 'cluster': [4, 0, 2, 4, 1]}",ice
3,64ed854c241a11ac3221126f,2023-08-02T14:43:25.765Z,20,40.0,"{'male': 0, 'female': 0}",chatting,"{'average': 2.5, 'cluster': [4, 0, 2, 3, 1]}",ice
4,64ed854c241a11ac32211270,2023-08-02T14:44:26.528Z,20,60.0,"{'male': 20.0, 'female': 80.0}",chatting,"{'average': 3.5, 'cluster': [4, 0, 2, 4, 4]}",ice


In [2]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

_id           object
timestamp     object
cafe_id        int64
seat         float64
gender        object
act           object
group         object
menu          object
dtype: object

## 데이터 프로파일
### 변수
1. _id
    - 고유값. ( 데이터에서 제거 )
2. timestamp
    - 2023-08-02, 14:40:24.265 
    - 날짜, 시간으로 분리
        - Real number로 변경: 230802
        - 시간 시 분 초
3. seat
    - Real number
    - 좌석 점유율 %
4. gender
    - 남녀 성비 %
    - dict 구조 -> 남자 비율로 변경
    - 여자 비율 = 100 - 남자 비율 (단, male·female이 모두 0인 행에서는 성립하지 않음)
5. act
    - Categorical
    - 어떤 행동이 가장 많은지
    - chatting, reading, working
6. group
    - 평균 인원수와 각 테이블별 착석 인원수
7. menu
    - Categorical
    - 어떤 음료가 많이 팔리는지
    - hot, ice

### 메뉴 예측에 필요한 변수
- 요일별로 예측할 때 필요
    - cafe_id, day, seat
- 시간별로 예측할 때 필요
    - cafe_id, time, seat
- 연, 월, 일, 요일, 시간에 따른 메뉴 예측

In [3]:
# Derive calendar/time feature columns from the `timestamp` strings.
seat_occupancy_data = df.copy()
seat_occupancy_data['date'] = pd.to_datetime(seat_occupancy_data['timestamp'])
# Moving the parsed timestamps into the index yields a DatetimeIndex, whose
# accessors (.year, .hour, ...) are read below.
seat_occupancy_data = seat_occupancy_data.set_index('date')

# NOTE(review): the timestamps end in 'Z', so every derived field is in UTC;
# apply tz_convert first if local cafe time is what the model needs -- confirm.
# Each name is both a DatetimeIndex attribute and the column it is stored under,
# and the tuple order fixes the resulting column order. 'date' (calendar date)
# is re-created as a regular column because set_index consumed the original.
for part in ('year', 'month', 'day', 'dayofweek', 'hour',
             'minute', 'second', 'time', 'date'):
    seat_occupancy_data[part] = getattr(seat_occupancy_data.index, part)
seat_occupancy_data.head()

Unnamed: 0_level_0,_id,timestamp,cafe_id,seat,gender,act,group,menu,year,month,day,dayofweek,hour,minute,second,time,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-08-02 14:40:24.265000+00:00,64ed854c241a11ac3221126c,2023-08-02T14:40:24.265Z,20,60.0,"{'male': 0, 'female': 0}",chatting,"{'average': 3.0, 'cluster': [4, 0, 2, 3, 0]}",ice,2023,8,2,2,14,40,24,14:40:24.265000,2023-08-02
2023-08-02 14:41:24.680000+00:00,64ed854c241a11ac3221126d,2023-08-02T14:41:24.680Z,20,100.0,"{'male': 50.0, 'female': 50.0}",chatting,"{'average': 3.2, 'cluster': [4, 2, 2, 4, 4]}",ice,2023,8,2,2,14,41,24,14:41:24.680000,2023-08-02
2023-08-02 14:42:25.622000+00:00,64ed854c241a11ac3221126e,2023-08-02T14:42:25.622Z,20,80.0,"{'male': 0, 'female': 0}",chatting,"{'average': 2.75, 'cluster': [4, 0, 2, 4, 1]}",ice,2023,8,2,2,14,42,25,14:42:25.622000,2023-08-02
2023-08-02 14:43:25.765000+00:00,64ed854c241a11ac3221126f,2023-08-02T14:43:25.765Z,20,40.0,"{'male': 0, 'female': 0}",chatting,"{'average': 2.5, 'cluster': [4, 0, 2, 3, 1]}",ice,2023,8,2,2,14,43,25,14:43:25.765000,2023-08-02
2023-08-02 14:44:26.528000+00:00,64ed854c241a11ac32211270,2023-08-02T14:44:26.528Z,20,60.0,"{'male': 20.0, 'female': 80.0}",chatting,"{'average': 3.5, 'cluster': [4, 0, 2, 4, 4]}",ice,2023,8,2,2,14,44,26,14:44:26.528000,2023-08-02


In [5]:
import xgboost
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

# Hold out 10% of the samples for evaluation. Fixing random_state makes the
# split, and therefore every score computed from it, reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    cafe_20_seat_data, cafe_20_seat_target, test_size=0.1, random_state=42)
# Many shallow trees with a small learning rate. subsample < 1 draws rows at
# random for each tree, so the model is seeded as well.
xgb_model = xgboost.XGBRegressor(n_estimators=10000, learning_rate=0.04, gamma=0, subsample=0.75,
                                 colsample_bytree=1, max_depth=3, random_state=42)

print(len(X_train), len(X_test))
xgb_model.fit(X_train, y_train)

3015 336
