In [379]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/weather.csv').drop(['중식메뉴', '석식메뉴'], axis = 1)
df

Unnamed: 0,지점명,기온(°C),강수량(mm),시간,일자,요일,중식계,석식계,인원수
0,진주,1.8,0.0,11:00,2016-02-01,월,1039.0,331.0,2163.0
1,진주,3.0,0.0,12:00,2016-02-01,월,1039.0,331.0,2163.0
2,진주,4.2,0.0,13:00,2016-02-01,월,1039.0,331.0,2163.0
3,진주,3.0,0.0,17:00,2016-02-01,월,1039.0,331.0,2163.0
4,진주,0.9,0.0,18:00,2016-02-01,월,1039.0,331.0,2163.0
...,...,...,...,...,...,...,...,...,...
7242,진주,6.2,4.2,12:00,2021-01-26,화,1015.0,480.0,1818.0
7243,진주,7.0,0.0,13:00,2021-01-26,화,1015.0,480.0,1818.0
7244,진주,7.3,0.0,17:00,2021-01-26,화,1015.0,480.0,1818.0
7245,진주,6.2,0.6,18:00,2021-01-26,화,1015.0,480.0,1818.0


## 점심 식수 인원 예측을 위한 데이터 프레임 생성

In [380]:
# 같은 날짜의 12시에 관측된 기상청 데이터가 여러 개인 것들 존재,
# 단순히 두 번 count된 것인지 확인 후 제거
lunch = df[df['시간'] == '11:00'].drop('석식계', axis = 1)
dinner = df[df['시간'] == '18:00']
df[df['시간'] == '12:00']['일자'].value_counts()
df[df['시간'] == '18:00']['일자'].value_counts()

2017-02-01    2
2018-02-01    2
2019-02-01    2
2016-02-01    1
2019-05-20    1
             ..
2017-09-14    1
2017-09-13    1
2017-09-12    1
2017-09-11    1
2021-01-26    1
Name: 일자, Length: 1205, dtype: int64

In [381]:
#기온부터 강수량까지 모두 같은 데이터임을 확인.

#dinner[dinner['일자'] == '2017-02-01']
#dinner[dinner['일자'] == '2018-02-01']
#dinner[dinner['일자'] == '2019-02-01']

#lunch[lunch['일자'] == '2017-02-01']
#lunch[lunch['일자'] == '2018-02-01']
lunch[lunch['일자'] == '2019-02-01']

Unnamed: 0,지점명,기온(°C),강수량(mm),시간,일자,요일,중식계,인원수
4403,진주,1.1,0.0,11:00,2019-02-01,금,804.0,2341.0
4409,진주,1.1,0.0,11:00,2019-02-01,금,804.0,2341.0


In [382]:
lunch = lunch.drop(4409, axis = 0).drop(2969, axis = 0).drop(1487, axis = 0)
#dinner = dinner.drop(4413, axis = 0).drop(2973, axis = 0).drop(1491, axis = 0)

In [383]:
lunch

Unnamed: 0,지점명,기온(°C),강수량(mm),시간,일자,요일,중식계,인원수
0,진주,1.8,0.0,11:00,2016-02-01,월,1039.0,2163.0
6,진주,0.3,0.0,11:00,2016-02-02,화,867.0,2059.0
12,진주,1.3,0.0,11:00,2016-02-03,수,1017.0,2254.0
18,진주,1.2,0.0,11:00,2016-02-04,목,978.0,1922.0
24,진주,4.7,0.0,11:00,2016-02-05,금,925.0,2108.0
...,...,...,...,...,...,...,...,...
7217,진주,1.5,0.0,11:00,2021-01-20,수,1093.0,2315.0
7223,진주,5.4,0.0,11:00,2021-01-21,목,832.0,1847.0
7229,진주,7.4,0.0,11:00,2021-01-22,금,579.0,2176.0
7235,진주,10.3,0.0,11:00,2021-01-25,월,1145.0,1780.0


In [384]:
lunch = lunch.reset_index()

menu_one_hot = pd.read_csv('data/menu_one_hot.csv')
lunch_data = pd.concat([lunch,menu_one_hot], axis = 1, join='inner')
#lunch_data.drop(['지점명', '일자', '시간', '요일'], axis = 1, inplace = True)

In [385]:
lunch_data = lunch_data.drop(['지점명', '시간', '일자', '요일'], axis = 1)

In [386]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

X = lunch_data.drop('중식계', axis = 1) 
y = lunch_data['중식계']

X_train, X_test, y_train, y_test = train_test_split(X, y)

LR = LinearRegression()
Rg = Ridge()

LR.fit(X_train, y_train)
Rg.fit(X_train, y_train)

y_predicted = Rg.predict(X_test)

In [387]:
LR.coef_
LR.intercept_

print(mse(y_test, y_predicted), mse(y_train, y_train))

43551.88024233744 0.0


In [388]:
label_encoding_menu = pd.read_csv('data/label_encoding.csv')

In [389]:
label_encoding_menu['rice'] =  label_encoding_menu['rice'].astype('category')
label_encoding_menu['rice'] = label_encoding_menu.rice.cat.codes

label_encoding_menu['soup'] =  label_encoding_menu['soup'].astype('category')
label_encoding_menu['soup'] = label_encoding_menu.soup.cat.codes

label_encoding_menu['main'] = label_encoding_menu['main'].astype('category')
label_encoding_menu['main'] = label_encoding_menu.main.cat.codes

label_encoding_menu


Unnamed: 0,rice,soup,main
0,29,188,235
1,29,26,7
2,40,250,333
3,29,150,307
4,29,73,105
...,...,...,...
1200,29,166,247
1201,29,40,114
1202,36,14,290
1203,29,215,93


In [390]:
lunch_data2 = pd.concat([lunch, label_encoding_menu], axis = 1, join='inner')



lunch_data2.drop(['지점명','시간'], axis = 1, inplace = True)
lunch_data2['day'] = lunch_data2['요일'].astype('category')
lunch_data2['day'] = lunch_data2.day.cat.codes

lunch_data2['일자'] = pd.to_datetime(lunch_data2['일자'])
lunch_data2['월'] = lunch_data2['일자'].dt.month

lunch_data2.drop(['요일', '일자'], axis = 1, inplace = True)
lunch_data2.reset_index(inplace = True)



In [391]:
df = lunch_data2.drop(['level_0', 'index'], axis = 1)


In [392]:
X = df.drop('중식계', axis = 1)
y = df['중식계']

X_train, X_test, y_train, y_test = train_test_split(X, y)

LR = LinearRegression()
Rg = Ridge()

LR.fit(X_train, y_train)
Rg.fit(X_train, y_train)

y_predicted = Rg.predict(X_test)

In [393]:
LR.coef_
LR.intercept_

print(mae(y_test, y_predicted), mae(y_train, y_train))

130.03304247801086 0.0


In [402]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_jobs = -1, random_state=42,criterion='mse')
forest.fit(X_train, y_train)

  warn(


RandomForestRegressor(criterion='mse', n_jobs=-1, random_state=42)

In [403]:
train_predicted = forest.predict(X_train)
test_predicted = forest.predict(X_test)
mae(y_train, train_predicted)
mae(y_test, test_predicted)

90.38033112582781

In [404]:
mae(y_train, train_predicted)

36.84482834994463

In [405]:
X_train

Unnamed: 0,기온(°C),강수량(mm),인원수,rice,soup,main,day,월
805,26.3,0.0,2015.0,29,139,7,1,5
740,4.3,0.0,1779.0,29,242,325,3,2
676,12.1,0.0,2116.0,29,230,94,4,11
229,3.8,0.0,2460.0,4,268,110,2,1
727,4.4,0.0,2531.0,29,154,310,0,1
...,...,...,...,...,...,...,...,...
1161,17.2,0.0,2523.0,29,157,1,2,11
16,6.2,0.0,2137.0,29,154,282,0,2
477,-1.5,0.0,2387.0,42,76,38,2,1
94,22.3,0.0,2332.0,29,161,90,2,6
