In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# Load the tab-separated raw export and keep only rows that have a delivery
# timestamp; the target computed later needs 'actual_delivery_time'.
df = pd.read_csv('delivery_raw.csv', sep='\t').dropna(subset=["actual_delivery_time"])

In [9]:
# Display the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift,total_busy,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2015-02-17 00:19:41,2015-02-17 01:24:48,2956,fast,4.0,3,1389,3,345,649,17.0,17.0,23.0,251,331.0
197424,1.0,2015-02-13 00:01:59,2015-02-13 00:58:22,2956,fast,4.0,6,3010,4,405,825,12.0,11.0,14.0,251,915.0
197425,1.0,2015-01-24 04:46:08,2015-01-24 05:36:16,2956,fast,4.0,5,1836,3,300,399,39.0,41.0,40.0,251,795.0
197426,1.0,2015-02-01 18:18:15,2015-02-01 19:23:22,3630,sandwich,1.0,1,1175,1,535,535,7.0,7.0,12.0,446,384.0


In [10]:
# Build the regression target: parse both timestamp columns, take their
# difference in seconds as 'delivery_time', then discard the raw timestamps so
# they are not used as features.
df = (
    df.assign(
        created_at=lambda frame: pd.to_datetime(frame['created_at']),
        actual_delivery_time=lambda frame: pd.to_datetime(frame['actual_delivery_time']),
    )
    .assign(
        delivery_time=lambda frame: (
            frame['actual_delivery_time'] - frame['created_at']
        ).dt.total_seconds()
    )
    .drop(columns=['created_at', 'actual_delivery_time'])
)

In [12]:
# Integer-encode the categorical feature columns.
#
# A separate LabelEncoder is fitted per column and kept in `encoders`, so every
# column's fitted mapping (`classes_`) stays available for `inverse_transform`
# or for encoding new data. Re-fitting one shared encoder would overwrite the
# mapping on each `fit_transform` call and keep only the last column's mapping.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# consider OrdinalEncoder for features, and fitting on the training split only.
categorical_columns = ['market_id', 'store_id', 'store_primary_category', 'order_protocol']

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `encoder` still ends up fitted on the last encoded column.
encoder = encoders['order_protocol']

In [13]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [14]:
# Separate the features (every column except the target) from the
# 'delivery_time' target, for both the training and the test split.
X_train, y_train = train_df.drop(columns='delivery_time'), train_df['delivery_time']
X_test, y_test = test_df.drop(columns='delivery_time'), test_df['delivery_time']

In [15]:
# Fit a random forest on the training split.
# `random_state` seeds the forest so the fit is reproducible; `n_jobs=-1` asks
# scikit-learn to use all available CPU cores for fitting and prediction
# instead of a single core.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [16]:
# Predict the delivery time (the 'delivery_time' target) for each held-out test row.
predictions = model.predict(X_test)

In [17]:
# Mean squared error on the test split, and its square root (RMSE), which is in
# the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)

## 데이터 전처리와 속성 생성

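'actual_delivery_time' 이 결측인 행을 제거한 뒤, 'actual_delivery_time' 에서 'created_at' 을 뺀 값(초 단위)을 계산하여 목표 변수인 'delivery_time' 속성을 생성하고, 원본 시각 속성 두 개('created_at', 'actual_delivery_time')는 제거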

LabelEncoder 를 사용해 범주형 변수('market_id', 'store_id', 'store_primary_category', 'order_protocol')를 정수로 인코딩

## 학습을 위해 사용한 모델과 손실함수

모델 : RandomForestRegressor

손실함수 : MSE (RandomForestRegressor 의 기본 분할 기준인 평균제곱오차)

## 평가 지표

In [20]:
# Report RMSE together with the share of test rows whose actual delivery time
# exceeded the prediction (i.e. the model under-predicted).
is_under_predicted = np.asarray(y_test) > predictions
under_prediction_ratio = is_under_predicted.sum() / len(y_test)

print("RMSE : ", rmse)
print("Under-Prediction ratio : ", under_prediction_ratio)

RMSE :  2820.2760324928336
Under-Prediction ratio :  0.408448564048017
