In [89]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [90]:
# Location of the training CSV. The default is the original local path, so the
# notebook keeps working unchanged on the authoring machine; on any other
# machine set the TRAIN_CSV_PATH environment variable before starting the
# kernel instead of editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    'TRAIN_CSV_PATH',
    r'C:\Users\p\Desktop\open\open\train.csv',
)

# Raw training table; later cells read the columns ID, timestamp, item,
# corporation, location, supply(kg) and price(원/kg) from it.
data = pd.read_csv(TRAIN_CSV_PATH)

In [91]:
import datetime  # NOTE(review): not referenced in this cell; kept in case later cells rely on it.

# Parse the raw timestamp strings and derive calendar features from them.
# `assign` evaluates its keyword arguments in order, so every lambda after the
# first already sees the datetime-typed `timestamp` column.
data = data.assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
    year=lambda frame: frame['timestamp'].dt.year,
    month=lambda frame: frame['timestamp'].dt.month,
    day=lambda frame: frame['timestamp'].dt.day,
    dayofweek=lambda frame: frame['timestamp'].dt.dayofweek,  # Monday=0 ... Sunday=6
)

data

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day,dayofweek
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,3


In [92]:
# Categorical columns that are converted to integer codes.
encoding_columns = [
    'corporation',
    'location',
]

# Columns carried through unchanged. `item` stays as a string here because it
# is only used to filter rows, not as an encoded feature.
not_encoding_columns = [
    'item',
    'ID',
    'timestamp',
    'supply(kg)',
    'price(원/kg)',
    'year',
    'month',
    'day',
    'dayofweek',
]

In [93]:
# Categorical data encoding

# Filled in by `encoding_labels`: column name -> array of the original category
# values found in that column (the fitted LabelEncoder's `classes_`).
enc_classes = {}

def encoding_labels(x):
    """Integer-encode one categorical column and remember its categories.

    Parameters
    ----------
    x : named column (a pandas Series when called through ``DataFrame.apply``)
        Values to encode; ``x.name`` is used as the key in ``enc_classes``.

    Returns
    -------
    The integer codes produced by ``LabelEncoder`` for ``x``.

    Side effects
    ------------
    Overwrites ``enc_classes[x.name]`` with the fitted encoder's ``classes_``
    so the codes can later be mapped back to the original values.
    """
    encoder = LabelEncoder().fit(x)
    enc_classes[x.name] = encoder.classes_
    return encoder.transform(x)

In [94]:
# Encode the categorical columns one column at a time; each call also records
# that column's categories in `enc_classes`.
d1 = data.loc[:, encoding_columns].apply(encoding_labels, axis=0)
d1.head()

Unnamed: 0,corporation,location
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [95]:
# The columns that are passed through without encoding, in the order given by
# `not_encoding_columns`.
d2 = data.loc[:, not_encoding_columns]
d2.head()

Unnamed: 0,item,ID,timestamp,supply(kg),price(원/kg),year,month,day,dayofweek
0,TG,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1
1,TG,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,2,2
2,TG,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,3,3
3,TG,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,4,4
4,TG,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,5,5


In [98]:
# Put the encoded categorical columns and the untouched columns back into one
# frame, aligned on the row index; the encoded columns come first.
data = d1.merge(d2, how='left', left_index=True, right_index=True)
data

Unnamed: 0,corporation,location,item,ID,timestamp,supply(kg),price(원/kg),year,month,day,dayofweek
0,0,0,TG,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1
1,0,0,TG,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,2,2
2,0,0,TG,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,3,3
3,0,0,TG,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,4,4
4,0,0,TG,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
59392,5,0,RD,RD_F_J_20230227,2023-02-27,452440.0,468.0,2023,2,27,0
59393,5,0,RD,RD_F_J_20230228,2023-02-28,421980.0,531.0,2023,2,28,1
59394,5,0,RD,RD_F_J_20230301,2023-03-01,382980.0,574.0,2023,3,1,2
59395,5,0,RD,RD_F_J_20230302,2023-03-02,477220.0,523.0,2023,3,2,3


In [99]:
# Keep only the rows for item 'TG'. The explicit .copy() makes the subset an
# independent frame, so adding or modifying columns on `data` in later cells
# does not raise pandas' SettingWithCopyWarning. The original row index is
# preserved (not reset).
data = data[data['item'] == 'TG'].copy()
data

Unnamed: 0,corporation,location,item,ID,timestamp,supply(kg),price(원/kg),year,month,day,dayofweek
0,0,0,TG,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1
1,0,0,TG,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,2,2
2,0,0,TG,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,3,3
3,0,0,TG,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,4,4
4,0,0,TG,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
15225,4,1,TG,TG_E_S_20230227,2023-02-27,24204.0,3418.0,2023,2,27,0
15226,4,1,TG,TG_E_S_20230228,2023-02-28,13587.0,3141.0,2023,2,28,1
15227,4,1,TG,TG_E_S_20230301,2023-03-01,16187.0,4235.0,2023,3,1,2
15228,4,1,TG,TG_E_S_20230302,2023-03-02,17830.0,3960.0,2023,3,2,3
