In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression # 선형회귀
from sklearn.neighbors import KNeighborsRegressor # KNN 회귀
from sklearn.tree import DecisionTreeRegressor # 결정트리회귀
from sklearn.ensemble import RandomForestRegressor # 랜덤포레스트 회귀
from xgboost import XGBRegressor # GBT 병행학습
from lightgbm import LGBMRegressor # GBT
from sklearn.model_selection import cross_val_score # 교차검증
from sklearn.model_selection import KFold # KFold 교차검증

In [2]:
total = pd.read_csv('./data/2013-2018년_가스공급량과_기온2.csv')

In [3]:
total = total.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', '연월일'])

In [4]:
total

Unnamed: 0,시간,구분,공급량,year,month,day,weekday,기온
0,1,0,2497.129,2013,1,1,1,-6.4
1,1,1,2169.093,2013,1,1,1,-6.4
2,1,2,226.178,2013,1,1,1,-6.4
3,1,3,1434.516,2013,1,1,1,-6.4
4,1,4,3272.837,2013,1,1,1,-6.4
...,...,...,...,...,...,...,...,...
368083,24,2,237.911,2018,12,31,0,-2.9
368084,24,3,1422.478,2018,12,31,0,-2.9
368085,24,4,3534.260,2018,12,31,0,-2.9
368086,24,5,3982.757,2018,12,31,0,-2.9


In [5]:
X = total.drop(columns=['공급량'])
y = total['공급량']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [7]:
X_train

Unnamed: 0,시간,구분,year,month,day,weekday,기온
111594,7,0,2014,10,27,0,8.7
140874,13,6,2015,4,19,6,17.1
278684,21,0,2017,7,17,0,24.1
338641,18,2,2018,7,9,0,25.0
80117,22,2,2014,4,22,1,10.0
...,...,...,...,...,...,...,...
359783,14,4,2018,11,12,0,13.7
358083,11,5,2018,11,2,4,13.7
152315,16,2,2015,6,26,4,23.4
117952,3,2,2014,12,4,3,2.4


In [8]:
y_train

111594    1006.209
140874     210.060
278684     665.529
338641       2.756
80117        1.378
            ...   
359783    1256.708
358083    2583.374
152315      92.578
117952     154.178
305711    2206.809
Name: 공급량, Length: 276066, dtype: float64

In [9]:
X_test

Unnamed: 0,시간,구분,year,month,day,weekday,기온
341261,8,4,2018,7,25,2,26.5
209561,10,2,2016,6,1,2,23.4
254423,11,1,2017,2,23,3,3.5
25367,24,6,2013,5,31,4,17.3
271447,19,1,2017,6,4,6,24.0
...,...,...,...,...,...,...,...
145472,22,5,2015,5,16,5,14.3
12603,1,3,2013,3,17,6,0.7
202648,6,5,2016,4,21,3,15.0
148460,17,4,2015,6,3,2,24.8


In [10]:
y_test

341261     780.401
209561      95.778
254423    2034.405
25367      113.444
271447     445.813
            ...   
145472    1225.581
12603      766.516
202648    1420.685
148460     665.477
37239      102.332
Name: 공급량, Length: 92022, dtype: float64

In [11]:
model_linear = LinearRegression()
model_knn = KNeighborsRegressor()
model_tree = DecisionTreeRegressor()
model_random = RandomForestRegressor()
model_xgb = XGBRegressor()
model_lgb = LGBMRegressor()

In [12]:
model_linear.fit(X_train, y_train)
model_knn.fit(X_train, y_train)
model_tree.fit(X_train, y_train)
model_random.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

LGBMRegressor()

In [13]:
print(model_linear.score(X_test, y_test))
print(model_knn.score(X_test, y_test))
print(model_tree.score(X_test, y_test))
print(model_random.score(X_test, y_test))
print(model_xgb.score(X_test, y_test))
print(model_lgb.score(X_test, y_test))

0.2687987288593773
0.6131867256659032
0.9672279011646372
0.9845068056392455
0.9861678191556988
0.979456387828273


In [14]:
scores = cross_val_score(model_linear, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_linear,scores))
scores = cross_val_score(model_knn, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_knn,scores))
scores = cross_val_score(model_tree, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_tree,scores))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))
scores = cross_val_score(model_xgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_xgb,scores))
scores = cross_val_score(model_lgb, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_lgb,scores))

LinearRegression() 교차 검증 점수 : [0.27661319 0.27018589 0.26650212 0.26505649 0.26510171]
KNeighborsRegressor() 교차 검증 점수 : [0.48543325 0.4658092  0.46780046 0.46296974 0.45925029]
DecisionTreeRegressor() 교차 검증 점수 : [0.94938763 0.95396348 0.94864897 0.95341543 0.95116463]
RandomForestRegressor() 교차 검증 점수 : [0.97690562 0.97741997 0.97385458 0.97753339 0.97737777]
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 교차 검증 점수 

In [15]:
model_random = RandomForestRegressor(max_depth= None, max_features= 'auto', min_samples_split= 0.01, n_estimators= 250)

In [16]:
model_random.fit(X_train, y_train)
print(model_random.score(X_test, y_test))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))

0.9233511477993601
RandomForestRegressor(min_samples_split=0.01, n_estimators=250) 교차 검증 점수 : [0.92806809 0.9287181  0.92277428 0.93012671 0.92705781]


In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 9, 12],
    'min_samples_split': [0.01, 0.05, 0.1]
}

In [19]:
kf = KFold(random_state=30,
           n_splits=10,
           shuffle=True,
          )

In [20]:
grid_search = GridSearchCV(estimator=model_random, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=-1, 
                           verbose=2
                          )

In [21]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


GridSearchCV(cv=KFold(n_splits=10, random_state=30, shuffle=True),
             estimator=RandomForestRegressor(min_samples_split=0.01,
                                             n_estimators=250),
             n_jobs=-1,
             param_grid={'max_depth': [3, 6, 9, 12],
                         'min_samples_split': [0.01, 0.05, 0.1],
                         'n_estimators': [100, 150, 200]},
             verbose=2)

In [22]:
grid_search.best_params_

{'max_depth': 12, 'min_samples_split': 0.01, 'n_estimators': 200}

In [11]:
model_random = RandomForestRegressor(max_depth= 12, max_features= 'auto', min_samples_split= 0.01, n_estimators= 200)

In [12]:
model_random.fit(X_train, y_train)
print(model_random.score(X_test, y_test))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))

0.923316646021543
RandomForestRegressor(max_depth=12, min_samples_split=0.01, n_estimators=200) 교차 검증 점수 : [0.92840213 0.92875514 0.92194443 0.92991874 0.92646072]


In [28]:
model_random = RandomForestRegressor(max_depth=None, max_features= 'auto', min_samples_split= 0.01, n_estimators= 200)

In [29]:
model_random.fit(X_train, y_train)
print(model_random.score(X_test, y_test))
scores = cross_val_score(model_random, X_test, y_test)
print("{} 교차 검증 점수 : {}".format(model_random,scores))

0.9238053441718781
RandomForestRegressor(min_samples_split=0.01, n_estimators=200) 교차 검증 점수 : [0.92848042 0.92841059 0.92261004 0.92998473 0.92675123]


In [13]:
sub = pd.read_csv('./data/sample_submission.csv')

In [14]:
sub['일자'] = sub['일자|시간|구분'].str.split().str[0]
sub['시간'] = sub['일자|시간|구분'].str.split().str[1]
sub['구분'] = sub['일자|시간|구분'].str.split().str[2]
d_map = {}
for i, d in enumerate(sub['구분'].unique()):
    d_map[d] = i
sub['구분'] = sub['구분'].map(d_map)
sub['일자'] = pd.to_datetime(sub['일자'])
sub['year'] = sub['일자'].dt.year
sub['month'] = sub['일자'].dt.month
sub['day'] = sub['일자'].dt.day
sub['weekday'] = sub['일자'].dt.weekday
X_sub = sub[["시간", '구분', 'year', 'month', 'day', 'weekday']]
X_sub['기온']

In [15]:
pred = model_random.predict(X_sub)

ValueError: X has 6 features, but DecisionTreeRegressor is expecting 7 features as input.