In [1]:
import pandas as pd
# Published Google Sheets workbook, requested in .xlsx form (output=xlsx).
DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQjMc_wQgzuMRjgU4LLBwgAekcaV2Bnz6mnFmU0Fkum-4oJWbhzOd5T1vWdBKWueg/pub?output=xlsx'

# Read the workbook into a DataFrame and preview its first rows.
df = pd.read_excel(DATA_URL)
df.head()

Unnamed: 0,지점,일시,월,일,시간,풍속(m/s),풍향(deg),GUST풍속(m/s),현지기압(hPa),습도(%),기온(°C),수온(°C),최대파고(m),유의파고(m),평균파고(m),파주기(sec),파향(deg),타겟최대파고(m),타겟평균파고(m),타겟파주기(sec)
0,22107,2020-03-01 01:00:00,3,1,1,7.2,80.0,9.0,1016.6,77,14.3,16.1,1.7,1.2,0.8,5.8,116.0,0.5,0.2,5.8
1,22107,2020-03-01 02:00:00,3,1,2,8.6,82.0,10.1,1016.1,77,14.7,16.2,1.6,1.1,0.8,4.9,115.0,0.5,0.2,5.5
2,22107,2020-03-01 03:00:00,3,1,3,8.4,80.0,10.1,1016.1,76,14.8,16.1,1.6,1.1,0.8,4.9,102.0,0.4,0.2,5.5
3,22107,2020-03-01 04:00:00,3,1,4,8.5,71.0,10.7,1015.3,83,13.8,16.1,1.4,1.0,0.7,3.8,112.0,0.5,0.3,5.8
4,22107,2020-03-01 05:00:00,3,1,5,7.6,82.0,9.4,1015.9,75,14.4,16.1,1.7,1.1,0.7,4.0,102.0,0.5,0.2,5.3


In [2]:
import datetime as dt

# Parse the '일시' column as datetimes, then replace every value with its
# proleptic Gregorian ordinal. toordinal() keeps only the calendar date, so
# the time-of-day part of the timestamp is not carried into this column.
df['일시'] = pd.to_datetime(df['일시']).map(dt.datetime.toordinal)

# Show the resulting dtype of every column.
df.dtypes

지점               int64
일시               int64
월                int64
일                int64
시간               int64
풍속(m/s)        float64
풍향(deg)        float64
GUST풍속(m/s)    float64
현지기압(hPa)      float64
습도(%)            int64
기온(°C)         float64
수온(°C)         float64
최대파고(m)        float64
유의파고(m)        float64
평균파고(m)        float64
파주기(sec)       float64
파향(deg)        float64
타겟최대파고(m)      float64
타겟평균파고(m)      float64
타겟파주기(sec)     float64
dtype: object

In [3]:
from sklearn.model_selection import train_test_split

# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

# Split the cleaned rows 80% / 20% into training and test sets.
# random_state pins the split so it is repeatable between runs.
train, test = train_test_split(df, random_state=10, train_size=0.8)

# Report how many rows ended up on each side of the split.
print(train.shape[0], test.shape[0])

6383 1596


In [4]:
# Names of the three columns to be predicted.
target1 = '타겟평균파고(m)'
target2 = '타겟파주기(sec)'
target3 = '타겟최대파고(m)'

# Every column that is not one of the targets is used as a model input.
data = df.drop(columns=[target1, target2, target3])
features = list(data.columns)

# The same feature columns are used as inputs for all three targets.
X_train, X_test = train[features], test[features]

# One train/test label pair per target.
y_train1, y_test1 = train[target1], test[target1]
y_train2, y_test2 = train[target2], test[target2]
y_train3, y_test3 = train[target3], test[target3]

# Quick look at the first two training labels of each target.
print(y_train1.head(2), y_train2.head(2), y_train3.head(2))

3270    0.5
8375    0.3
Name: 타겟평균파고(m), dtype: float64 3270    6.1
8375    5.0
Name: 타겟파주기(sec), dtype: float64 3270    1.2
8375    0.7
Name: 타겟최대파고(m), dtype: float64


### 평균파고 예측 모델

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Coarse hyper-parameter search for the model fitted on y_train1.
# The two lists below define 6 x 5 = 30 combinations; with n_iter=30 and
# list-valued parameters the randomized search covers every one of them.
dists = {
    'max_depth': [10, 20, 40, 50, 100, 120],
    'n_estimators': [100, 110, 150, 170, 250]
}

# Base estimator; each candidate overrides max_depth and n_estimators.
rf = RandomForestRegressor(n_jobs=-1, random_state=10, oob_score=True)

model = RandomizedSearchCV(
    rf,
    param_distributions=dists,
    n_iter=30,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=-1,
    random_state=10,  # make the candidate sampling repeatable between runs
)

model.fit(X_train, y_train1)
print('최적의 하이퍼 파라미터:', model.best_params_)
# best_score_ comes from the 'neg_mean_absolute_error' scorer, so it is the
# negated MAE (higher is better). Label it the way the grid-search cells do.
print("neg MAE: ", model.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  5.8min finished


최적의 하이퍼 파라미터: {'n_estimators': 250, 'max_depth': 40}
MAE:  -0.06928952965394215


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search for the y_train1 model: 5 x 5 = 25 combinations.
dists = dict(
    max_depth=[30, 35, 40, 45, 50],
    n_estimators=[250, 300, 350, 400, 450],
)

# Base estimator; the grid overrides max_depth and n_estimators per candidate.
rf = RandomForestRegressor(oob_score=True, random_state=10, n_jobs=-1)

# 3-fold cross-validation, scored by negated mean absolute error.
search = GridSearchCV(
    estimator=rf,
    param_grid=dists,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train1)

print('최적의 하이퍼 파라미터:', search.best_params_)
print("neg MAE: ", search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 11.3min finished


최적의 하이퍼 파라미터: {'max_depth': 30, 'n_estimators': 350}
neg MAE:  -0.06918281981337576


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Model for target 1 with fixed max_depth=30 and n_estimators=350, fitted on
# the training split and scored on the held-out test split.
rf = RandomForestRegressor(
    n_estimators=350,
    max_depth=30,
    oob_score=True,
    random_state=10,
    n_jobs=-1,
)

# fit() returns the estimator itself, so the prediction can be chained.
pred = rf.fit(X_train, y_train1).predict(X_test)
print("test MAE", mean_absolute_error(y_test1, pred))

test MAE 0.058865244671491394


### 파주기 예측 모델

In [18]:
# Coarse hyper-parameter search for the model fitted on y_train2.
# The two lists below define 6 x 5 = 30 combinations; with n_iter=30 and
# list-valued parameters the randomized search covers every one of them.
dists = {
    'max_depth': [10, 20, 40, 50, 100, 120],
    'n_estimators': [100, 110, 150, 170, 250]
}

# Base estimator; each candidate overrides max_depth and n_estimators.
rf = RandomForestRegressor(n_jobs=-1, random_state=10, oob_score=True)

model = RandomizedSearchCV(
    rf,
    param_distributions=dists,
    n_iter=30,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=-1,
    random_state=10,  # make the candidate sampling repeatable between runs
)

model.fit(X_train, y_train2)
print('최적의 하이퍼 파라미터:', model.best_params_)
# best_score_ comes from the 'neg_mean_absolute_error' scorer, so it is the
# negated MAE (higher is better). Label it the way the grid-search cells do.
print("neg MAE: ", model.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  6.5min finished


최적의 하이퍼 파라미터: {'n_estimators': 250, 'max_depth': 40}
MAE:  -0.4607923945382966


In [19]:
# Exhaustive grid search for the y_train2 model: 6 x 5 = 30 combinations.
dists = dict(
    max_depth=[25, 30, 35, 40, 45, 48],
    n_estimators=[250, 300, 350, 400, 450],
)

# Base estimator; the grid overrides max_depth and n_estimators per candidate.
rf = RandomForestRegressor(oob_score=True, random_state=10, n_jobs=-1)

# 3-fold cross-validation, scored by negated mean absolute error.
search = GridSearchCV(
    estimator=rf,
    param_grid=dists,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train2)

print('최적의 하이퍼 파라미터:', search.best_params_)
print("neg MAE: ", search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 15.8min finished


최적의 하이퍼 파라미터: {'max_depth': 35, 'n_estimators': 300}
neg MAE:  -0.4604840160773294


In [21]:
# Fine-grained grid search for the y_train2 model.
# range(31, 40) gives 9 depths and range(321, 349) gives 28 tree counts,
# i.e. 252 candidates and 756 fits with 3-fold cross-validation.
dists = dict(
    max_depth=range(31, 40),
    n_estimators=range(321, 349),
)

# Base estimator; the grid overrides max_depth and n_estimators per candidate.
rf = RandomForestRegressor(oob_score=True, random_state=10, n_jobs=-1)

# 3-fold cross-validation, scored by negated mean absolute error.
search = GridSearchCV(
    estimator=rf,
    param_grid=dists,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train2)

print('최적의 하이퍼 파라미터:', search.best_params_)
print("neg MAE: ", search.best_score_)

Fitting 3 folds for each of 252 candidates, totalling 756 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 74.7min
[Parallel(n_jobs=-1)]: Done 756 out of 756 | elapsed: 126.7min finished


최적의 하이퍼 파라미터: {'max_depth': 33, 'n_estimators': 341}
neg MAE:  -0.46037518693557455


### 최대파고 예측 모델

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Coarse hyper-parameter search for the model fitted on y_train3.
# The two lists below define 6 x 5 = 30 combinations; with n_iter=30 and
# list-valued parameters the randomized search covers every one of them.
dists = {
    'max_depth': [10, 20, 40, 50, 100, 120],
    'n_estimators': [100, 110, 150, 170, 250]
}

# Base estimator; each candidate overrides max_depth and n_estimators.
rf = RandomForestRegressor(n_jobs=-1, random_state=10, oob_score=True)

model = RandomizedSearchCV(
    rf,
    param_distributions=dists,
    n_iter=30,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=-1,
    random_state=10,  # make the candidate sampling repeatable between runs
)

model.fit(X_train, y_train3)
print('최적의 하이퍼 파라미터:', model.best_params_)
# best_score_ comes from the 'neg_mean_absolute_error' scorer, so it is the
# negated MAE (higher is better). Label it the way the grid-search cells do.
print("neg MAE: ", model.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  6.2min finished


최적의 하이퍼 파라미터: {'n_estimators': 170, 'max_depth': 40}
MAE:  -0.1525658406348707


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search for the y_train3 model: 5 x 5 = 25 combinations.
dists = dict(
    max_depth=[30, 35, 40, 45, 47],
    n_estimators=[160, 170, 190, 220, 240],
)

# Base estimator; the grid overrides max_depth and n_estimators per candidate.
rf = RandomForestRegressor(oob_score=True, random_state=10, n_jobs=-1)

# 3-fold cross-validation, scored by negated mean absolute error.
search = GridSearchCV(
    estimator=rf,
    param_grid=dists,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train3)

print('최적의 하이퍼 파라미터:', search.best_params_)
print("neg MAE: ", search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  6.8min finished


최적의 하이퍼 파라미터: {'max_depth': 40, 'n_estimators': 190}
neg MAE:  -0.15236647494165345


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Narrower grid search for the y_train3 model: 4 x 6 = 24 combinations.
dists = dict(
    max_depth=[36, 38, 40, 42],
    n_estimators=[175, 180, 190, 200, 210, 215],
)

# Base estimator; the grid overrides max_depth and n_estimators per candidate.
rf = RandomForestRegressor(oob_score=True, random_state=10, n_jobs=-1)

# 3-fold cross-validation, scored by negated mean absolute error.
search = GridSearchCV(
    estimator=rf,
    param_grid=dists,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train3)

print('최적의 하이퍼 파라미터:', search.best_params_)
print("neg MAE: ", search.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  6.5min finished


최적의 하이퍼 파라미터: {'max_depth': 36, 'n_estimators': 190}
neg MAE:  -0.15236545814527655
