In [1]:
# 라이브러리
import os
import warnings
from tqdm import tqdm

import pandas as pd 
import numpy as np 

from sklearn.model_selection import (
    KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, RepeatedKFold
)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import (
    BaggingRegressor, RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor,
)

warnings.filterwarnings("ignore")

In [2]:
data_list = os.listdir("./data")
data_list

['test_csv.csv',
 'train_csv.csv',
 'submission_baseline_rf.csv',
 'train_pickle.pkl',
 'submission.csv',
 'train_feather.ftr',
 'FIFA_train.csv',
 'FIFA_test.csv']

In [3]:
train = pd.read_feather("./data/" + data_list[5])
test = pd.read_csv("./data/" + data_list[0])

In [5]:
# 데이터 x,y 분리
X_train = train.drop("value", axis=1)
y_train = train["value"]

In [7]:
# kfold
kfold = KFold(n_splits=7, shuffle=True, random_state=120)

In [8]:
# 분류에서 사용, 
# stratifiedkfold = StratifiedKFold()

In [9]:
for i, (t, v) in enumerate(kfold.split(train)):
    
    # train, val 분리
    trn = train.iloc[t]
    val = train.iloc[v]
    
    # x, y 분리
    x_tr = trn.drop("value", axis=1)
    y_tr = trn["value"]
    
    x_val = val.drop("value", axis=1)
    y_val = val["value"]
    
    # 모델 학습
    rf = RandomForestRegressor(n_estimators=300, random_state=130)
    rf.fit(x_tr, y_tr)
    
    # 예측
    pred = rf.predict(x_val)
    pred = np.expm1(pred)
    
    y_val = np.expm1(y_val)
    
    # rmse
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    
    print(f"{i+1}번 모델 rmse : {rmse}")

1번 모델 rmse : 652619.7678497384
2번 모델 rmse : 621248.5911461753
3번 모델 rmse : 807486.4278201144
4번 모델 rmse : 452149.50608568726
5번 모델 rmse : 605271.4165033385
6번 모델 rmse : 871532.9223647178
7번 모델 rmse : 1449844.0548208528


In [11]:
# GridSearch
rf = RandomForestRegressor(random_state=120)

In [12]:
# parameters
params = {
    "n_estimators" : [300, 400, 500],
    "min_samples_split" : [2,3,4],
    "min_samples_leaf" : [1,2,3]
}

In [14]:
cv = KFold(n_splits=3, shuffle=True, random_state=120)

In [15]:
grid = GridSearchCV(
                    rf, 
                    param_grid=params,
                    cv=cv, 
                    scoring="neg_mean_squared_error", 
                    verbose=2,
                    n_jobs=-1
                   )

In [16]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


GridSearchCV(cv=KFold(n_splits=3, random_state=120, shuffle=True),
             estimator=RandomForestRegressor(random_state=120), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [300, 400, 500]},
             scoring='neg_mean_squared_error', verbose=2)

In [17]:
# Gridsearch 결과 확인
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)
print(grid.best_index_)

{'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 400}
RandomForestRegressor(min_samples_split=3, n_estimators=400, random_state=120)
-0.008861072314858711
4


In [18]:
RandomForestRegressor(**grid.best_params_)

RandomForestRegressor(min_samples_split=3, n_estimators=400)

In [19]:
# 서치 결과 가장 좋은 estimator로 학습
grid.best_estimator_.fit(X_train, y_train)

RandomForestRegressor(min_samples_split=3, n_estimators=400, random_state=120)

In [20]:
# 학습 후 예측
grid.best_estimator_.predict(test)

array([17.69301494, 18.1707276 , 18.02255398, ..., 10.96385975,
       10.74102437, 10.85496397])

In [21]:
# RandomSearch
rf = RandomForestRegressor(random_state=130)

In [22]:
from scipy.stats import randint, uniform, loguniform

In [24]:
params = {
    "n_estimators" : randint(100, 600) ,
    "min_samples_split" : randint(1,8),
    "min_samples_leaf" : randint(1,5)
}

In [25]:
# RepeatedKFold : KFrold를 반복해서 실행, n_repeats
cv = RepeatedKFold(n_splits=3, random_state=120, n_repeats=3)

In [26]:
random_search = RandomizedSearchCV(
                rf,
                param_distributions=params,
                cv = cv,
                n_iter=20,
                scoring="neg_mean_squared_error",
                verbose=1,
                n_jobs=-1
)

In [27]:
random_search.fit(X_train, y_train)

Fitting 9 folds for each of 20 candidates, totalling 180 fits


RandomizedSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=3, random_state=120),
                   estimator=RandomForestRegressor(random_state=130), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb92c018c10>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb95388ef10>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb92c03de10>},
                   scoring='neg_mean_squared_error', verbose=1)

In [32]:
print(random_search.best_estimator_)
print(random_search.best_params_)
print(random_search.best_index_)
print(random_search.best_score_)

RandomForestRegressor(n_estimators=427, random_state=130)
{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 427}
4
-0.0093443305564786


In [33]:
# 모델 앙상블 
rf_1 = grid.best_estimator_
rf_2 = random_search.best_estimator_

In [34]:
rf_1.fit(X_train, y_train)
rf_2.fit(X_train, y_train)

RandomForestRegressor(n_estimators=427, random_state=130)

In [35]:
pred_1 = rf_1.predict(test)
pred_2 = rf_2.predict(test)

In [38]:
pred = (pred_1 * 0.5) + (pred_2 * 0.5)
pred

array([17.70216861, 18.16888124, 18.02009815, ..., 10.96014314,
       10.73967529, 10.85261511])

In [43]:
np.random.seed(123)

In [44]:
# 앙상블 모델에 + bagging
prediction_list = [] # 예측값을 저장할 리스트
for _ in tqdm(range(10)):
    # data_index = X_train.index
    data_index = [ idx for idx in range(X_train.shape[0]) ] # X_train 길이만큼 index를 저장
    random_index = np.random.choice(data_index, X_train.shape[0], replace=True) # 인덱스 복원추출
    
    rf = RandomForestRegressor(**random_search.best_params_) # search하여 찾은 하이퍼파라미터를 넣어줌
    rf.fit(X_train.iloc[random_index, ], y_train.iloc[random_index, ]) # 모델 학습
    
    pred = rf.predict(test)
    pred = np.expm1(pred)
    
    prediction_list.append(pred)

100%|██████████| 10/10 [01:22<00:00,  8.21s/it]


In [47]:
prediction = []

for idx2 in range(test.shape[0]):
    
    temp = []
    # 각 array line by line 으로 평균을 내어 prediction에 저장
    for idx in range(len(prediction_list)):
        temp.append(prediction_list[idx][idx2])
    prediction.append(np.mean(temp))

In [53]:
prediction_list[0][0]

35986911.38283238

In [54]:
prediction_list[1][0]

88984489.75434314

In [56]:
df = pd.DataFrame({
    "p0" : prediction_list[0],
    "p1" : prediction_list[1],
    "p2" : prediction_list[2],
    "p3" : prediction_list[3],
    "p4" : prediction_list[4],
    "p5" : prediction_list[5],
    "p6" : prediction_list[6],
    "p7" : prediction_list[7],
    "p8" : prediction_list[8],
    "p9" : prediction_list[9],
})

In [62]:
df.head()

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9
0,35986910.0,88984490.0,49796020.0,48994960.0,69292220.0,30714530.0,42195390.0,51419360.0,58696630.0,40909760.0
1,80139620.0,77647230.0,73916620.0,78765330.0,77982760.0,71909720.0,79754800.0,78313720.0,81384640.0,66262850.0
2,69555570.0,59752840.0,64883640.0,61844220.0,58204390.0,68898250.0,68359440.0,72493970.0,68242630.0,62179180.0
3,67289640.0,65500250.0,68906910.0,64427750.0,63988740.0,70158170.0,69993900.0,71999450.0,75539360.0,62133750.0
4,62138110.0,55509600.0,55492300.0,60318310.0,54492300.0,55154690.0,57016040.0,64071880.0,70407240.0,59449860.0


In [59]:
np.mean(df.iloc[0])

51699027.76683371

In [60]:
prediction[0]

51699027.76683371

In [49]:
len(prediction)

3828

In [50]:
len(test)

3828