In [37]:
import os 
import pandas as pd
import numpy as np 
from tqdm import tqdm

In [20]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
data_list = os.listdir("./data")
data_list

['test_csv.csv',
 'train_csv.csv',
 'submission_baseline_rf.csv',
 'train_pickle.pkl',
 'submission.csv',
 'train_feather.ftr',
 'FIFA_train.csv',
 'FIFA_test.csv']

In [40]:
data = pd.read_feather("./data/"+data_list[5])
test = pd.read_csv("./data/test_csv.csv")

In [5]:
data.head()

Unnamed: 0,age,contract_until,reputation,stat_overall,stat_potential,stat_skill_moves,value,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right,continent_africa,continent_asia,continent_europe,continent_oceania,continent_south america,stat_potent,retire_age_left
0,31,3,5.0,94,94,4.0,18.520526,0,0,0,1,1,0,0,0,0,0,1,0,4
1,27,2,4.0,91,93,1.0,18.092177,0,1,0,0,0,1,0,0,1,0,0,2,8
2,31,3,5.0,91,91,3.0,18.197537,0,0,0,1,0,1,0,0,0,0,1,0,4
3,32,2,4.0,91,91,3.0,17.747336,1,0,0,0,0,1,0,0,1,0,0,0,3
4,25,3,3.0,90,93,1.0,18.035018,0,1,0,0,0,1,0,0,1,0,0,3,10


In [7]:
data.shape

(8932, 20)

In [8]:
X_train = data.drop("value", axis=1)
y_train = data.value

In [18]:
kfold = KFold(n_splits=7, random_state=130, shuffle=True)

In [19]:
for i, (t, v) in enumerate(kfold.split(data)):
    
    train = data.iloc[t]
    val = data.iloc[v]
    
    x_tr = train.drop("value", axis=1)
    y_tr = train.value
    
    x_val = val.drop("value", axis=1)
    y_val = val.value
    
    rf = RandomForestRegressor(n_estimators=300, random_state=130)
    rf.fit(x_tr, y_tr)
    
    pred = rf.predict(x_val)
    pred = np.expm1(pred)
    
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    
    print(f"{i+1}번 rmse : {rmse}")
    

1번 rmse : 6419072.381470054
2번 rmse : 5616388.309182598
3번 rmse : 5743746.892809601
4번 rmse : 6492984.316564658
5번 rmse : 6078373.124192117
6번 rmse : 6807325.622215642
7번 rmse : 6492402.594953468


In [21]:
rf = RandomForestRegressor()
params = {
    'n_estimators' : [300, 400, 500],
    'min_samples_split' : [2,3,4],
    'min_samples_leaf' : [1,2,3]
}

In [28]:
kfold = KFold(n_splits=3, random_state=130, shuffle=True)

In [29]:
grid = GridSearchCV(rf, param_grid=params, cv = kfold, scoring="neg_mean_squared_error", verbose=2 ,n_jobs=-1)

In [30]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


GridSearchCV(cv=KFold(n_splits=3, random_state=130, shuffle=True),
             estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [300, 400, 500]},
             scoring='neg_mean_squared_error', verbose=2)

In [31]:
print(grid.best_params_)

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [32]:
print(grid.best_estimator_)

RandomForestRegressor(n_estimators=500)


In [33]:
print(grid.best_score_)

-0.009175689420706954


In [41]:
prediction_list = []
for _ in tqdm(range(10)):
    data_index = [ idx for idx in range(X_train.shape[0])]
    random_index = np.random.choice(data_index, X_train.shape[0]) # 복원추출
    rf = RandomForestRegressor(**grid.best_params_)
    rf.fit(X_train.iloc[random_index, ], y_train.iloc[random_index,])
    pred = rf.predict(test)
    pred = np.expm1(pred)
    prediction_list.append(pred)

100%|██████████| 10/10 [01:33<00:00,  9.30s/it]


In [42]:
prediction = [] # 예측 값 리스트
for idx2 in range(test.shape[0]): # test의 index 만큼 iteration
    temp = []
    # 각 array line by line 으로 평균 내어 prediction에 저장
    for idx in range(len(prediction_list)): 
        temp.append(prediction_list[idx][idx2])
    prediction.append(np.mean(temp))

In [44]:
# prediction