In [18]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score

In [6]:
data = pd.read_csv('D:/Work/Data_files/working_dir/abalone.csv')
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [8]:
data['Sex'] = data['Sex'].map(lambda x: 1 if x=='M' else(-1 if x=='F' else 0))
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [13]:
X = data.iloc[:, 0:-1]
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [17]:
y = data.Rings
y.head()

0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64

In [71]:
random_state = 42
n_jobs = 6    # 6-core processor used
kf = KFold(n_splits=5, random_state=random_state, shuffle=True)
res_score = []
for tree_number in range(1,51):
    regr = RandomForestRegressor(n_estimators=tree_number,
                                random_state=random_state,
                                n_jobs=n_jobs)
    res_score.append([tree_number] +
                     list(cross_val_score(regr, X, y, cv=kf, n_jobs=n_jobs, scoring='r2')))
result_df = pd.DataFrame(res_score, columns=['tree_number', 'cv_1',
                                            'cv_2', 'cv_3', 'cv_4', 'cv_5'])
result_df.head()

Unnamed: 0,tree_number,cv_1,cv_2,cv_3,cv_4,cv_5
0,1,0.077335,-0.047625,0.056594,0.099866,-0.14042
1,2,0.307946,0.226215,0.324561,0.344726,0.231656
2,3,0.36791,0.302724,0.415085,0.391367,0.304236
3,4,0.403169,0.360326,0.447602,0.429937,0.383793
4,5,0.426949,0.389591,0.473508,0.453257,0.396697


In [72]:
result_df['mean_cv'] = round(result_df.iloc[:, 1:6].mean(axis=1), 2)
result_df.head()

Unnamed: 0,tree_number,cv_1,cv_2,cv_3,cv_4,cv_5,mean_cv
0,1,0.077335,-0.047625,0.056594,0.099866,-0.14042,0.01
1,2,0.307946,0.226215,0.324561,0.344726,0.231656,0.29
2,3,0.36791,0.302724,0.415085,0.391367,0.304236,0.36
3,4,0.403169,0.360326,0.447602,0.429937,0.383793,0.4
4,5,0.426949,0.389591,0.473508,0.453257,0.396697,0.43


In [73]:
result_df.query('mean_cv>0.51').sort_values('mean_cv')

Unnamed: 0,tree_number,cv_1,cv_2,cv_3,cv_4,cv_5,mean_cv
21,22,0.523561,0.482519,0.548665,0.530578,0.493155,0.52
33,34,0.533317,0.490584,0.558901,0.533104,0.508877,0.52
32,33,0.532195,0.491734,0.558275,0.533358,0.507519,0.52
31,32,0.530993,0.492152,0.557364,0.532799,0.506595,0.52
29,30,0.530972,0.490311,0.55582,0.533169,0.505213,0.52
28,29,0.527842,0.489953,0.556424,0.533133,0.504004,0.52
30,31,0.530761,0.490481,0.557273,0.534013,0.506453,0.52
26,27,0.524657,0.488528,0.555632,0.532186,0.50028,0.52
25,26,0.525679,0.489025,0.555393,0.530904,0.497079,0.52
24,25,0.52552,0.489163,0.555704,0.530756,0.494498,0.52
