In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score

In [28]:
######--TEST
# X = np.array([[1, 2], [3, 4], [5, 6]])
# y = np.array([-3, 1, 10])
# clf = RandomForestRegressor(n_estimators=100)
# clf.fit(X, y)
# predictions = clf.predict(X)
# predictions

In [29]:
##-- Данные о ракушках
data = pd.read_csv('abalone.csv')
data

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
##-- Sex переводим в числовой вид
if isinstance(data['Sex'][0], np.int64) == False:
    data['Sex'] = data['Sex'].map(lambda x: -1 if x=='F' else (0 if x=='I' else 1))

In [6]:
##-- Признаки
y_true = data['Rings']
X = data.iloc[:, :-1]
y_true

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: Rings, Length: 4177, dtype: int64

In [34]:
##-- Объекты
X

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight
0,1,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,-1,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,1,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...,...
4172,-1,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,1,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,1,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,-1,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [24]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
list_tree = []
list_count_tree = []
for i in range(1, 51):
    random_forest_clf = RandomForestRegressor(random_state=1, n_estimators=i)
    cvs = cross_val_score(random_forest_clf, X, y_true, cv=kf, scoring='r2')
    mean_quality = cvs.mean()
    if mean_quality > 0.52:
        list_count_tree.append(i)
        list_tree.append(mean_quality)
#     random_forest_clf.fit(X, y_true)
#     prediction = random_forest_clf.predict(X)
#     quality = r2_score(y_true, prediction)
print("List count tree: " + str(list_count_tree))
print("List tree: " + str(list_tree))
print("Min trees: " + str(min(list_tree)))
print("Min count trees: " + str(min(list_count_tree)))

List count tree: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
List tree: [0.520529096463528, 0.5208044230080824, 0.521742855685855, 0.5231059969795335, 0.5232486470488318, 0.5243076139284634, 0.5246393588459404, 0.5256557724971402, 0.5265556293057552, 0.5270858715838138, 0.5276420438225101, 0.5289244806388986, 0.5301073722643779, 0.5299613734264366, 0.5298209779129148, 0.529910050667947, 0.5294320415136227, 0.5294715388671245, 0.529515898349607, 0.5294703580378128, 0.5298087685207094, 0.5300433306143383, 0.5299135764090978, 0.5296814957917958, 0.528908112349864, 0.5290083070325597, 0.5291786571646144, 0.5300509396315634, 0.530813058616495, 0.5309509147417047]
Min trees: 0.520529096463528
Min count trees: 21


In [26]:
##-- Кроссвалидация для разного количества деревьев
for i in range(1, 51):
    random_forest_clf = RandomForestRegressor(random_state=1, n_estimators=i)
    cvs = cross_val_score(random_forest_clf, X, y_true, cv=kf, scoring='r2')
    mean_quality = cvs.mean()
    print(str(i) + " : " + str(mean_quality))

1 : 0.10967482068860261
2 : 0.3413000096365689
3 : 0.406433829066129
4 : 0.4447745857536912
5 : 0.46503241426823594
6 : 0.47139595825898917
7 : 0.4766658451893487
8 : 0.4829348394224631
9 : 0.4894370162945041
10 : 0.4954085552428177
11 : 0.4944111155773555
12 : 0.49902817866563326
13 : 0.5030578549564464
14 : 0.5073168234618861
15 : 0.5091809969556578
16 : 0.5114105314179662
17 : 0.5148917747729636
18 : 0.5172203573170132
19 : 0.5198293095329432
20 : 0.51948435033775
21 : 0.520529096463528
22 : 0.5208044230080824
23 : 0.521742855685855
24 : 0.5231059969795335
25 : 0.5232486470488318
26 : 0.5243076139284634
27 : 0.5246393588459404
28 : 0.5256557724971402
29 : 0.5265556293057552
30 : 0.5270858715838138
31 : 0.5276420438225101
32 : 0.5289244806388986
33 : 0.5301073722643779
34 : 0.5299613734264366
35 : 0.5298209779129148
36 : 0.529910050667947
37 : 0.5294320415136227
38 : 0.5294715388671245
39 : 0.529515898349607
40 : 0.5294703580378128
41 : 0.5298087685207094
42 : 0.5300433306143383
43 :

In [41]:
##-- Спрогнозированный возраст ракушек
forest_clf = RandomForestRegressor(random_state=1, n_estimators=50)
forest_clf.fit(X, y_true)
y_score = forest_clf.predict(X)
y_score

array([12.66,  7.16,  9.44, ...,  9.6 ,  9.76, 11.86])

In [42]:
##-- Критерий качества - 91% верно
r2_score(y_score, y_true)

0.9137627298156976