## Getting Data

In [2]:
import os 
import requests 

DATASET = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names"
)

def download_data(path='data', urls=DATASET):
    if not os.path.exists(path):
        os.mkdir(path) 
    
    for url in urls:
        response = requests.get(url)
        name = os.path.basename(url) 
        with open(os.path.join(path, name), 'wb') as f: 
            f.write(response.content)

download_data()

## Load Data

In [5]:
import pandas as pd

columns = [
    "sex",
    "length",
    "diameter",
    "height",
    "wweight",
    "shuweight",
    "vweight",
    "sheweight",
    "rings"
]

data = pd.read_csv('data/abalone.data', names=columns)

## Check it out

In [7]:
data.head(10)

Unnamed: 0,sex,length,diameter,height,wweight,shuweight,vweight,sheweight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [8]:
data.describe()

Unnamed: 0,length,diameter,height,wweight,shuweight,vweight,sheweight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


## Features and Targets

In [9]:
y = data["rings"]

In [10]:
y

0       15
1        7
2        9
3       10
4        7
5        8
6       20
7       16
8        9
9       19
10      14
11      10
12      11
13      10
14      10
15      12
16       7
17      10
18       7
19       9
20      11
21      10
22      12
23       9
24      10
25      11
26      11
27      12
28      15
29      11
        ..
4147    11
4148    11
4149     6
4150     7
4151     6
4152     7
4153     8
4154     6
4155     6
4156     8
4157     8
4158     8
4159     9
4160    11
4161    11
4162     8
4163     7
4164     7
4165     7
4166    10
4167     9
4168     8
4169    10
4170    10
4171     8
4172    11
4173    10
4174     9
4175    10
4176    12
Name: rings, Length: 4177, dtype: int64

In [15]:
X = data.iloc[:,1:-1]
X

Unnamed: 0,length,diameter,height,wweight,shuweight,vweight,sheweight
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
5,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200
6,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300
7,0.545,0.425,0.125,0.7680,0.2940,0.1495,0.2600
8,0.475,0.370,0.125,0.5095,0.2165,0.1125,0.1650
9,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200


In [18]:
from sklearn.ensemble import RandomForestRegressor as rfr

estimator = rfr(n_estimators=7)
estimator.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=7, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
y_hat = estimator.predict(X)
print(y_hat)

[15.          6.85714286  8.85714286 ... 10.71428571 10.
 12.        ]
