In [1]:
import pandas as pd
import numpy as np

#### Quick look at the data structure

In [2]:
housing = pd.read_csv("housing.csv")

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
print(housing.describe())

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25%        296.00000

#### Looking for correlation

In [7]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [8]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["population"]
housing["population_per_household"] = housing["population"] / housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.209482
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
Name: median_house_value, dtype: float64

#### Prepare the data for Machine Learning algorithms

In [9]:
housing[housing.isnull().any(axis=1)].head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household
290,-122.16,37.77,47.0,1256.0,,570.0,218.0,4.375,161900.0,NEAR BAY,2.203509,2.614679
341,-122.17,37.75,38.0,992.0,,732.0,259.0,1.6196,85100.0,NEAR BAY,1.355191,2.826255
538,-122.28,37.78,29.0,5154.0,,3741.0,1273.0,2.5762,173400.0,NEAR BAY,1.377706,2.938727
563,-122.24,37.75,45.0,891.0,,384.0,146.0,4.9489,247100.0,NEAR BAY,2.320312,2.630137
696,-122.1,37.69,41.0,746.0,,387.0,161.0,3.9063,178400.0,NEAR BAY,1.927649,2.403727
738,-122.14,37.67,37.0,3342.0,,1635.0,557.0,4.7933,186900.0,NEAR BAY,2.044037,2.935368
1097,-121.77,39.66,20.0,3759.0,,1705.0,600.0,4.712,158600.0,INLAND,2.204692,2.841667
1350,-121.95,38.03,5.0,5526.0,,3207.0,1012.0,4.0767,143100.0,INLAND,1.723106,3.168972
1456,-121.98,37.96,22.0,2987.0,,1420.0,540.0,3.65,204100.0,INLAND,2.103521,2.62963
1493,-122.01,37.94,23.0,3741.0,,1339.0,499.0,6.7061,322300.0,NEAR BAY,2.793876,2.683367


In [10]:
# housing = housing.dropna(subset=["total_bedrooms"])    # option 1
housing = housing.drop("total_bedrooms", axis=1)       # option 2
# median = housing["total_bedrooms"].median()            # option 3
# housing["total_bedrooms"].fillna(median, inplace=True) # option 3
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,452600.0,NEAR BAY,2.732919,2.555556
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,2.956685,2.109842
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2.957661,2.802260
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,341300.0,NEAR BAY,2.283154,2.547945
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,342200.0,NEAR BAY,2.879646,2.181467
5,-122.25,37.85,52.0,919.0,413.0,193.0,4.0368,269700.0,NEAR BAY,2.225182,2.139896
6,-122.25,37.84,52.0,2535.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,2.317185,2.128405
7,-122.25,37.84,52.0,3104.0,1157.0,647.0,3.1200,241400.0,NEAR BAY,2.682800,1.788253
8,-122.26,37.84,42.0,2555.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,2.118574,2.026891
9,-122.25,37.84,52.0,3549.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,2.288201,2.172269


#### Handling Text and Categorical Attributes

In [11]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
5,NEAR BAY
6,NEAR BAY
7,NEAR BAY
8,NEAR BAY
9,NEAR BAY


In [12]:
# 分類作法 A

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

array([3, 3, 3, ..., 1, 1, 1])

In [13]:
print(encoder.classes_)

['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']


In [14]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [15]:
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [16]:
# 分類作法 B
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
encoder.fit_transform(housing_cat)

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

#### combine housing and categorical 

In [17]:
pd.DataFrame(housing_cat_1hot.toarray()).iloc[:, 1:].head()

Unnamed: 0,1,2,3,4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [18]:
housing_final = pd.concat([housing, pd.DataFrame(housing_cat_1hot.toarray()).iloc[:, 1:]], axis=1)

In [19]:
housing_final.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,1,2,3,4
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,452600.0,NEAR BAY,2.732919,2.555556,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,2.956685,2.109842,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2.957661,2.80226,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,341300.0,NEAR BAY,2.283154,2.547945,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,342200.0,NEAR BAY,2.879646,2.181467,0.0,0.0,1.0,0.0


# Prepare the data for Machine Learning algorithms

In [20]:
X = housing_final.drop("median_house_value", axis=1).drop("ocean_proximity", axis=1)
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,rooms_per_household,population_per_household,1,2,3,4
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,2.732919,2.555556,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,2.956685,2.109842,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,2.957661,2.80226,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,2.283154,2.547945,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,2.879646,2.181467,0.0,0.0,1.0,0.0


In [21]:
y = housing_final[["median_house_value"]]
y.head()

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [23]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# sc_X.fit_transform(X_train["longitude"])

In [24]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,rooms_per_household,population_per_household,1,2,3,4
12069,-117.55,33.83,6.0,502.0,228.0,65.0,4.2386,2.201754,3.507692,1.0,0.0,0.0,0.0
15925,-122.44,37.73,52.0,2381.0,1485.0,447.0,4.3898,1.603367,3.322148,0.0,0.0,1.0,0.0
11162,-118.0,33.83,26.0,1718.0,1022.0,368.0,3.9333,1.681018,2.777174,0.0,0.0,0.0,0.0
4904,-118.26,34.01,38.0,697.0,749.0,206.0,1.4653,0.930574,3.635922,0.0,0.0,0.0,0.0
4683,-118.36,34.08,52.0,2373.0,1135.0,576.0,3.1765,2.090749,1.970486,0.0,0.0,0.0,0.0


In [25]:
X_train[['longitude', 'latitude']] = sc_X.fit_transform(X_train[['longitude', 'latitude']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)


In [26]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,rooms_per_household,population_per_household,1,2,3,4
12069,1.003899,-0.840062,6.0,502.0,228.0,65.0,4.2386,2.201754,3.507692,1.0,0.0,0.0,0.0
15925,-1.434772,0.985364,52.0,2381.0,1485.0,447.0,4.3898,1.603367,3.322148,0.0,0.0,1.0,0.0
11162,0.779481,-0.840062,26.0,1718.0,1022.0,368.0,3.9333,1.681018,2.777174,0.0,0.0,0.0,0.0
4904,0.649818,-0.755812,38.0,697.0,749.0,206.0,1.4653,0.930574,3.635922,0.0,0.0,0.0,0.0
4683,0.599947,-0.723048,52.0,2373.0,1135.0,576.0,3.1765,2.090749,1.970486,0.0,0.0,0.0,0.0


In [27]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)

In [28]:
type(y_train)

pandas.core.frame.DataFrame

In [29]:
type(X_train)

pandas.core.frame.DataFrame

#### null detection

In [30]:
# 方法 A
# y_train.isnull().any(axis=1)
# X_train.isnull().any(axis=1)

# 方法 B
# np.isnan(y_train).any()
# np.isnan(X_train).any()

# Prepare the data for Machine Learning algorithms

In [31]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [32]:
# lin_reg.predict(X_train)
# lin_reg.predict(X_test)

#### 檢驗 train set 績效

In [33]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(y_train, housing_predictions)
print(lin_mae)

67973.296447151
49461.47745791734


#### 檢驗 test set 績效

In [34]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(y_test, housing_predictions)
print(lin_mae)

4959090.752593165
4958542.753708021


#### 模型換決策樹

In [35]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
# housing_predictions = tree_reg.predict(housing_prepared)
# tree_mse = mean_squared_error(housing_labels, housing_predictions)
# tree_rmse = np.sqrt(tree_mse)
# tree_rmse

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

#### 檢驗 train set的績效

In [36]:
from sklearn.metrics import mean_squared_error

housing_predictions = tree_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(y_train, housing_predictions)
print(lin_mae)

0.0
0.0


#### 檢驗 test set 績效

In [37]:
from sklearn.metrics import mean_squared_error

housing_predictions = tree_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(y_test, housing_predictions)
print(lin_mae)

104619.45193577094
77397.00872093023


# Fine-tune your model

In [38]:
from sklearn.model_selection import cross_val_score

tree_scores = cross_val_score(tree_reg, X_train, y_train,
                              scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)

In [39]:
tree_rmse_scores

array([72335.84644414, 66531.5096737 , 67215.96709583, 70624.33104016,
       75960.69587806, 71984.95620928, 67601.53370637, 69785.89566985,
       71205.31211877, 64093.02656629])

In [40]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [72335.84644414 66531.5096737  67215.96709583 70624.33104016
 75960.69587806 71984.95620928 67601.53370637 69785.89566985
 71205.31211877 64093.02656629]
Mean: 69733.90744024396
Standard deviation: 3263.935046679895


In [41]:
lin_scores = cross_val_score(lin_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [68945.97646887 65590.9449141  69121.65720735 67253.67505489
 72308.16414768 67532.58484636 66712.83282223 68798.4654893
 71303.52108226 64654.42581677]
Mean: 68222.22478498144
Standard deviation: 2262.717725152657


In [42]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
housing_predictions = forest_reg.predict(X_train)
forest_mse = mean_squared_error(y_train, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

  after removing the cwd from sys.path.


21878.918340631742

In [43]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Scores: [54025.80375175 52084.19205839 50539.89955877 50556.53140251
 54385.24129767 52793.65595638 51680.45864828 50969.78891397
 53832.60772781 47248.98860783]
Mean: 51811.716792335654
Standard deviation: 2034.3359134077882


In [44]:
scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

count       10.000000
mean     68222.224785
std       2385.113905
min      64654.425817
25%      66848.043380
50%      68165.525168
75%      69077.737023
max      72308.164148
dtype: float64

In [45]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(X_train, y_train)
housing_predictions = svm_reg.predict(X_train)
svm_mse = mean_squared_error(y_train, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

  y = column_or_1d(y, warn=True)


94179.55824305126

In [46]:
from sklearn.model_selection import GridSearchCV

param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [47]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [48]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [49]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

62132.49333373266 {'max_features': 2, 'n_estimators': 3}
53673.28817068045 {'max_features': 2, 'n_estimators': 10}
51163.25314837313 {'max_features': 2, 'n_estimators': 30}
59117.31165692548 {'max_features': 4, 'n_estimators': 3}
51598.55043785641 {'max_features': 4, 'n_estimators': 10}
49703.8637595398 {'max_features': 4, 'n_estimators': 30}
57541.7648832333 {'max_features': 6, 'n_estimators': 3}
51381.04090769564 {'max_features': 6, 'n_estimators': 10}
49617.696663810944 {'max_features': 6, 'n_estimators': 30}
57084.629945210385 {'max_features': 8, 'n_estimators': 3}
51483.665757421615 {'max_features': 8, 'n_estimators': 10}
49705.8307026469 {'max_features': 8, 'n_estimators': 30}
59905.58286198555 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
52731.73827812246 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59326.817912934726 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
51090.37617572541 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

In [50]:
pd.DataFrame(grid_search.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_n_estimators,param_bootstrap,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.073215,0.009016,0.004784,0.000746,2,3,,"{'max_features': 2, 'n_estimators': 3}",-3879755000.0,-3698058000.0,...,-3860447000.0,84583150.0,18,-1025773000.0,-1030998000.0,-965525400.0,-1035566000.0,-986809600.0,-1008934000.0,27761240.0
1,0.227392,0.007588,0.011367,0.001621,2,10,,"{'max_features': 2, 'n_estimators': 10}",-2841365000.0,-2827420000.0,...,-2880822000.0,151074100.0,11,-560581200.0,-546435000.0,-560136000.0,-534885500.0,-589315900.0,-558270700.0,18218530.0
2,0.575661,0.022214,0.028527,0.003811,2,30,,"{'max_features': 2, 'n_estimators': 30}",-2696419000.0,-2509175000.0,...,-2617678000.0,94113360.0,6,-410730500.0,-417443400.0,-394597400.0,-412048600.0,-405174800.0,-407998900.0,7754161.0
3,0.095946,0.001724,0.00458,0.000498,4,3,,"{'max_features': 4, 'n_estimators': 3}",-3577140000.0,-3416306000.0,...,-3494857000.0,86523600.0,15,-888981100.0,-919554200.0,-877984700.0,-939671000.0,-928304500.0,-910899100.0,23534810.0
4,0.340094,0.016158,0.011374,0.001467,4,10,,"{'max_features': 4, 'n_estimators': 10}",-2749393000.0,-2545233000.0,...,-2662410000.0,96232560.0,9,-495252800.0,-504259000.0,-476427900.0,-503383000.0,-511512500.0,-498167000.0,12029150.0
5,1.010508,0.031433,0.028531,0.00136,4,30,,"{'max_features': 4, 'n_estimators': 30}",-2488962000.0,-2356410000.0,...,-2470474000.0,93573500.0,2,-366411300.0,-382515700.0,-365402500.0,-385248200.0,-388417900.0,-377599100.0,9732960.0
6,0.155793,0.015644,0.005586,0.001013,6,3,,"{'max_features': 6, 'n_estimators': 3}",-3497132000.0,-3161511000.0,...,-3311055000.0,146150000.0,13,-876694400.0,-903334600.0,-874025000.0,-923430800.0,-847366600.0,-884970300.0,26148720.0
7,0.473469,0.023415,0.010574,0.000507,6,10,,"{'max_features': 6, 'n_estimators': 10}",-2675145000.0,-2535441000.0,...,-2640011000.0,129139900.0,7,-461839700.0,-483889000.0,-487421500.0,-494046600.0,-480309900.0,-481501400.0,10827110.0
8,1.363557,0.014165,0.030039,0.001014,6,30,,"{'max_features': 6, 'n_estimators': 30}",-2547315000.0,-2386750000.0,...,-2461916000.0,80791760.0,1,-362790500.0,-368151000.0,-367108000.0,-380028100.0,-382568700.0,-372129200.0,7741135.0
9,0.182224,0.010898,0.004992,0.001102,8,3,,"{'max_features': 8, 'n_estimators': 3}",-3417063000.0,-3192742000.0,...,-3258655000.0,165517600.0,12,-863913100.0,-861333400.0,-843890300.0,-861638700.0,-822859200.0,-850726900.0,15676650.0


In [51]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor()
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error')
rnd_search.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025428AA8080>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025428AA8048>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [52]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

48679.215715103965 {'max_features': 6, 'n_estimators': 77}
48467.49482652831 {'max_features': 5, 'n_estimators': 114}
48806.4498689856 {'max_features': 4, 'n_estimators': 97}
49763.58404289101 {'max_features': 3, 'n_estimators': 47}
61715.84186643642 {'max_features': 2, 'n_estimators': 3}
48663.014212662056 {'max_features': 4, 'n_estimators': 117}
48618.298696338425 {'max_features': 6, 'n_estimators': 130}
54126.15710799759 {'max_features': 1, 'n_estimators': 26}
49006.59497997398 {'max_features': 3, 'n_estimators': 171}
48680.58310950676 {'max_features': 4, 'n_estimators': 187}


In [53]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.40328041e-02, 7.41099774e-02, 4.30277253e-02, 2.07420631e-02,
       1.87980285e-02, 2.09756739e-02, 4.08419559e-01, 9.96078809e-02,
       8.38964356e-02, 1.49446950e-01, 1.34646676e-04, 2.78596324e-03,
       4.02229227e-03])