In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# _________________________________________________________________________________________________________

housing = pd.read_csv(r"C:\Users\georg\Desktop\Machine Learning\notebooks_detailed\datasets\housing\housing.csv") 

housing["income_category"] =pd.cut(housing["median_income"],bins=[0,1.5,3.0,4.5,6.,np.inf],labels=[1,2,3,4,5])
split_indices = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split_indices.split(housing,housing["income_category"]): 
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):  ## we are dropping the new attribute
    set_.drop("income_category", axis=1, inplace=True)  
    

housing = strat_train_set.copy()  # make a copy of original data 

housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy() # seprate the target column

housing_num = housing.drop("ocean_proximity", axis=1) # numerical attributes
housing_cat = housing[["ocean_proximity"]] # categorical attributes


# this is a very condesated form , don't worry about it
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
            
numerical_pipeline = Pipeline([    # The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps.
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()), 
('std_scaler', StandardScaler()),
])

num_attribs = list(housing_num)  # list of numerical columns 
cat_attribs = ["ocean_proximity"] #list of categorical columns 

full_pipeline = ColumnTransformer([ # The constructor requires a list of tuples, where each tuple contains a name, a transformer and a list of names (or indices) of columns that the transformer should be applied to
("num", numerical_pipeline, num_attribs), # name : whatever u want| transfomer : numerical_pipeline defined earlier|target:num_attribs
("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [2]:
from sklearn.ensemble import RandomForestRegressor
forest_regression = RandomForestRegressor(n_estimators=100,random_state=42)
forest_regression.fit(housing_prepared,housing_labels)
housing_predictions_forest = forest_regression.predict(housing_prepared)
print(mean_squared_error(housing_labels,housing_predictions_forest,squared=False)) #score on the training set 

18603.515021376355


In [3]:
forest_scores = cross_val_score(forest_regression,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_scores_RMSE = np.sqrt(-forest_scores)
forest_scores_RMSE  #scores on the validation sets

array([49519.80364233, 47461.9115823 , 50029.02762854, 52325.28068953,
       49308.39426421, 53446.37892622, 48634.8036574 , 47585.73832311,
       53490.10699751, 50021.5852922 ])

## This is out best model. So now we need to fine tune it. 
### One way to do that would be to fiddle with the hyperparameters manually, until you find a great combination of hyperparameter values.
### Instead you should get Scikit-Learn’s GridSearchCV to search for you. All you need to is tell it which hyperparameters you want it to experiment with, and what values to try out, and it will evaluate all the possible combinations of hyperparameter values, using cross-validation.

In [4]:
from sklearn.model_selection import GridSearchCV
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

**param_grid parameter is the most important** :  it accepts a list or a dictionary of hyperparameters 

In [5]:
param_grid = [{"n_estimators" :[3,10,30],"max_features":[2,4,6,8]}, # try 12 (3×4) combinations of hyperparameters 
              {"bootstrap":[False],"n_estimators":[3,10],"max_features":[2,3,4]}] 
             # then try 6 (2×3) combinations with bootstrap set as False
forest_reg =RandomForestRegressor()

### Each decision tree in the ensemble is fit on a bootstrap sample drawn from the training dataset. This can be turned off by setting the “bootstrap” argument to False, if you desire. In that case, the whole training dataset will be used to train each decision tree. 

All in all, the grid search will explore 12 + 6 = 18 combinations of RandomForestRegressor hyperparameter values, and it will train each model five times (since we are using five-fold cross validation). In other words, all in all, there will be 18 × 5 = 90 rounds of training! It may take quite a long time, but when it is done you can get the best combination of parameters

In [6]:
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [7]:
grid_search.best_params_    # best hyperparameter combination

{'max_features': 6, 'n_estimators': 30}

In [8]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

## Since the best combination is or max values , we can try again to search with higher values and maybe the score will improve

In [9]:
## See all scores 
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

64835.363644927624 {'max_features': 2, 'n_estimators': 3}
55403.54864273394 {'max_features': 2, 'n_estimators': 10}
52716.910287490864 {'max_features': 2, 'n_estimators': 30}
59113.86194194443 {'max_features': 4, 'n_estimators': 3}
53063.119868028065 {'max_features': 4, 'n_estimators': 10}
50735.30468120093 {'max_features': 4, 'n_estimators': 30}
59933.86840051729 {'max_features': 6, 'n_estimators': 3}
52121.86149777997 {'max_features': 6, 'n_estimators': 10}
50000.74005700329 {'max_features': 6, 'n_estimators': 30}
58341.14731268872 {'max_features': 8, 'n_estimators': 3}
52032.09710860755 {'max_features': 8, 'n_estimators': 10}
50295.733224025455 {'max_features': 8, 'n_estimators': 30}
62010.72908073058 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54805.54177386228 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59757.20247402998 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53008.4978146372 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [11]:
cvres  
# is a dictionary with certain key value pairs

{'mean_fit_time': array([0.05121117, 0.16543708, 0.49231   , 0.08081832, 0.26745949,
        0.82838526, 0.11062469, 0.37268343, 1.11665006, 0.14923344,
        0.48710866, 1.42571945, 0.08541927, 0.26906047, 0.10862398,
        0.35067897, 0.13262939, 0.43889852]),
 'std_fit_time': array([0.00116597, 0.0025776 , 0.0035449 , 0.00074837, 0.00531543,
        0.05472401, 0.00185519, 0.00683087, 0.01405497, 0.00407023,
        0.01282452, 0.01118508, 0.00224524, 0.00660457, 0.00224566,
        0.00294   , 0.00307309, 0.00594785]),
 'mean_score_time': array([0.00260053, 0.00720162, 0.02000465, 0.00260024, 0.0072021 ,
        0.02080483, 0.00200057, 0.00720196, 0.02020431, 0.00320082,
        0.00720196, 0.02000461, 0.00300074, 0.00820203, 0.00280075,
        0.00860167, 0.0028007 , 0.00840197]),
 'std_score_time': array([4.89862464e-04, 4.00567179e-04, 2.43140197e-07, 4.90115593e-04,
        4.00090569e-04, 1.16633486e-03, 1.50789149e-07, 3.99804382e-04,
        4.00424185e-04, 4.00400403e-

### The grid search approach is fine when you are exploring relatively few combinations, like in the previous example, but when the hyperparameter search space is large, it is often preferable to use RandomizedSearchCV instead.
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
### It works simmillary with GridSearch class , but has to main advantages:
    - If you let the randomized search run for, say, 1,000 iterations, this approach will explore 1,000 different values for each hyperparameter (instead of just a few values per hyperparameter with the grid search approach).
    - You have more control over the computing budget you want to allocate to hyperparameter search, simply by setting the number of iterations.

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

In [13]:
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E0043C7DF0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E0044B62E0>},
                   random_state=42, scoring='neg_mean_squared_error')

In [14]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

49150.70756927707 {'max_features': 7, 'n_estimators': 180}
51389.889203389284 {'max_features': 5, 'n_estimators': 15}
50796.155224308866 {'max_features': 3, 'n_estimators': 72}
50835.13360315349 {'max_features': 5, 'n_estimators': 21}
49280.9449827171 {'max_features': 7, 'n_estimators': 122}
50774.90662363929 {'max_features': 3, 'n_estimators': 75}
50682.78888164288 {'max_features': 3, 'n_estimators': 88}
49608.99608105296 {'max_features': 5, 'n_estimators': 100}
50473.61930350219 {'max_features': 3, 'n_estimators': 150}
64429.84143294435 {'max_features': 5, 'n_estimators': 2}
