# Training and Evaluating on the Training Set

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

In [2]:
# NOTE(review): absolute, machine-specific path. Point this at your local copy of
# housing.csv (ideally relative to a configurable data directory) before re-running.
HOUSING_CSV_PATH = r"C:\Users\georg\Desktop\Machine Learning\notebooks_detailed\datasets\housing\housing.csv"
housing_raw = pd.read_csv(HOUSING_CSV_PATH)

# Bucket median income into 5 categories, used only as the stratification key.
# It is kept as a separate Series so the raw frame never gains a helper column
# that would have to be dropped again (in place) from the train/test sets.
income_category = pd.cut(
    housing_raw["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

split_indices = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split_indices.split(housing_raw, income_category))

# The splitter returns positional indices, so select rows with .iloc
# (equivalent to .loc here only because read_csv produced a default RangeIndex).
strat_train_set = housing_raw.iloc[train_index]
strat_test_set = housing_raw.iloc[test_index]

# Separate the predictors from the target column for the training set.
housing = strat_train_set.drop("median_house_value", axis=1)  # features only
housing_labels = strat_train_set["median_house_value"].copy()  # target values

housing_num = housing.drop("ocean_proximity", axis=1)  # numerical attributes
housing_cat = housing[["ocean_proximity"]]  # categorical attributes


# This is a very condensed form; don't worry about the details.

# Column positions used by CombinedAttributesAdder.transform to index the input array.
# Presumably these match the column order of the numerical housing frame; verify if columns change.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D array.

    Always adds rooms-per-household and population-per-household; also adds
    bedrooms-per-room when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        # Order matters: rooms/household, population/household, then (optionally) bedrooms/room.
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *derived)]
            
# Column groups routed to each branch of the ColumnTransformer below.
num_attribs = housing_num.columns.tolist()  # names of the numerical columns
cat_attribs = ["ocean_proximity"]  # names of the categorical columns

# Numerical branch. Pipeline takes a list of (name, estimator) pairs that run in sequence:
# median imputation, then the extra ratio attributes, then standardization.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# ColumnTransformer takes (name, transformer, columns) triples: the name is free-form,
# the transformer is applied only to the listed columns.
full_pipeline = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [3]:
from sklearn.linear_model import LinearRegression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# Create the estimator, then fit it: this is where the model "learns" from the training data.
linear_regression = LinearRegression()
linear_regression.fit(X=housing_prepared, y=housing_labels)

LinearRegression()

### All we did was create an instance of the linear regression model and call its fit method on the training data. The algorithm has now learned a relationship between the features and the target, which it will use to make predictions.

In [4]:
# Take the first five training rows and their labels as a small sanity-check sample.
original_data = housing.head(5)
original_target = housing_labels.head(5)
list(original_target)  # the true target values we are trying to predict

[286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

In [5]:
# Push the sample through the preprocessing pipeline we built (transform only).
data_prepared = full_pipeline.transform(original_data)

# The linear regression model is already trained, so we only call predict here.
predicted_target = linear_regression.predict(X=data_prepared)
list(predicted_target)

[210644.60459285564,
 317768.80697210855,
 210956.43331178208,
 59218.98886849053,
 189747.55849878516]

## Let's measure our prediction error with mean_squared_error
### First, a quick look at MSE (mean squared error) and RMSE (root mean squared error)

**Mean Squared Error** (MSE) is a measure of how close a fitted line is to the data points. For every data point, you take the vertical distance from the point to the corresponding y value on the fitted curve (the error) and square it; the MSE is the average of these squared errors.
<br> **Root Mean Squared Error** (RMSE) is just the square root of the mean squared error. It is probably the most easily interpreted statistic, since it has the same units as the quantity plotted on the vertical axis.


In [6]:
from sklearn.metrics import mean_squared_error
housing_predictions = linear_regression.predict(housing_prepared)
# RMSE is the square root of the MSE. The root is taken explicitly rather than via
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in later releases.
linear_RMSE = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
linear_RMSE
# So we have a prediction error of about $68,628 on the training set, which is pretty bad.

68628.19819848923

### This is an example of a model underfitting the training data. When this happens it can mean that the features do not provide enough information to make good predictions, or that the model is not powerful enough. As we saw in the previous chapter, the main ways to fix underfitting are to select a more powerful model, to feed the training algorithm with better features, or to reduce the constraints on the model.

### Now will try to train a DecisionTreeRegressor capable of finding complex nonlinear relationships in the data.

In [7]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so that its fit (and the cross-validation scores computed from it later)
# is reproducible across runs, consistent with the seeded split and random forest.
tree_regression = DecisionTreeRegressor(random_state=42)
tree_regression.fit(housing_prepared, housing_labels)

DecisionTreeRegressor()

In [8]:
housing_predictions_tree = tree_regression.predict(housing_prepared)
# RMSE on the training set. The square root is taken explicitly rather than via
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in later releases.
tree_RMSE = np.sqrt(mean_squared_error(housing_labels, housing_predictions_tree))
tree_RMSE

0.0

### Wait, what!? No error at all? Could this model really be absolutely perfect? Of course, it is much more likely that the model has badly overfit the data. How can you be sure?
### As we saw earlier, you don’t want to touch the test set until you are ready to launch a model you are confident about, so you need to use part of the training set for training, and part for model validation.

### One way to evaluate the Decision Tree model would be to use the train_test_split function to split the training set into a smaller training set and a validation set, then train your models against the smaller training set and evaluate them against the validation set. It’s a bit of work, but nothing too difficult and it would work fairly well.

### A much simpler alternative is to use Scikit-Learn's K-fold cross-validation feature.

https://scikit-learn.org/stable/modules/cross_validation.html
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html



In [13]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree (cv is the number of folds; the default is 5).
scores = cross_val_score(
    tree_regression,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns the *negative* MSE, so flip the sign before taking the square root
# to get one RMSE per fold. More details:
# https://stackoverflow.com/questions/21050110/sklearn-gridsearchcv-with-pipeline
tree_RMSE_scores = np.sqrt(np.negative(scores))
tree_RMSE_scores

array([70460.82710592, 65600.68791566, 68772.02881682, 69630.99338318,
       71113.78937513, 72862.30236324, 71121.91068388, 70530.77393935,
       77396.79371089, 70370.96792194])

### What cross_val_score does is split the training set into 10 distinct subsets called folds, then train and evaluate the Decision Tree model 10 times, picking a different fold for evaluation every time and training on the other 9 folds. The result is an array containing the 10 evaluation scores.

### We can see that the Decision Tree doesn't do any better than Linear Regression; it actually does slightly worse.

In [15]:
# Same 10-fold cross-validation, this time for the linear model.
linear_scores = cross_val_score(
    linear_regression,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Scores are negative MSE values: negate, then take the root to get per-fold RMSE.
linear_scores_RMSE = np.sqrt(np.negative(linear_scores))
linear_scores_RMSE

array([66782.73843989, 66960.118071  , 70347.95244419, 74739.57052552,
       68031.13388938, 71193.84183426, 64969.63056405, 68281.61137997,
       71552.91566558, 67665.10082067])

## Let's try one last model RandomForestRegressor - works by training many Decision Trees on random subsets of the features, then averaging out their predictions. Building a model on top of many other models is called Ensemble Learning

In [21]:
from sklearn.ensemble import RandomForestRegressor
forest_regression = RandomForestRegressor(n_estimators=100, random_state=42)
forest_regression.fit(housing_prepared, housing_labels)
housing_predictions_forest = forest_regression.predict(housing_prepared)
# RMSE on the training set. The square root is taken explicitly rather than via
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in later releases.
print(np.sqrt(mean_squared_error(housing_labels, housing_predictions_forest)))

18603.515021376355


In [19]:
# 10-fold cross-validation of the random forest.
forest_scores = cross_val_score(
    forest_regression,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Negate the negative-MSE scores and take the root: one validation RMSE per fold.
forest_scores_RMSE = np.sqrt(np.negative(forest_scores))
forest_scores_RMSE  # scores on the validation sets

array([49519.80364233, 47461.9115823 , 50029.02762854, 52325.28068953,
       49308.39426421, 53446.37892622, 48634.8036574 , 47585.73832311,
       53490.10699751, 50021.5852922 ])

### RandomForestRegressor performs better than both previous models. However, note that the score on the training set is still much lower than on the validation sets, meaning that the model is still overfitting the training set.