In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

### Controlling Tree Depth
# The max_leaf_nodes argument provides a sensible way to control the trade-off between overfitting and underfitting.
# The more leaves we allow the model to make, the further we move from underfitting towards overfitting.
# We can use a utility function to help compare MAE scores from different values for max_leaf_nodes

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for a given tree size.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X`` / ``train_y``; its
    predictions for ``val_X`` are then scored against ``val_y`` with
    ``mean_absolute_error``.

    Parameters:
        max_leaf_nodes: value forwarded to DecisionTreeRegressor(max_leaf_nodes=...).
        train_X, train_y: data the tree is fitted on.
        val_X, val_y: data the fitted tree is evaluated on.

    Returns:
        The value returned by mean_absolute_error(val_y, predictions).
    """
    regressor = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    regressor.fit(train_X, train_y)
    return mean_absolute_error(val_y, regressor.predict(val_X))

In [None]:
# Code you have previously used to load data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# Location of the CSV file holding the training data
iowa_file_path = '../input/home-data-for-ml-course/train.csv'

# Load the whole file into a DataFrame
home_data = pd.read_csv(iowa_file_path)

# Prediction target: the SalePrice column
y = home_data['SalePrice']

# Predictor columns used to build the feature matrix X
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = home_data[features]

# Hold out part of the rows for validation; random_state=1 makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [None]:
# Hyperparameter sweep: min_samples_split
#
# When min_samples_split is an integer it is the minimum number of samples a
# node must hold before it may be split. Larger values allow fewer splits and
# may help to reduce overfitting.
min_samples_split_list = [2, 10, 30, 50, 100, 200, 300, 700]

# NOTE(review): X_train, y_train, X_val, y_val and RANDOM_STATE are not defined
# in the cells visible above (the split cell creates train_X / val_X / train_y /
# val_y) — confirm they are defined before this cell, otherwise it raises
# NameError on a fresh kernel.

# Fit one tree per candidate value and record its training and validation
# accuracy, so the gap between the two (overfitting) can be compared.
# Author's observation: raising min_samples_split does not necessarily improve
# validation accuracy, but it brings training accuracy closer to it.
accuracy_list_train = []
accuracy_list_val = []
for min_samples_split in min_samples_split_list:
    # fit() returns the fitted estimator, so the model can be built and fitted
    # in a single expression.
    model = DecisionTreeClassifier(
        min_samples_split=min_samples_split,
        random_state=RANDOM_STATE,
    ).fit(X_train, y_train)

    predictions_train = model.predict(X_train)  # predictions on the training set
    predictions_val = model.predict(X_val)      # predictions on the validation set

    # accuracy_score's documented signature is (y_true, y_pred).
    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_val = accuracy_score(y_val, predictions_val)

    accuracy_list_train.append(accuracy_train)
    accuracy_list_val.append(accuracy_val)

In [None]:
# Hyperparameter sweep: max_depth
#
# None means there is no depth limit. A lower max_depth allows fewer splits
# and may help to reduce overfitting.
max_depth_list = [1, 2, 3, 4, 8, 16, 32, 64, None]

# NOTE(review): X_train, y_train, X_val, y_val and RANDOM_STATE are not defined
# in the cells visible above (the split cell creates train_X / val_X / train_y /
# val_y) — confirm they are defined before this cell, otherwise it raises
# NameError on a fresh kernel.

# Fit one tree per candidate depth and record its training and validation
# accuracy. These two lists are re-created here, so any results stored under
# the same names by an earlier cell are replaced.
accuracy_list_train = []
accuracy_list_val = []
for max_depth in max_depth_list:
    # fit() returns the fitted estimator, so the model can be built and fitted
    # in a single expression.
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        random_state=RANDOM_STATE,
    ).fit(X_train, y_train)

    predictions_train = model.predict(X_train)  # predictions on the training set
    predictions_val = model.predict(X_val)      # predictions on the validation set

    # accuracy_score's documented signature is (y_true, y_pred).
    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_val = accuracy_score(y_val, predictions_val)

    accuracy_list_train.append(accuracy_train)
    accuracy_list_val.append(accuracy_val)

In [None]:
# How to read the sweep results:
# - Underfitting: both training and validation accuracy decrease; the tree cannot
#   make enough splits to distinguish positives from negatives (the model is
#   underfitting the training set).
# - Good parameters: validation accuracy reaches its highest value and moves closer
#   to training accuracy, even if training accuracy drops significantly.
# - Overfitting: validation accuracy decreases while training accuracy increases.

# After this experiment we can choose the parameters for the final tree.
# NOTE(review): X_train, y_train, X_val, y_val and RANDOM_STATE are not defined
# in the cells visible above — confirm they are defined before this cell.
decision_tree_model = DecisionTreeClassifier(
    min_samples_split=50,
    max_depth=3,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Store both scores and print them explicitly: a notebook cell only displays its
# last bare expression, so a bare training-accuracy expression placed before the
# validation one would be computed and then silently dropped.
# accuracy_score's documented signature is (y_true, y_pred).
accuracy_train = accuracy_score(y_train, decision_tree_model.predict(X_train))
accuracy_val = accuracy_score(y_val, decision_tree_model.predict(X_val))
print(f"Training accuracy:   {accuracy_train:.4f}")
print(f"Validation accuracy: {accuracy_val:.4f}")
               

Note that we are searching for the best value of one hyperparameter at a time while leaving the other hyperparameters at their default values.
- Ideally, we would want to check every combination of values for every hyperparameter that we are tuning.
- If we have 3 hyperparameters, and each hyperparameter has 4 values to try out, we should have a total of 4 x 4 x 4 = 64 combinations to try.
- When we modify only one hyperparameter at a time while leaving the rest at their default values, we try just 4 + 4 + 4 = 12 settings, so most of the 64 combinations are never evaluated.
- To try out all combinations, we can use a sklearn implementation called GridSearchCV. GridSearchCV has a refit parameter that will automatically refit a model on the best combination so we will not need to program it explicitly. For more on GridSearchCV, please refer to its [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).