In [20]:
import pandas as pd
# This cell reads the Iowa housing training data and prints a summary of it.
from datetime import datetime


# Path to the Iowa housing training CSV
iowa_file_path = './datasets/train_housing.csv'
# Load the CSV into a DataFrame called home_data
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)
# Show summary statistics for home_data
print(home_data.describe())





                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [21]:
# What is the average lot size (rounded to nearest integer)?
# The built-in round() with no ndigits returns an int, so the answer prints as
# "10517" instead of the float "10517.0" that Series.mean().round() produced.
avg_lot_size = round(home_data['LotArea'].mean())
print(f"Average Lot Size: {avg_lot_size} square feet")

# As of today, how old is the newest home (current year - the year in which it was built)?
# Keep the build year and the derived age in separately named variables so that
# newest_home_age really holds an age rather than a calendar year.
newest_build_year = home_data['YearBuilt'].max()
current_year = datetime.now().year
newest_home_age = current_year - newest_build_year
print(f"Newest Home Age: {newest_home_age} years")




Average Lot Size: 10517.0 square feet
Newest Home Age: 15 years


In [22]:

# Location of the Melbourne housing CSV, kept in a variable for reuse
melbourne_file_path = './datasets/melb_data.csv'

# Load the CSV into a DataFrame named melbourne_data
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

# Show which columns are available
print(melbourne_data.columns)

# Keep only the rows without missing values ("na" = "not available");
# dropna() works row-wise by default, i.e. the same as axis=0
melbourne_data = melbourne_data.dropna()


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [23]:
# Prediction target: the sale price of each house
# (bracket notation selects the same column as melbourne_data.Price)
y = melbourne_data['Price']

# Columns used as inputs for predicting the price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data.loc[:, melbourne_features]

# Report which features were chosen and the sizes of X and y
print(f"X consist of these features: {melbourne_features}")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Last expression of the cell: summary statistics of the features
X.describe()



X consist of these features: ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
Shape of X: (6196, 5)
Shape of y: (6196,)


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [24]:
# Preview the first rows of the feature DataFrame X.
# head() is called without an argument, so pandas returns its default of 5 rows.
X.head()


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [25]:
from sklearn.tree import DecisionTreeRegressor
# Define the model. A fixed random_state makes repeated runs produce the same fitted tree.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit/train the model on the selected features (X) and the sale prices (y).
# fit() is the last expression of the cell, so the fitted estimator is displayed as output.
melbourne_model.fit(X, y)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [27]:
# Show the first five houses, then the model's price predictions for those same rows.
# A single print with a newline separator emits the same four lines of output.
print(
    "Making predictions for the following 5 houses:",
    X.head(),
    "The predictions are",
    melbourne_model.predict(X.head()),
    sep="\n",
)



Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [31]:
from sklearn.metrics import mean_absolute_error

# Predict a price for every row of X
predicted_home_prices = melbourne_model.predict(X)
# Report the Mean Absolute Error (MAE) between the actual prices and the predictions
print(mean_absolute_error(y_true=y, y_pred=predicted_home_prices))

1115.7467183128902


In [35]:
from sklearn.model_selection import train_test_split
# Split the data into training and validation sets (fixed seed so the split is repeatable)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

# Create a DecisionTreeRegressor with a fixed random_state. Without a seed the
# tree may break ties between equally good splits differently on each run, so
# the MAE printed below would not be reproducible. The seed matches the tree
# fitted on the full data earlier (random_state=1), so the two MAE figures
# differ only because of the train/validation split.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model using the training data only
melbourne_model.fit(train_X, train_y)

# Make predictions on the held-out validation/test set
y_predictions = melbourne_model.predict(test_X)

# Print the Mean Absolute Error of the predictions on rows the model has not seen
print(mean_absolute_error(test_y, y_predictions))

print("As can be seen, the MAE is much higher than the previous one. This is because the model was trained on a different set of data (the training set) and then tested on a separate validation set. This helps to ensure that the model generalizes well to new data, rather than just memorizing the training data.")

275184.36432106735
As can be seen, the MAE is much higher than the previous one. This is because the model was trained on a different set of data (the training set) and then tested on a separate validation set. This helps to ensure that the model generalizes well to new data, rather than just memorizing the training data.


In [44]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on ``train_X``/``train_y``
    and used to predict ``val_X``; the mean absolute error of those predictions
    against ``val_y`` is returned.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Try several tree sizes and record the validation MAE obtained with each
candidate_leaf_counts = (5, 50, 500, 5000)
mae_by_leaf_count = {}
for leaf_count in candidate_leaf_counts:
    mae_by_leaf_count[leaf_count] = get_mae(leaf_count, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (leaf_count, mae_by_leaf_count[leaf_count]))

# Pick the candidate with the smallest MAE (the earliest one wins a tie)
best_max_leaf_nodes = min(mae_by_leaf_count, key=mae_by_leaf_count.get)
min_mae = mae_by_leaf_count[best_max_leaf_nodes]
print(f"best fitting value here is: {best_max_leaf_nodes}")
print(f"Best max_leaf_nodes: {best_max_leaf_nodes} with minimum MAE: {min_mae}")


# Overfitting: capturing spurious patterns that won't recur in the future, leading to less accurate predictions.
# Underfitting: failing to capture relevant patterns, again leading to less accurate predictions.

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 5000  		 Mean Absolute Error:  271320
best fitting value here is: 500
Best max_leaf_nodes: 500 with minimum MAE: 261718.1134423186


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a random forest on the training split; random_state is fixed for repeatable results.
# fit() returns the estimator itself, so it can be chained onto the constructor.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the held-out rows and report their Mean Absolute Error
melb_preds = forest_model.predict(test_X)
print(mean_absolute_error(y_true=test_y, y_pred=melb_preds))