In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("home_price_train.csv")

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# List the column names to locate the target and the candidate features.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### 1. Selecting The Prediction Target

The columns that are inputted into our model (and later used to make predictions) are called "features." In our case, those would be the columns used to determine the home price. Sometimes, you will use all columns except the target as features. Other times you'll be better off with fewer features.

For now, we'll build a model with only a few features. Later on you'll see how to iterate and compare models built with different features.



In [5]:
# The prediction target is the sale price column.
y = df["SalePrice"]

### 2. Choosing "Features"




In [6]:
# NOTE: the variable name is misspelled ("featurs"), but later cells index the
# training and competition frames with it, so the name is kept as-is here.
featurs = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

In [7]:
# Restrict the training frame to the selected feature columns.
X = df[featurs]

In [8]:
# Summary statistics for each selected feature.
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [9]:
# Peek at the first few feature rows.
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [10]:
# Check the dtype and non-null count of every feature column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   LotArea       1460 non-null   int64
 1   YearBuilt     1460 non-null   int64
 2   1stFlrSF      1460 non-null   int64
 3   2ndFlrSF      1460 non-null   int64
 4   FullBath      1460 non-null   int64
 5   BedroomAbvGr  1460 non-null   int64
 6   TotRmsAbvGrd  1460 non-null   int64
dtypes: int64(7)
memory usage: 80.0 KB


In [11]:
# Count missing values per feature column.
X.isnull().sum()

LotArea         0
YearBuilt       0
1stFlrSF        0
2ndFlrSF        0
FullBath        0
BedroomAbvGr    0
TotRmsAbvGrd    0
dtype: int64

### 3. Model Validation by Splitting the Data into Train and Test Sets

**There are many metrics for summarizing model quality, but we'll start with one called Mean Absolute Error (also called MAE). Let's break down this metric starting with the last word, error.**


#### I. To solve validation issues we will split our data into train and test datasets using  `train_test_split` .


#### II. The prediction error for each house is: error=actual−predicted

> mean_absolute_error(test_y, predicted_prices)


In [12]:
# Hold out part of the data for validation; the fixed seed keeps the split
# the same on every run.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=1,
)

### 4. Building Your Model

The steps to building and using a model are:

>1. Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.

>2. Fit: Capture patterns from provided data. This is the heart of modeling.

>3. Predict: Just what it sounds like.

>4. Evaluate: Determine how accurate the model's predictions are.

### We will choose between

>**A. DecisionTree model**

>**B.  Random Forest model**

# A. DecisionTree model

In [13]:
# Define the decision tree (seeded so results repeat) and fit it on the training split.
dt_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the held-out validation rows.
predictions = dt_model.predict(test_X)

In [14]:
# Mean absolute error of the decision tree's predictions on the held-out split.

mean_absolute_error(test_y, predictions)

29652.931506849316

## 5. Underfitting and Overfitting

> **We will write a function that calculates the mean_absolute_error for different numbers of leaves in the decision tree.**

In [15]:
def get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y, random_state=0):
    """
    Fit a decision tree with a capped number of leaves and score it.

    INPUT:
    max_leaf_nodes: int, maximum number of leaves allowed in the decision tree.
    train_X: feature data used to fit the tree.
    test_X: feature data used to evaluate the tree.
    train_y: target values matching train_X.
    test_y: target values matching test_X.
    random_state: seed forwarded to DecisionTreeRegressor. Defaults to 0,
        the value that was previously hard-coded, so existing calls behave
        the same.

    OUTPUT:
    mae: float, mean absolute error of the tree's predictions on test_X.
    """
    model = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    return mean_absolute_error(test_y, preds)

## 6. Check the optimal number of leaves.


In [17]:
# Compare validation MAE across several values of max_leaf_nodes.
candidate_leaf_counts = [5, 50, 500, 5000]

for max_leaf_nodes in candidate_leaf_counts:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae)
    print(report_line)

Max leaf nodes: 5  		 Mean Absolute Error:  35044
Max leaf nodes: 50  		 Mean Absolute Error:  27405
Max leaf nodes: 500  		 Mean Absolute Error:  29454
Max leaf nodes: 5000  		 Mean Absolute Error:  30139


### Of the options listed, 50 is the optimal number of leaves.

# 7.Final model

**We will fit the model on the entire dataset**

In [52]:
# Use the leaf count that gave the lowest validation MAE in the comparison (50).
final_model = DecisionTreeRegressor(max_leaf_nodes = 50, random_state=0)

# Refit on every row (not just the training split) now that the size is chosen.
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=50, random_state=0)

# B. Random Forest model

In [53]:
# Define the random forest (seeded so results repeat) and fit it on the training split.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the held-out validation rows.
preds = forest_model.predict(test_X)

In [54]:
# Mean absolute error of the random forest's predictions on the held-out split.

mean_absolute_error(test_y, preds)

21857.15912981083

### Conclusion

**There is likely room for further improvement, but a validation MAE of about 21,857 is already a big improvement over the best decision tree error of about 27,400 (and about 29,650 for the untuned tree). There are parameters which allow you to change the performance of the Random Forest much as we changed the maximum number of leaf nodes of the single decision tree. But one of the best features of Random Forest models is that they generally work reasonably well even without this tuning.**

# Finally Train a model for the competition (Using the better model)

In [55]:
# Load the competition test set; the path is relative to the working directory.
df_test = pd.read_csv("home_price_test.csv")

In [56]:
# NOTE(review): this rebinds `test_X`, which until now held the validation split
# from train_test_split. Re-running the earlier validation cells after this one
# would pair these competition rows with the validation targets. Consider a
# distinct name (and update the predict call in the next cell to match).
test_X = df_test[featurs]

In [57]:
# Define RandomForest model

rf_model_on_full_data = RandomForestRegressor(random_state=1)

# Fit the model on all training rows (no hold-out split)

rf_model_on_full_data.fit(X,y)

# Predict prices for the competition test rows loaded from home_price_test.csv.
# These are not the validation split: `test_X` was reassigned in the previous cell.

rf_model_on_full_data.predict(test_X)

array([122656.58, 156789.  , 182959.  , ..., 151283.01, 127878.  ,
       225959.8 ])