## pandas&csv

#### get data

In [68]:
import pandas as pd

- open csv file

In [69]:
path = 'data/melb_data.csv'
data = pd.read_csv(path)
#column count
data.describe() # data: DataFrame

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


- list of columns in dataset

In [70]:
data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

- dropna drops missing values (think of na as "not available")

In [71]:
data = data.dropna(axis=0)

- we use dot-notation to select the column we want to predict

In [72]:
y = data.Price

- the columns are inputted into model are called features

In [73]:
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = data[features]
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


- top few rows

In [74]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


#### use scikit-learn to build model
- decision tree

In [75]:
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
model = DecisionTreeRegressor()
# Fit model
model.fit(X, y)

DecisionTreeRegressor()

In [76]:
predict_value = model.predict(X)
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y, predict_value)

1115.7467183128902

- spilt test and train data

In [77]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)
model = DecisionTreeRegressor()
model.fit(train_X, train_y)
predict_value = model.predict(test_X)
mean_absolute_error(test_y, predict_value)

271823.50570260384

- control overfitting&underfitting by specify leaf nodes

In [78]:
def get_mae(max_leaf_nodes,train_X,test_X,train_y,test_y):
    my_model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
    my_model.fit(train_X, train_y)
    p_val = my_model.predict(test_X)
    mae = mean_absolute_error(test_y,p_val)
    return mae

- iteratively optimize model

In [79]:
import sys

nodes_num = 2 # initial value
step = 8
current_mae = sys.maxsize
while True:
    mae = get_mae(nodes_num,train_X,test_X,train_y,test_y)
    if current_mae-mae<1:
        break
    current_mae = mae
    nodes_num += step
print('current mae: {}, current leaf num: {}'.format(current_mae,nodes_num))

current mae: 269525.6425124663, current leaf num: 106


- use random forest instead of a single decision tree

In [82]:
from sklearn.ensemble import RandomForestRegressor

nodes_num = 2 # initial value
step = 10
current_mae = sys.maxsize
while True:
    forest_model = RandomForestRegressor(max_leaf_nodes=nodes_num)
    forest_model.fit(train_X,train_y)
    p_val = forest_model.predict(test_X)
    mae = mean_absolute_error(test_y,p_val)
    if current_mae-mae<1:
        break
    current_mae = mae
    nodes_num += step
print('mae: {}, leaf num: {}'.format(current_mae,nodes_num))

mae: 227681.39769992288, leaf num: 142
